Blame - fs/btrfs/raid56.c - SHIFTPHONES/mainline/linux

blob: 5394641541f7a6f862a955ddc51b49fa66bf351e [file] [log] [blame]

David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2	/*
				3	* Copyright (C) 2012 Fusion-io All rights reserved.
				4	* Copyright (C) 2012 Intel Corp. All rights reserved.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	5	*/
David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	6
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	7	#include <linux/sched.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	8	#include <linux/bio.h>
				9	#include <linux/slab.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	10	#include <linux/blkdev.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	11	#include <linux/raid/pq.h>
				12	#include <linux/hash.h>
				13	#include <linux/list_sort.h>
				14	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	15	#include <linux/mm.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	16	#include "ctree.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	17	#include "disk-io.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	18	#include "volumes.h"
				19	#include "raid56.h"
				20	#include "async-thread.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	21
				22	/* set when additional merges to this rbio are not allowed */
				23	#define RBIO_RMW_LOCKED_BIT 1
				24
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	25	/*
				26	* set when this rbio is sitting in the hash, but it is just a cache
				27	* of past RMW
				28	*/
				29	#define RBIO_CACHE_BIT 2
				30
				31	/*
				32	* set when it is safe to trust the stripe_pages for caching
				33	*/
				34	#define RBIO_CACHE_READY_BIT 3
				35
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	36	#define RBIO_CACHE_SIZE 1024
				37
David Sterba	8a95334	2019-08-21 19:06:17 +0200	[diff] [blame]	38	#define BTRFS_STRIPE_HASH_TABLE_BITS 11
				39
				40	/* Used by the raid56 code to lock stripes for read/modify/write */
				41	struct btrfs_stripe_hash {
				42	struct list_head hash_list;
				43	spinlock_t lock;
				44	};
				45
				46	/* Used by the raid56 code to lock stripes for read/modify/write */
				47	struct btrfs_stripe_hash_table {
				48	struct list_head stripe_cache;
				49	spinlock_t cache_lock;
				50	int cache_size;
				51	struct btrfs_stripe_hash table[];
				52	};
				53
/*
 * The kind of work a btrfs_raid_bio was set up for; stored in
 * btrfs_raid_bio::operation and compared by rbio_can_merge().
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
				60
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	61	struct btrfs_raid_bio {
				62	struct btrfs_fs_info *fs_info;
				63	struct btrfs_bio *bbio;
				64
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	65	/* while we're doing rmw on a stripe
				66	* we put it into a hash table so we can
				67	* lock the stripe and merge more rbios
				68	* into it.
				69	*/
				70	struct list_head hash_list;
				71
				72	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	73	* LRU list for the stripe cache
				74	*/
				75	struct list_head stripe_cache;
				76
				77	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	78	* for scheduling work in the helper threads
				79	*/
				80	struct btrfs_work work;
				81
				82	/*
				83	* bio list and bio_list_lock are used
				84	* to add more bios into the stripe
				85	* in hopes of avoiding the full rmw
				86	*/
				87	struct bio_list bio_list;
				88	spinlock_t bio_list_lock;
				89
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	90	/* also protected by the bio_list_lock, the
				91	* plug list is used by the plugging code
				92	* to collect partial bios while plugged. The
				93	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	94	* the stripe lock to the next pending IO
				95	*/
				96	struct list_head plug_list;
				97
				98	/*
				99	* flags that tell us if it is safe to
				100	* merge with this bio
				101	*/
				102	unsigned long flags;
				103
				104	/* size of each individual stripe on disk */
				105	int stripe_len;
				106
				107	/* number of data stripes (no p/q) */
				108	int nr_data;
				109
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	110	int real_stripes;
				111
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	112	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	113	/*
				114	* set if we're doing a parity rebuild
				115	* for a read from higher up, which is handled
				116	* differently from a parity rebuild as part of
				117	* rmw
				118	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	119	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	120
				121	/* first bad stripe */
				122	int faila;
				123
				124	/* second bad stripe (for raid6 use) */
				125	int failb;
				126
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	127	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	128	/*
				129	* number of pages needed to represent the full
				130	* stripe
				131	*/
				132	int nr_pages;
				133
				134	/*
				135	* size of all the bios in the bio_list. This
				136	* helps us decide if the rbio maps to a full
				137	* stripe or not
				138	*/
				139	int bio_list_bytes;
				140
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	141	int generic_bio_cnt;
				142
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	143	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	144
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	145	atomic_t stripes_pending;
				146
				147	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	148	/*
				149	* these are two arrays of pointers. We allocate the
				150	* rbio big enough to hold them both and setup their
				151	* locations when the rbio is allocated
				152	*/
				153
				154	/* pointers to pages that we allocated for
				155	* reading/writing stripes directly from the disk (including P/Q)
				156	*/
				157	struct page **stripe_pages;
				158
				159	/*
				160	* pointers to the pages in the bio_list. Stored
				161	* here for faster lookup
				162	*/
				163	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	164
				165	/*
				166	* bitmap to record which horizontal stripe has data
				167	*/
				168	unsigned long *dbitmap;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	169
				170	/* allocated with real_stripes-many pointers for finish_() calls /
				171	void **finish_pointers;
				172
				173	/* allocated with stripe_npages-many bits for finish_() calls /
				174	unsigned long *finish_pbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	175	};
				176
				177	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				178	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				179	static void rmw_work(struct btrfs_work *work);
				180	static void read_rebuild_work(struct btrfs_work *work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	181	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				182	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				183	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				184	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				185	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				186
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	187	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				188	int need_check);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	189	static void scrub_parity_work(struct btrfs_work *work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	190
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	191	static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
				192	{
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	193	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	194	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				195	}
				196
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	197	/*
				198	* the stripe hash table is used for locking, and to collect
				199	* bios in hopes of making a full stripe
				200	*/
				201	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				202	{
				203	struct btrfs_stripe_hash_table *table;
				204	struct btrfs_stripe_hash_table *x;
				205	struct btrfs_stripe_hash *cur;
				206	struct btrfs_stripe_hash *h;
				207	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				208	int i;
				209
				210	if (info->stripe_hash_table)
				211	return 0;
				212
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	213	/*
				214	* The table is large, starting with order 4 and can go as high as
				215	* order 7 in case lock debugging is turned on.
				216	*
				217	* Try harder to allocate and fallback to vmalloc to lower the chance
				218	* of a failing mount.
				219	*/
David Sterba	ee787f9	2019-03-29 02:07:02 +0100	[diff] [blame]	220	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	221	if (!table)
				222	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	223
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	224	spin_lock_init(&table->cache_lock);
				225	INIT_LIST_HEAD(&table->stripe_cache);
				226
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	227	h = table->table;
				228
				229	for (i = 0; i < num_entries; i++) {
				230	cur = h + i;
				231	INIT_LIST_HEAD(&cur->hash_list);
				232	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	233	}
				234
				235	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Yang Li	fe3b7bb	2021-01-21 16:19:47 +0800	[diff] [blame^]	236	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	237	return 0;
				238	}
				239
				240	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	241	* caching an rbio means to copy anything from the
				242	* bio_pages array into the stripe_pages array. We
				243	* use the page uptodate bit in the stripe cache array
				244	* to indicate if it has valid data
				245	*
				246	* once the caching is done, we set the cache ready
				247	* bit.
				248	*/
				249	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				250	{
				251	int i;
				252	char *s;
				253	char *d;
				254	int ret;
				255
				256	ret = alloc_rbio_pages(rbio);
				257	if (ret)
				258	return;
				259
				260	for (i = 0; i < rbio->nr_pages; i++) {
				261	if (!rbio->bio_pages[i])
				262	continue;
				263
				264	s = kmap(rbio->bio_pages[i]);
				265	d = kmap(rbio->stripe_pages[i]);
				266
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	267	copy_page(d, s);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	268
				269	kunmap(rbio->bio_pages[i]);
				270	kunmap(rbio->stripe_pages[i]);
				271	SetPageUptodate(rbio->stripe_pages[i]);
				272	}
				273	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				274	}
				275
				276	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	277	* we hash on the first logical address of the stripe
				278	*/
				279	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				280	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	281	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	282
				283	/*
				284	* we shift down quite a bit. We're using byte
				285	* addressing, and most of the lower bits are zeros.
				286	* This tends to upset hash_64, and it consistently
				287	* returns just one or two different values.
				288	*
				289	* shifting off the lower bits fixes things.
				290	*/
				291	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				292	}
				293
				294	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	295	* stealing an rbio means taking all the uptodate pages from the stripe
				296	* array in the source rbio and putting them into the destination rbio
				297	*/
				298	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				299	{
				300	int i;
				301	struct page *s;
				302	struct page *d;
				303
				304	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				305	return;
				306
				307	for (i = 0; i < dest->nr_pages; i++) {
				308	s = src->stripe_pages[i];
				309	if (!s \|\| !PageUptodate(s)) {
				310	continue;
				311	}
				312
				313	d = dest->stripe_pages[i];
				314	if (d)
				315	__free_page(d);
				316
				317	dest->stripe_pages[i] = s;
				318	src->stripe_pages[i] = NULL;
				319	}
				320	}
				321
				322	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	323	* merging means we take the bio_list from the victim and
				324	* splice it into the destination. The victim should
				325	* be discarded afterwards.
				326	*
				327	* must be called with dest->rbio_list_lock held
				328	*/
				329	static void merge_rbio(struct btrfs_raid_bio *dest,
				330	struct btrfs_raid_bio *victim)
				331	{
				332	bio_list_merge(&dest->bio_list, &victim->bio_list);
				333	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	334	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	335	bio_list_init(&victim->bio_list);
				336	}
				337
				338	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	339	* used to prune items that are in the cache. The caller
				340	* must hold the hash table lock.
				341	*/
				342	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				343	{
				344	int bucket = rbio_bucket(rbio);
				345	struct btrfs_stripe_hash_table *table;
				346	struct btrfs_stripe_hash *h;
				347	int freeit = 0;
				348
				349	/*
				350	* check the bit again under the hash table lock.
				351	*/
				352	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				353	return;
				354
				355	table = rbio->fs_info->stripe_hash_table;
				356	h = table->table + bucket;
				357
				358	/* hold the lock for the bucket because we may be
				359	* removing it from the hash table
				360	*/
				361	spin_lock(&h->lock);
				362
				363	/*
				364	* hold the lock for the bio list because we need
				365	* to make sure the bio list is empty
				366	*/
				367	spin_lock(&rbio->bio_list_lock);
				368
				369	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				370	list_del_init(&rbio->stripe_cache);
				371	table->cache_size -= 1;
				372	freeit = 1;
				373
				374	/* if the bio list isn't empty, this rbio is
				375	* still involved in an IO. We take it out
				376	* of the cache list, and drop the ref that
				377	* was held for the list.
				378	*
				379	* If the bio_list was empty, we also remove
				380	* the rbio from the hash_table, and drop
				381	* the corresponding ref
				382	*/
				383	if (bio_list_empty(&rbio->bio_list)) {
				384	if (!list_empty(&rbio->hash_list)) {
				385	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	386	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	387	BUG_ON(!list_empty(&rbio->plug_list));
				388	}
				389	}
				390	}
				391
				392	spin_unlock(&rbio->bio_list_lock);
				393	spin_unlock(&h->lock);
				394
				395	if (freeit)
				396	__free_raid_bio(rbio);
				397	}
				398
				399	/*
				400	* prune a given rbio from the cache
				401	*/
				402	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				403	{
				404	struct btrfs_stripe_hash_table *table;
				405	unsigned long flags;
				406
				407	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				408	return;
				409
				410	table = rbio->fs_info->stripe_hash_table;
				411
				412	spin_lock_irqsave(&table->cache_lock, flags);
				413	__remove_rbio_from_cache(rbio);
				414	spin_unlock_irqrestore(&table->cache_lock, flags);
				415	}
				416
				417	/*
				418	* remove everything in the cache
				419	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	420	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	421	{
				422	struct btrfs_stripe_hash_table *table;
				423	unsigned long flags;
				424	struct btrfs_raid_bio *rbio;
				425
				426	table = info->stripe_hash_table;
				427
				428	spin_lock_irqsave(&table->cache_lock, flags);
				429	while (!list_empty(&table->stripe_cache)) {
				430	rbio = list_entry(table->stripe_cache.next,
				431	struct btrfs_raid_bio,
				432	stripe_cache);
				433	__remove_rbio_from_cache(rbio);
				434	}
				435	spin_unlock_irqrestore(&table->cache_lock, flags);
				436	}
				437
				438	/*
				439	* remove all cached entries and free the hash table
				440	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	441	*/
				442	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				443	{
				444	if (!info->stripe_hash_table)
				445	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	446	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	447	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	448	info->stripe_hash_table = NULL;
				449	}
				450
				451	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	452	* insert an rbio into the stripe cache. It
				453	* must have already been prepared by calling
				454	* cache_rbio_pages
				455	*
				456	* If this rbio was already cached, it gets
				457	* moved to the front of the lru.
				458	*
				459	* If the size of the rbio cache is too big, we
				460	* prune an item.
				461	*/
				462	static void cache_rbio(struct btrfs_raid_bio *rbio)
				463	{
				464	struct btrfs_stripe_hash_table *table;
				465	unsigned long flags;
				466
				467	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				468	return;
				469
				470	table = rbio->fs_info->stripe_hash_table;
				471
				472	spin_lock_irqsave(&table->cache_lock, flags);
				473	spin_lock(&rbio->bio_list_lock);
				474
				475	/* bump our ref if we were not in the list before */
				476	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	477	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	478
				479	if (!list_empty(&rbio->stripe_cache)){
				480	list_move(&rbio->stripe_cache, &table->stripe_cache);
				481	} else {
				482	list_add(&rbio->stripe_cache, &table->stripe_cache);
				483	table->cache_size += 1;
				484	}
				485
				486	spin_unlock(&rbio->bio_list_lock);
				487
				488	if (table->cache_size > RBIO_CACHE_SIZE) {
				489	struct btrfs_raid_bio *found;
				490
				491	found = list_entry(table->stripe_cache.prev,
				492	struct btrfs_raid_bio,
				493	stripe_cache);
				494
				495	if (found != rbio)
				496	__remove_rbio_from_cache(found);
				497	}
				498
				499	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	500	}
				501
				502	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	503	* helper function to run the xor_blocks api. It is only
				504	* able to do MAX_XOR_BLOCKS at a time, so we need to
				505	* loop through.
				506	*/
				507	static void run_xor(void **pages, int src_cnt, ssize_t len)
				508	{
				509	int src_off = 0;
				510	int xor_src_cnt = 0;
				511	void *dest = pages[src_cnt];
				512
				513	while(src_cnt > 0) {
				514	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				515	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				516
				517	src_cnt -= xor_src_cnt;
				518	src_off += xor_src_cnt;
				519	}
				520	}
				521
				522	/*
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	523	* Returns true if the bio list inside this rbio covers an entire stripe (no
				524	* rmw required).
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	525	*/
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	526	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				527	{
				528	unsigned long flags;
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	529	unsigned long size = rbio->bio_list_bytes;
				530	int ret = 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	531
				532	spin_lock_irqsave(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	533	if (size != rbio->nr_data * rbio->stripe_len)
				534	ret = 0;
				535	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	536	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	537
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	538	return ret;
				539	}
				540
				541	/*
				542	* returns 1 if it is safe to merge two rbios together.
				543	* The merging is safe if the two rbios correspond to
				544	* the same stripe and if they are both going in the same
				545	* direction (read vs write), and if neither one is
				546	* locked for final IO
				547	*
				548	* The caller is responsible for locking such that
				549	* rmw_locked is safe to test
				550	*/
				551	static int rbio_can_merge(struct btrfs_raid_bio *last,
				552	struct btrfs_raid_bio *cur)
				553	{
				554	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				555	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				556	return 0;
				557
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	558	/*
				559	* we can't merge with cached rbios, since the
				560	* idea is that when we merge the destination
				561	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	562	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	563	* handle that.
				564	*/
				565	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				566	test_bit(RBIO_CACHE_BIT, &cur->flags))
				567	return 0;
				568
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	569	if (last->bbio->raid_map[0] !=
				570	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	571	return 0;
				572
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	573	/* we can't merge with different operations */
				574	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	575	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	576	/*
				577	* We've need read the full stripe from the drive.
				578	* check and repair the parity and write the new results.
				579	*
				580	* We're not allowed to add any new bios to the
				581	* bio list here, anyone else that wants to
				582	* change this stripe needs to do their own rmw.
				583	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	584	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	585	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	586
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	587	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	588	return 0;
				589
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	590	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				591	int fa = last->faila;
				592	int fb = last->failb;
				593	int cur_fa = cur->faila;
				594	int cur_fb = cur->failb;
				595
				596	if (last->faila >= last->failb) {
				597	fa = last->failb;
				598	fb = last->faila;
				599	}
				600
				601	if (cur->faila >= cur->failb) {
				602	cur_fa = cur->failb;
				603	cur_fb = cur->faila;
				604	}
				605
				606	if (fa != cur_fa \|\| fb != cur_fb)
				607	return 0;
				608	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	609	return 1;
				610	}
				611
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	612	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				613	int index)
				614	{
				615	return stripe * rbio->stripe_npages + index;
				616	}
				617
				618	/*
				619	* these are just the pages from the rbio array, not from anything
				620	* the FS sent down to us
				621	*/
				622	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				623	int index)
				624	{
				625	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				626	}
				627
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	628	/*
				629	* helper to index into the pstripe
				630	*/
				631	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				632	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	633	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	634	}
				635
				636	/*
				637	* helper to index into the qstripe, returns null
				638	* if there is no qstripe
				639	*/
				640	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				641	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	642	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	643	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	644	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	645	}
				646
				647	/*
				648	* The first stripe in the table for a logical address
				649	* has the lock. rbios are added in one of three ways:
				650	*
				651	* 1) Nobody has the stripe locked yet. The rbio is given
				652	* the lock and 0 is returned. The caller must start the IO
				653	* themselves.
				654	*
				655	* 2) Someone has the stripe locked, but we're able to merge
				656	* with the lock owner. The rbio is freed and the IO will
				657	* start automatically along with the existing rbio. 1 is returned.
				658	*
				659	* 3) Someone has the stripe locked, but we're not able to merge.
				660	* The rbio is added to the lock owner's plug list, or merged into
				661	* an rbio already on the plug list. When the lock owner unlocks,
				662	* the next rbio on the list is run and the IO is started automatically.
				663	* 1 is returned
				664	*
				665	* If we return 0, the caller still owns the rbio and must continue with
				666	* IO submission. If we return 1, the caller must assume the rbio has
				667	* already been freed.
				668	*/
				669	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				670	{
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	671	struct btrfs_stripe_hash *h;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	672	struct btrfs_raid_bio *cur;
				673	struct btrfs_raid_bio *pending;
				674	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	675	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	676	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	677	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	678
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	679	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
				680
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	681	spin_lock_irqsave(&h->lock, flags);
				682	list_for_each_entry(cur, &h->hash_list, hash_list) {
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	683	if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
				684	continue;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	685
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	686	spin_lock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	687
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	688	/* Can we steal this cached rbio's pages? */
				689	if (bio_list_empty(&cur->bio_list) &&
				690	list_empty(&cur->plug_list) &&
				691	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				692	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				693	list_del_init(&cur->hash_list);
				694	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	695
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	696	steal_rbio(cur, rbio);
				697	cache_drop = cur;
				698	spin_unlock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	699
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	700	goto lockit;
				701	}
				702
				703	/* Can we merge into the lock owner? */
				704	if (rbio_can_merge(cur, rbio)) {
				705	merge_rbio(cur, rbio);
				706	spin_unlock(&cur->bio_list_lock);
				707	freeit = rbio;
				708	ret = 1;
				709	goto out;
				710	}
				711
				712
				713	/*
				714	* We couldn't merge with the running rbio, see if we can merge
				715	* with the pending ones. We don't have to check for rmw_locked
				716	* because there is no way they are inside finish_rmw right now
				717	*/
				718	list_for_each_entry(pending, &cur->plug_list, plug_list) {
				719	if (rbio_can_merge(pending, rbio)) {
				720	merge_rbio(pending, rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	721	spin_unlock(&cur->bio_list_lock);
				722	freeit = rbio;
				723	ret = 1;
				724	goto out;
				725	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	726	}
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	727
				728	/*
				729	* No merging, put us on the tail of the plug list, our rbio
				730	* will be started with the currently running rbio unlocks
				731	*/
				732	list_add_tail(&rbio->plug_list, &cur->plug_list);
				733	spin_unlock(&cur->bio_list_lock);
				734	ret = 1;
				735	goto out;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	736	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	737	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	738	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	739	list_add(&rbio->hash_list, &h->hash_list);
				740	out:
				741	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	742	if (cache_drop)
				743	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	744	if (freeit)
				745	__free_raid_bio(freeit);
				746	return ret;
				747	}
				748
				749	/*
				750	* called as rmw or parity rebuild is completed. If the plug list has more
				751	* rbios waiting for this stripe, the next one on the list will be started
				752	*/
				753	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				754	{
				755	int bucket;
				756	struct btrfs_stripe_hash *h;
				757	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	758	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	759
				760	bucket = rbio_bucket(rbio);
				761	h = rbio->fs_info->stripe_hash_table->table + bucket;
				762
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	763	if (list_empty(&rbio->plug_list))
				764	cache_rbio(rbio);
				765
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	766	spin_lock_irqsave(&h->lock, flags);
				767	spin_lock(&rbio->bio_list_lock);
				768
				769	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	770	/*
				771	* if we're still cached and there is no other IO
				772	* to perform, just leave this rbio here for others
				773	* to steal from later
				774	*/
				775	if (list_empty(&rbio->plug_list) &&
				776	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				777	keep_cache = 1;
				778	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				779	BUG_ON(!bio_list_empty(&rbio->bio_list));
				780	goto done;
				781	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	782
				783	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	784	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	785
				786	/*
				787	* we use the plug list to hold all the rbios
				788	* waiting for the chance to lock this stripe.
				789	* hand the lock over to one of them.
				790	*/
				791	if (!list_empty(&rbio->plug_list)) {
				792	struct btrfs_raid_bio *next;
				793	struct list_head *head = rbio->plug_list.next;
				794
				795	next = list_entry(head, struct btrfs_raid_bio,
				796	plug_list);
				797
				798	list_del_init(&rbio->plug_list);
				799
				800	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	801	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	802	spin_unlock(&rbio->bio_list_lock);
				803	spin_unlock_irqrestore(&h->lock, flags);
				804
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	805	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	806	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	807	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				808	steal_rbio(rbio, next);
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	809	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	810	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	811	steal_rbio(rbio, next);
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	812	start_async_work(next, rmw_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	813	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				814	steal_rbio(rbio, next);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	815	start_async_work(next, scrub_parity_work);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	816	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	817
				818	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	819	}
				820	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	821	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	822	spin_unlock(&rbio->bio_list_lock);
				823	spin_unlock_irqrestore(&h->lock, flags);
				824
				825	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	826	if (!keep_cache)
				827	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	828	}
				829
				830	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				831	{
				832	int i;
				833
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	834	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	835	return;
				836
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	837	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	838	WARN_ON(!list_empty(&rbio->hash_list));
				839	WARN_ON(!bio_list_empty(&rbio->bio_list));
				840
				841	for (i = 0; i < rbio->nr_pages; i++) {
				842	if (rbio->stripe_pages[i]) {
				843	__free_page(rbio->stripe_pages[i]);
				844	rbio->stripe_pages[i] = NULL;
				845	}
				846	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	847
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	848	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	849	kfree(rbio);
				850	}
				851
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	852	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	853	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	854	struct bio *next;
				855
				856	while (cur) {
				857	next = cur->bi_next;
				858	cur->bi_next = NULL;
				859	cur->bi_status = err;
				860	bio_endio(cur);
				861	cur = next;
				862	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	863	}
				864
				865	/*
				866	* this frees the rbio and runs through all the bios in the
				867	* bio_list and calls end_io on them
				868	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	869	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	870	{
				871	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	872	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	873
				874	if (rbio->generic_bio_cnt)
				875	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				876
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	877	/*
				878	* At this moment, rbio->bio_list is empty, however since rbio does not
				879	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				880	* hash list, rbio may be merged with others so that rbio->bio_list
				881	* becomes non-empty.
				882	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				883	* more and we can call bio_endio() on all queued bios.
				884	*/
				885	unlock_stripe(rbio);
				886	extra = bio_list_get(&rbio->bio_list);
				887	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	888
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	889	rbio_endio_bio_list(cur, err);
				890	if (extra)
				891	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	892	}
				893
				894	/*
				895	* end io function used by finish_rmw. When we finally
				896	* get here, we've written a full stripe
				897	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	898	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	899	{
				900	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	901	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	902	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	903
				904	if (err)
				905	fail_bio_stripe(rbio, bio);
				906
				907	bio_put(bio);
				908
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	909	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	910	return;
				911
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	912	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	913
				914	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	915	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				916	0 : rbio->bbio->max_errors;
				917	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	918	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	919
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	920	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	921	}
				922
				923	/*
				924	* the read/modify/write code wants to use the original bio for
				925	* any pages it included, and then use the rbio for everything
				926	* else. This function decides if a given index (stripe number)
				927	* and page number in that stripe fall inside the original bio
				928	* or the rbio.
				929	*
				930	* if you set bio_list_only, you'll get a NULL back for any ranges
				931	* that are outside the bio_list
				932	*
				933	* This doesn't take any refs on anything, you get a bare page pointer
				934	* and the caller must bump refs as required.
				935	*
				936	* You must call index_rbio_pages once before you can trust
				937	* the answers from this function.
				938	*/
				939	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				940	int index, int pagenr, int bio_list_only)
				941	{
				942	int chunk_page;
				943	struct page *p = NULL;
				944
				945	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				946
				947	spin_lock_irq(&rbio->bio_list_lock);
				948	p = rbio->bio_pages[chunk_page];
				949	spin_unlock_irq(&rbio->bio_list_lock);
				950
				951	if (p \|\| bio_list_only)
				952	return p;
				953
				954	return rbio->stripe_pages[chunk_page];
				955	}
				956
				957	/*
				958	* number of pages we need for the entire stripe across all the
				959	* drives
				960	*/
				961	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				962	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	963	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	964	}
				965
				966	/*
				967	* allocation and initial setup for the btrfs_raid_bio. Not
				968	* this does not allocate any pages for rbio->pages.
				969	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	970	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				971	struct btrfs_bio *bbio,
				972	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	973	{
				974	struct btrfs_raid_bio *rbio;
				975	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	976	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				977	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	978	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	979	void *p;
				980
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	981	rbio = kzalloc(sizeof(*rbio) +
				982	sizeof(rbio->stripe_pages) num_pages +
				983	sizeof(rbio->bio_pages) num_pages +
				984	sizeof(rbio->finish_pointers) real_stripes +
				985	sizeof(rbio->dbitmap) BITS_TO_LONGS(stripe_npages) +
				986	sizeof(rbio->finish_pbitmap)
				987	BITS_TO_LONGS(stripe_npages),
				988	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	989	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	990	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	991
				992	bio_list_init(&rbio->bio_list);
				993	INIT_LIST_HEAD(&rbio->plug_list);
				994	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	995	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	996	INIT_LIST_HEAD(&rbio->hash_list);
				997	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	998	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	999	rbio->stripe_len = stripe_len;
				1000	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1001	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1002	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1003	rbio->faila = -1;
				1004	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	1005	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1006	atomic_set(&rbio->error, 0);
				1007	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1008
				1009	/*
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1010	* the stripe_pages, bio_pages, etc arrays point to the extra
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1011	* memory we allocated past the end of the rbio
				1012	*/
				1013	p = rbio + 1;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1014	#define CONSUME_ALLOC(ptr, count) do { \
				1015	ptr = p; \
				1016	p = (unsigned char )p + sizeof((ptr)) * (count); \
				1017	} while (0)
				1018	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
				1019	CONSUME_ALLOC(rbio->bio_pages, num_pages);
				1020	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
				1021	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
				1022	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
				1023	#undef CONSUME_ALLOC
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1024
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1025	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1026	nr_data = real_stripes - 1;
				1027	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1028	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1029	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1030	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1031
				1032	rbio->nr_data = nr_data;
				1033	return rbio;
				1034	}
				1035
				1036	/* allocate pages for all the stripes in the bio, including parity */
				1037	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1038	{
				1039	int i;
				1040	struct page *page;
				1041
				1042	for (i = 0; i < rbio->nr_pages; i++) {
				1043	if (rbio->stripe_pages[i])
				1044	continue;
				1045	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1046	if (!page)
				1047	return -ENOMEM;
				1048	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1049	}
				1050	return 0;
				1051	}
				1052
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1053	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1054	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1055	{
				1056	int i;
				1057	struct page *page;
				1058
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1059	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1060
				1061	for (; i < rbio->nr_pages; i++) {
				1062	if (rbio->stripe_pages[i])
				1063	continue;
				1064	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1065	if (!page)
				1066	return -ENOMEM;
				1067	rbio->stripe_pages[i] = page;
				1068	}
				1069	return 0;
				1070	}
				1071
				1072	/*
				1073	* add a single page from a specific stripe into our list of bios for IO
				1074	* this will try to merge into existing bios if possible, and returns
				1075	* zero if all went well.
				1076	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1077	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1078	struct bio_list *bio_list,
				1079	struct page *page,
				1080	int stripe_nr,
				1081	unsigned long page_index,
				1082	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1083	{
				1084	struct bio *last = bio_list->tail;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1085	int ret;
				1086	struct bio *bio;
				1087	struct btrfs_bio_stripe *stripe;
				1088	u64 disk_start;
				1089
				1090	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1091	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1092
				1093	/* if the device is missing, just fail this stripe */
				1094	if (!stripe->dev->bdev)
				1095	return fail_rbio_index(rbio, stripe_nr);
				1096
				1097	/* see if we can add this page onto our existing bio */
				1098	if (last) {
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1099	u64 last_end = last->bi_iter.bi_sector << 9;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1100	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1101
				1102	/*
				1103	* we can't merge these if they are from different
				1104	* devices or if they are not contiguous
				1105	*/
Nikolay Borisov	f90ae76	2020-07-02 16:46:42 +0300	[diff] [blame]	1106	if (last_end == disk_start && !last->bi_status &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1107	last->bi_disk == stripe->dev->bdev->bd_disk &&
				1108	last->bi_partno == stripe->dev->bdev->bd_partno) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1109	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1110	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1111	return 0;
				1112	}
				1113	}
				1114
				1115	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1116	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Nikolay Borisov	c31efbd	2020-07-03 11:14:27 +0300	[diff] [blame]	1117	btrfs_io_bio(bio)->device = stripe->dev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1118	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1119	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1120	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1121
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1122	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1123	bio_list_add(bio_list, bio);
				1124	return 0;
				1125	}
				1126
				1127	/*
				1128	* while we're doing the read/modify/write cycle, we could
				1129	* have errors in reading pages off the disk. This checks
				1130	* for errors and if we're not able to read the page it'll
				1131	* trigger parity reconstruction. The rmw will be finished
				1132	* after we've reconstructed the failed stripes
				1133	*/
				1134	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1135	{
				1136	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1137	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1138	__raid56_parity_recover(rbio);
				1139	} else {
				1140	finish_rmw(rbio);
				1141	}
				1142	}
				1143
				1144	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1145	* helper function to walk our bio list and populate the bio_pages array with
				1146	* the result. This seems expensive, but it is faster than constantly
				1147	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1148	* reconstruction.
				1149	*
				1150	* This must be called before you trust the answers from page_in_rbio
				1151	*/
				1152	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1153	{
				1154	struct bio *bio;
				1155	u64 start;
				1156	unsigned long stripe_offset;
				1157	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1158
				1159	spin_lock_irq(&rbio->bio_list_lock);
				1160	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1161	struct bio_vec bvec;
				1162	struct bvec_iter iter;
				1163	int i = 0;
				1164
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1165	start = bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1166	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1167	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1168
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1169	if (bio_flagged(bio, BIO_CLONED))
				1170	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1171
				1172	bio_for_each_segment(bvec, bio, iter) {
				1173	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1174	i++;
				1175	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1176	}
				1177	spin_unlock_irq(&rbio->bio_list_lock);
				1178	}
				1179
/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 *
 * Nothing is returned.  If building the write bios fails, the rbio is
 * ended with BLK_STS_IOERR through rbio_orig_end_io() and the bios built
 * so far are dropped.  Otherwise completion is reported asynchronously via
 * raid_write_end_io().
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	/* scratch array of mapped page addresses, one per real stripe */
	void **pointers = rbio->finish_pointers;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	bool has_qstripe;
	struct bio_list bio_list;
	struct bio *bio;
	int ret;

	bio_list_init(&bio_list);

	/* one non-data stripe means P only, two means P and Q */
	if (rbio->real_stripes - rbio->nr_data == 1)
		has_qstripe = false;
	else if (rbio->real_stripes - rbio->nr_data == 2)
		has_qstripe = true;
	else
		BUG();

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	/* start the write phase with a clean error count */
	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	/* compute parity one page offset at a time across all stripes */
	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (has_qstripe) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/*
			 * raid5: seed the parity page with data stripe 0, then
			 * hand the remaining pointers to run_xor.
			 * NOTE(review): run_xor is defined outside this view;
			 * presumably it XORs the other data pages into the
			 * parity page - confirm.
			 */
			copy_page(pointers[nr_data], pointers[0]);
			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
		}


		/* undo the kmap() of every page used for this offset */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				/* data stripe: only pages that came in via bios */
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
			       /* p/q stripe: every page gets written */
			       page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	/* no target devices in this mapping: the bios above are all we need */
	if (likely(!bbio->num_tgtdevs))
		goto write_data;

	/*
	 * queue the same pages a second time, addressed to the stripe index
	 * found in tgtdev_map, for every stripe with a non-zero entry there
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bbio->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
			       page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bbio->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	/* raid_write_end_io() counts this down, one per completed bio */
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		bio->bi_opf = REQ_OP_WRITE;

		submit_bio(bio);
	}
	return;

cleanup:
	/* fail the whole rbio, then release the bios that were never submitted */
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);
}
				1341
				1342	/*
				1343	* helper to find the stripe number for a given bio. Used to figure out which
				1344	* stripe has failed. This expects the bio to correspond to a physical disk,
				1345	* so it looks up based on physical sector numbers.
				1346	*/
				1347	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1348	struct bio *bio)
				1349	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1350	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1351	int i;
				1352	struct btrfs_bio_stripe *stripe;
				1353
				1354	physical <<= 9;
				1355
				1356	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1357	stripe = &rbio->bbio->stripes[i];
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1358	if (in_range(physical, stripe->physical, rbio->stripe_len) &&
Dmitriy Gorokh	047fdea	2018-02-16 19:51:38 +0000	[diff] [blame]	1359	stripe->dev->bdev &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1360	bio->bi_disk == stripe->dev->bdev->bd_disk &&
				1361	bio->bi_partno == stripe->dev->bdev->bd_partno) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1362	return i;
				1363	}
				1364	}
				1365	return -1;
				1366	}
				1367
				1368	/*
				1369	* helper to find the stripe number for a given
				1370	* bio (before mapping). Used to figure out which stripe has
				1371	* failed. This looks up based on logical block numbers.
				1372	*/
				1373	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1374	struct bio *bio)
				1375	{
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1376	u64 logical = bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1377	int i;
				1378
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1379	for (i = 0; i < rbio->nr_data; i++) {
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1380	u64 stripe_start = rbio->bbio->raid_map[i];
				1381
				1382	if (in_range(logical, stripe_start, rbio->stripe_len))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1383	return i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1384	}
				1385	return -1;
				1386	}
				1387
				1388	/*
				1389	* returns -EIO if we had too many failures
				1390	*/
				1391	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1392	{
				1393	unsigned long flags;
				1394	int ret = 0;
				1395
				1396	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1397
				1398	/* we already know this stripe is bad, move on */
				1399	if (rbio->faila == failed \|\| rbio->failb == failed)
				1400	goto out;
				1401
				1402	if (rbio->faila == -1) {
				1403	/* first failure on this rbio */
				1404	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1405	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1406	} else if (rbio->failb == -1) {
				1407	/* second failure on this rbio */
				1408	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1409	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1410	} else {
				1411	ret = -EIO;
				1412	}
				1413	out:
				1414	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1415
				1416	return ret;
				1417	}
				1418
				1419	/*
				1420	* helper to fail a stripe based on a physical disk
				1421	* bio.
				1422	*/
				1423	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1424	struct bio *bio)
				1425	{
				1426	int failed = find_bio_stripe(rbio, bio);
				1427
				1428	if (failed < 0)
				1429	return -EIO;
				1430
				1431	return fail_rbio_index(rbio, failed);
				1432	}
				1433
				1434	/*
				1435	* this sets each page in the bio uptodate. It should only be used on private
				1436	* rbio pages, nothing that comes in from the higher layers
				1437	*/
				1438	static void set_bio_pages_uptodate(struct bio *bio)
				1439	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1440	struct bio_vec *bvec;
Ming Lei	6dc4f10	2019-02-15 19:13:19 +0800	[diff] [blame]	1441	struct bvec_iter_all iter_all;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1442
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1443	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1444
Christoph Hellwig	2b070cf	2019-04-25 09:03:00 +0200	[diff] [blame]	1445	bio_for_each_segment_all(bvec, bio, iter_all)
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1446	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1447	}
				1448
				1449	/*
				1450	* end io for the read phase of the rmw cycle. All the bios here are physical
				1451	* stripe bios we've read from the disk so we can recalculate the parity of the
				1452	* stripe.
				1453	*
				1454	* This will usually kick off finish_rmw once all the bios are read in, but it
				1455	* may trigger parity reconstruction if we had any errors along the way
				1456	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1457	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1458	{
				1459	struct btrfs_raid_bio *rbio = bio->bi_private;
				1460
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1461	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1462	fail_bio_stripe(rbio, bio);
				1463	else
				1464	set_bio_pages_uptodate(bio);
				1465
				1466	bio_put(bio);
				1467
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1468	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1469	return;
				1470
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1471	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1472	goto cleanup;
				1473
				1474	/*
				1475	* this will normally call finish_rmw to start our write
				1476	* but if there are any failed stripes we'll reconstruct
				1477	* from parity first
				1478	*/
				1479	validate_rbio_for_rmw(rbio);
				1480	return;
				1481
				1482	cleanup:
				1483
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1484	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1485	}
				1486
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1487	/*
				1488	* the stripe must be locked by the caller. It will
				1489	* unlock after all the writes are done
				1490	*/
				1491	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1492	{
				1493	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1494	struct bio_list bio_list;
				1495	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1496	int pagenr;
				1497	int stripe;
				1498	struct bio *bio;
				1499
				1500	bio_list_init(&bio_list);
				1501
				1502	ret = alloc_rbio_pages(rbio);
				1503	if (ret)
				1504	goto cleanup;
				1505
				1506	index_rbio_pages(rbio);
				1507
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1508	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1509	/*
				1510	* build a list of bios to read all the missing parts of this
				1511	* stripe
				1512	*/
				1513	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1514	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1515	struct page *page;
				1516	/*
				1517	* we want to find all the pages missing from
				1518	* the rbio and read them from the disk. If
				1519	* page_in_rbio finds a page in the bio list
				1520	* we don't need to read it off the stripe.
				1521	*/
				1522	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1523	if (page)
				1524	continue;
				1525
				1526	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1527	/*
				1528	* the bio cache may have handed us an uptodate
				1529	* page. If so, be happy and use it
				1530	*/
				1531	if (PageUptodate(page))
				1532	continue;
				1533
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1534	ret = rbio_add_io_page(rbio, &bio_list, page,
				1535	stripe, pagenr, rbio->stripe_len);
				1536	if (ret)
				1537	goto cleanup;
				1538	}
				1539	}
				1540
				1541	bios_to_read = bio_list_size(&bio_list);
				1542	if (!bios_to_read) {
				1543	/*
				1544	* this can happen if others have merged with
				1545	* us, it means there is nothing left to read.
				1546	* But if there are missing devices it may not be
				1547	* safe to do the full stripe write yet.
				1548	*/
				1549	goto finish;
				1550	}
				1551
				1552	/*
				1553	* the bbio may be freed once we submit the last bio. Make sure
				1554	* not to touch it after that
				1555	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1556	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	1557	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1558	bio->bi_private = rbio;
				1559	bio->bi_end_io = raid_rmw_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1560	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1561
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1562	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1563
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1564	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1565	}
				1566	/* the actual write will happen once the reads are done */
				1567	return 0;
				1568
				1569	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1570	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1571
				1572	while ((bio = bio_list_pop(&bio_list)))
				1573	bio_put(bio);
				1574
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1575	return -EIO;
				1576
				1577	finish:
				1578	validate_rbio_for_rmw(rbio);
				1579	return 0;
				1580	}
				1581
				1582	/*
				1583	* if the upper layers pass in a full stripe, we thank them by only allocating
				1584	* enough pages to hold the parity, and sending it all down quickly.
				1585	*/
				1586	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1587	{
				1588	int ret;
				1589
				1590	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1591	if (ret) {
				1592	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1593	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1594	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1595
				1596	ret = lock_stripe_add(rbio);
				1597	if (ret == 0)
				1598	finish_rmw(rbio);
				1599	return 0;
				1600	}
				1601
				1602	/*
				1603	* partial stripe writes get handed over to async helpers.
				1604	* We're really hoping to merge a few more writes into this
				1605	* rbio before calculating new parity
				1606	*/
				1607	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1608	{
				1609	int ret;
				1610
				1611	ret = lock_stripe_add(rbio);
				1612	if (ret == 0)
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	1613	start_async_work(rbio, rmw_work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1614	return 0;
				1615	}
				1616
				1617	/*
				1618	* sometimes while we were reading from the drive to
				1619	* recalculate parity, enough new bios come into create
				1620	* a full stripe. So we do a check here to see if we can
				1621	* go directly to finish_rmw
				1622	*/
				1623	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1624	{
				1625	/* head off into rmw land if we don't have a full stripe */
				1626	if (!rbio_is_full(rbio))
				1627	return partial_stripe_write(rbio);
				1628	return full_stripe_write(rbio);
				1629	}
				1630
/*
 * We use plugging callbacks to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios.
 */
struct btrfs_plug_cb {
	/* block layer plug callback; container_of() on it recovers this struct */
	struct blk_plug_cb cb;
	/* owning filesystem; filled in the first time the callback is used */
	struct btrfs_fs_info *info;
	/* rbios collected while plugged, linked through their plug_list */
	struct list_head rbio_list;
	/* work item used to run the plug from a helper thread */
	struct btrfs_work work;
};
				1644
				1645	/*
				1646	* rbios on the plug list are sorted for easier merging.
				1647	*/
				1648	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1649	{
				1650	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1651	plug_list);
				1652	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1653	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1654	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1655	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1656
				1657	if (a_sector < b_sector)
				1658	return -1;
				1659	if (a_sector > b_sector)
				1660	return 1;
				1661	return 0;
				1662	}
				1663
				1664	static void run_plug(struct btrfs_plug_cb *plug)
				1665	{
				1666	struct btrfs_raid_bio *cur;
				1667	struct btrfs_raid_bio *last = NULL;
				1668
				1669	/*
				1670	* sort our plug list then try to merge
				1671	* everything we can in hopes of creating full
				1672	* stripes.
				1673	*/
				1674	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1675	while (!list_empty(&plug->rbio_list)) {
				1676	cur = list_entry(plug->rbio_list.next,
				1677	struct btrfs_raid_bio, plug_list);
				1678	list_del_init(&cur->plug_list);
				1679
				1680	if (rbio_is_full(cur)) {
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1681	int ret;
				1682
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1683	/* we have a full stripe, send it down */
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1684	ret = full_stripe_write(cur);
				1685	BUG_ON(ret);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1686	continue;
				1687	}
				1688	if (last) {
				1689	if (rbio_can_merge(last, cur)) {
				1690	merge_rbio(last, cur);
				1691	__free_raid_bio(cur);
				1692	continue;
				1693
				1694	}
				1695	__raid56_parity_write(last);
				1696	}
				1697	last = cur;
				1698	}
				1699	if (last) {
				1700	__raid56_parity_write(last);
				1701	}
				1702	kfree(plug);
				1703	}
				1704
				1705	/*
				1706	* if the unplug comes from schedule, we have to push the
				1707	* work off to a helper thread
				1708	*/
				1709	static void unplug_work(struct btrfs_work *work)
				1710	{
				1711	struct btrfs_plug_cb *plug;
				1712	plug = container_of(work, struct btrfs_plug_cb, work);
				1713	run_plug(plug);
				1714	}
				1715
				1716	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1717	{
				1718	struct btrfs_plug_cb *plug;
				1719	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1720
				1721	if (from_schedule) {
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	1722	btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1723	btrfs_queue_work(plug->info->rmw_workers,
				1724	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1725	return;
				1726	}
				1727	run_plug(plug);
				1728	}
				1729
				1730	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1731	* our main entry point for writes from the rest of the FS.
				1732	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1733	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1734	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1735	{
				1736	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1737	struct btrfs_plug_cb *plug = NULL;
				1738	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1739	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1740
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1741	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1742	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1743	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1744	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1745	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1746	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1747	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1748	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1749
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1750	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1751	rbio->generic_bio_cnt = 1;
				1752
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1753	/*
				1754	* don't plug on full rbios, just get them out the door
				1755	* as quickly as we can
				1756	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1757	if (rbio_is_full(rbio)) {
				1758	ret = full_stripe_write(rbio);
				1759	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1760	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1761	return ret;
				1762	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1763
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1764	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1765	if (cb) {
				1766	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1767	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1768	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1769	INIT_LIST_HEAD(&plug->rbio_list);
				1770	}
				1771	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1772	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1773	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1774	ret = __raid56_parity_write(rbio);
				1775	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1776	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1777	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1778	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1779	}
				1780
				1781	/*
				1782	* all parity reconstruction happens here. We've read in everything
				1783	* we can find from the drives and this does the heavy lifting of
				1784	* sorting the good from the bad.
				1785	*/
				1786	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1787	{
				1788	int pagenr, stripe;
				1789	void **pointers;
				1790	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1791	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1792	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1793	int i;
				1794
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1795	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1796	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1797	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1798	goto cleanup_io;
				1799	}
				1800
				1801	faila = rbio->faila;
				1802	failb = rbio->failb;
				1803
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1804	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1805	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1806	spin_lock_irq(&rbio->bio_list_lock);
				1807	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1808	spin_unlock_irq(&rbio->bio_list_lock);
				1809	}
				1810
				1811	index_rbio_pages(rbio);
				1812
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1813	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1814	/*
				1815	* Now we just use bitmap to mark the horizontal stripes in
				1816	* which we have data when doing parity scrub.
				1817	*/
				1818	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1819	!test_bit(pagenr, rbio->dbitmap))
				1820	continue;
				1821
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1822	/* setup our array of pointers with pages
				1823	* from each stripe
				1824	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1825	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1826	/*
				1827	* if we're rebuilding a read, we have to use
				1828	* pages from the bio list
				1829	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1830	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1831	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1832	(stripe == faila \|\| stripe == failb)) {
				1833	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1834	} else {
				1835	page = rbio_stripe_page(rbio, stripe, pagenr);
				1836	}
				1837	pointers[stripe] = kmap(page);
				1838	}
				1839
				1840	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1841	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1842	/*
				1843	* single failure, rebuild from parity raid5
				1844	* style
				1845	*/
				1846	if (failb < 0) {
				1847	if (faila == rbio->nr_data) {
				1848	/*
				1849	* Just the P stripe has failed, without
				1850	* a bad data or Q stripe.
				1851	* TODO, we should redo the xor here.
				1852	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1853	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1854	goto cleanup;
				1855	}
				1856	/*
				1857	* a single failure in raid6 is rebuilt
				1858	* in the pstripe code below
				1859	*/
				1860	goto pstripe;
				1861	}
				1862
				1863	/* make sure our ps and qs are in order */
Nikolay Borisov	b7d2083	2020-07-02 16:46:46 +0300	[diff] [blame]	1864	if (faila > failb)
				1865	swap(faila, failb);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1866
				1867	/* if the q stripe is failed, do a pstripe reconstruction
				1868	* from the xors.
				1869	* If both the q stripe and the P stripe are failed, we're
				1870	* here due to a crc mismatch and we can't give them the
				1871	* data they want
				1872	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1873	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1874	if (rbio->bbio->raid_map[faila] ==
				1875	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1876	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1877	goto cleanup;
				1878	}
				1879	/*
				1880	* otherwise we have one bad data stripe and
				1881	* a good P stripe. raid5!
				1882	*/
				1883	goto pstripe;
				1884	}
				1885
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1886	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1887	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1888	PAGE_SIZE, faila, pointers);
				1889	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1890	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1891	PAGE_SIZE, faila, failb,
				1892	pointers);
				1893	}
				1894	} else {
				1895	void *p;
				1896
				1897	/* rebuild from P stripe here (raid5 or raid6) */
				1898	BUG_ON(failb != -1);
				1899	pstripe:
				1900	/* Copy parity block into failed block to start with */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1901	copy_page(pointers[faila], pointers[rbio->nr_data]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1902
				1903	/* rearrange the pointer array */
				1904	p = pointers[faila];
				1905	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1906	pointers[stripe] = pointers[stripe + 1];
				1907	pointers[rbio->nr_data - 1] = p;
				1908
				1909	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1910	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1911	}
				1912	/* if we're doing this rebuild as part of an rmw, go through
				1913	* and set all of our private rbio pages in the
				1914	* failed stripes as uptodate. This way finish_rmw will
				1915	* know they can be trusted. If this was a read reconstruction,
				1916	* other endio functions will fiddle the uptodate bits
				1917	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1918	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1919	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1920	if (faila != -1) {
				1921	page = rbio_stripe_page(rbio, faila, i);
				1922	SetPageUptodate(page);
				1923	}
				1924	if (failb != -1) {
				1925	page = rbio_stripe_page(rbio, failb, i);
				1926	SetPageUptodate(page);
				1927	}
				1928	}
				1929	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1930	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1931	/*
				1932	* if we're rebuilding a read, we have to use
				1933	* pages from the bio list
				1934	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1935	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1936	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1937	(stripe == faila \|\| stripe == failb)) {
				1938	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1939	} else {
				1940	page = rbio_stripe_page(rbio, stripe, pagenr);
				1941	}
				1942	kunmap(page);
				1943	}
				1944	}
				1945
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1946	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1947	cleanup:
				1948	kfree(pointers);
				1949
				1950	cleanup_io:
Liu Bo	580c6ef	2018-03-22 09:20:11 +0800	[diff] [blame]	1951	/*
				1952	* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
				1953	* valid rbio which is consistent with ondisk content, thus such a
				1954	* valid rbio can be cached to avoid further disk reads.
				1955	*/
				1956	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1957	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1958	/*
				1959	* - In case of two failures, where rbio->failb != -1:
				1960	*
				1961	* Do not cache this rbio since the above read reconstruction
				1962	* (raid6_datap_recov() or raid6_2data_recov()) may have
				1963	* changed some content of stripes which are not identical to
				1964	* on-disk content any more, otherwise, a later write/recover
				1965	* may steal stripe_pages from this rbio and end up with
				1966	* corruptions or rebuild failures.
				1967	*
				1968	* - In case of single failure, where rbio->failb == -1:
				1969	*
				1970	* Cache this rbio iff the above read reconstruction is
Andrea Gelmini	52042d8	2018-11-28 12:05:13 +0100	[diff] [blame]	1971	* executed without problems.
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1972	*/
				1973	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1974	cache_rbio_pages(rbio);
				1975	else
				1976	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1977
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1978	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1979	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1980	rbio->faila = -1;
				1981	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1982
				1983	if (rbio->operation == BTRFS_RBIO_WRITE)
				1984	finish_rmw(rbio);
				1985	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1986	finish_parity_scrub(rbio, 0);
				1987	else
				1988	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1989	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1990	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1991	}
				1992	}
				1993
				1994	/*
				1995	* This is called only for stripes we've read from disk to
				1996	* reconstruct the parity.
				1997	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1998	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1999	{
				2000	struct btrfs_raid_bio *rbio = bio->bi_private;
				2001
				2002	/*
				2003	* we only read stripe pages off the disk, set them
				2004	* up to date if there were no errors
				2005	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2006	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2007	fail_bio_stripe(rbio, bio);
				2008	else
				2009	set_bio_pages_uptodate(bio);
				2010	bio_put(bio);
				2011
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2012	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2013	return;
				2014
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2015	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2016	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2017	else
				2018	__raid_recover_end_io(rbio);
				2019	}
				2020
				2021	/*
				2022	* reads everything we need off the disk to reconstruct
				2023	* the parity. endio handlers trigger final reconstruction
				2024	* when the IO is done.
				2025	*
				2026	* This is used both for reads from the higher layers and for
				2027	* parity construction required to finish a rmw cycle.
				2028	*/
				2029	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2030	{
				2031	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2032	struct bio_list bio_list;
				2033	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2034	int pagenr;
				2035	int stripe;
				2036	struct bio *bio;
				2037
				2038	bio_list_init(&bio_list);
				2039
				2040	ret = alloc_rbio_pages(rbio);
				2041	if (ret)
				2042	goto cleanup;
				2043
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2044	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2045
				2046	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2047	* read everything that hasn't failed. Thanks to the
				2048	* stripe cache, it is possible that some or all of these
				2049	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2050	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2051	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2052	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2053	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2054	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2055	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2056
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2057	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2058	struct page *p;
				2059
				2060	/*
				2061	* the rmw code may have already read this
				2062	* page in
				2063	*/
				2064	p = rbio_stripe_page(rbio, stripe, pagenr);
				2065	if (PageUptodate(p))
				2066	continue;
				2067
				2068	ret = rbio_add_io_page(rbio, &bio_list,
				2069	rbio_stripe_page(rbio, stripe, pagenr),
				2070	stripe, pagenr, rbio->stripe_len);
				2071	if (ret < 0)
				2072	goto cleanup;
				2073	}
				2074	}
				2075
				2076	bios_to_read = bio_list_size(&bio_list);
				2077	if (!bios_to_read) {
				2078	/*
				2079	* we might have no bios to read just because the pages
				2080	* were up to date, or we might have no bios to read because
				2081	* the devices were gone.
				2082	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2083	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2084	__raid_recover_end_io(rbio);
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2085	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2086	} else {
				2087	goto cleanup;
				2088	}
				2089	}
				2090
				2091	/*
				2092	* the bbio may be freed once we submit the last bio. Make sure
				2093	* not to touch it after that
				2094	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2095	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2096	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2097	bio->bi_private = rbio;
				2098	bio->bi_end_io = raid_recover_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2099	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2100
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2101	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2102
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2103	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2104	}
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2105
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2106	return 0;
				2107
				2108	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2109	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2110	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2111	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2112
				2113	while ((bio = bio_list_pop(&bio_list)))
				2114	bio_put(bio);
				2115
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2116	return -EIO;
				2117	}
				2118
				2119	/*
				2120	* the main entry point for reads from the higher layers. This
				2121	* is really only called when the normal read path had a failure,
				2122	* so we assume the bio they send down corresponds to a failed part
				2123	* of the drive.
				2124	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2125	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2126	struct btrfs_bio *bbio, u64 stripe_len,
				2127	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2128	{
				2129	struct btrfs_raid_bio *rbio;
				2130	int ret;
				2131
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2132	if (generic_io) {
				2133	ASSERT(bbio->mirror_num == mirror_num);
				2134	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2135	}
				2136
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2137	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2138	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2139	if (generic_io)
				2140	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2141	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2142	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2143
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2144	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2145	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2146	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2147
				2148	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2149	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2150	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2151	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	2152	__func__, bio->bi_iter.bi_sector << 9,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2153	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2154	if (generic_io)
				2155	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2156	kfree(rbio);
				2157	return -EIO;
				2158	}
				2159
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2160	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2161	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2162	rbio->generic_bio_cnt = 1;
				2163	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2164	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2165	}
				2166
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2167	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2168	* Loop retry:
				2169	* for 'mirror == 2', reconstruct from all other stripes.
				2170	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2171	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2172	if (mirror_num > 2) {
				2173	/*
				2174	* 'mirror == 3' is to fail the p stripe and
				2175	* reconstruct from the q stripe. 'mirror > 3' is to
				2176	* fail a data stripe and reconstruct from p+q stripe.
				2177	*/
				2178	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2179	ASSERT(rbio->failb > 0);
				2180	if (rbio->failb <= rbio->faila)
				2181	rbio->failb--;
				2182	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2183
				2184	ret = lock_stripe_add(rbio);
				2185
				2186	/*
				2187	* __raid56_parity_recover will end the bio with
				2188	* any errors it hits. We don't want to return
				2189	* its error value up the stack because our caller
				2190	* will end up calling bio_endio with any nonzero
				2191	* return
				2192	*/
				2193	if (ret == 0)
				2194	__raid56_parity_recover(rbio);
				2195	/*
				2196	* our rbio has been added to the list of
				2197	* rbios that will be handled after the
				2198	* currently lock owner is done
				2199	*/
				2200	return 0;
				2201
				2202	}
				2203
				2204	static void rmw_work(struct btrfs_work *work)
				2205	{
				2206	struct btrfs_raid_bio *rbio;
				2207
				2208	rbio = container_of(work, struct btrfs_raid_bio, work);
				2209	raid56_rmw_stripe(rbio);
				2210	}
				2211
				2212	static void read_rebuild_work(struct btrfs_work *work)
				2213	{
				2214	struct btrfs_raid_bio *rbio;
				2215
				2216	rbio = container_of(work, struct btrfs_raid_bio, work);
				2217	__raid56_parity_recover(rbio);
				2218	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2219
				2220	/*
				2221	* The following code is used to scrub/replace the parity stripe
				2222	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2223	* Caller must have already increased bio_counter for getting @bbio.
				2224	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2225	* Note: We need make sure all the pages that add into the scrub/replace
				2226	* raid bio are correct and not be changed during the scrub/replace. That
				2227	* is those pages just hold metadata or file data with checksum.
				2228	*/
				2229
				2230	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2231	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2232	struct btrfs_bio *bbio, u64 stripe_len,
				2233	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2234	unsigned long *dbitmap, int stripe_nsectors)
				2235	{
				2236	struct btrfs_raid_bio *rbio;
				2237	int i;
				2238
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2239	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2240	if (IS_ERR(rbio))
				2241	return NULL;
				2242	bio_list_add(&rbio->bio_list, bio);
				2243	/*
				2244	* This is a special bio which is used to hold the completion handler
				2245	* and make the scrub rbio is similar to the other types
				2246	*/
				2247	ASSERT(!bio->bi_iter.bi_size);
				2248	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2249
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2250	/*
				2251	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2252	* to the end position, so this search can start from the first parity
				2253	* stripe.
				2254	*/
				2255	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2256	if (bbio->stripes[i].dev == scrub_dev) {
				2257	rbio->scrubp = i;
				2258	break;
				2259	}
				2260	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2261	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2262
				2263	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2264	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2265	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2266	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2267
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2268	/*
				2269	* We have already increased bio_counter when getting bbio, record it
				2270	* so we can free it at rbio_orig_end_io().
				2271	*/
				2272	rbio->generic_bio_cnt = 1;
				2273
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2274	return rbio;
				2275	}
				2276
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2277	/* Used for both parity scrub and missing. */
				2278	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2279	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2280	{
				2281	int stripe_offset;
				2282	int index;
				2283
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2284	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2285	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2286	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2287	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2288	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2289	rbio->bio_pages[index] = page;
				2290	}
				2291
				2292	/*
				2293	* We just scrub the parity that we have correct data on the same horizontal,
				2294	* so we needn't allocate all pages for all the stripes.
				2295	*/
				2296	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2297	{
				2298	int i;
				2299	int bit;
				2300	int index;
				2301	struct page *page;
				2302
				2303	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2304	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2305	index = i * rbio->stripe_npages + bit;
				2306	if (rbio->stripe_pages[index])
				2307	continue;
				2308
				2309	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2310	if (!page)
				2311	return -ENOMEM;
				2312	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2313	}
				2314	}
				2315	return 0;
				2316	}
				2317
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2318	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2319	int need_check)
				2320	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2321	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	2322	void **pointers = rbio->finish_pointers;
				2323	unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2324	int nr_data = rbio->nr_data;
				2325	int stripe;
				2326	int pagenr;
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2327	bool has_qstripe;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2328	struct page *p_page = NULL;
				2329	struct page *q_page = NULL;
				2330	struct bio_list bio_list;
				2331	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2332	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2333	int ret;
				2334
				2335	bio_list_init(&bio_list);
				2336
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2337	if (rbio->real_stripes - rbio->nr_data == 1)
				2338	has_qstripe = false;
				2339	else if (rbio->real_stripes - rbio->nr_data == 2)
				2340	has_qstripe = true;
				2341	else
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2342	BUG();
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2343
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2344	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2345	is_replace = 1;
				2346	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2347	}
				2348
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2349	/*
				2350	* Because the higher layers(scrubber) are unlikely to
				2351	* use this area of the disk again soon, so don't cache
				2352	* it.
				2353	*/
				2354	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2355
				2356	if (!need_check)
				2357	goto writeback;
				2358
				2359	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2360	if (!p_page)
				2361	goto cleanup;
				2362	SetPageUptodate(p_page);
				2363
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2364	if (has_qstripe) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2365	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2366	if (!q_page) {
				2367	__free_page(p_page);
				2368	goto cleanup;
				2369	}
				2370	SetPageUptodate(q_page);
				2371	}
				2372
				2373	atomic_set(&rbio->error, 0);
				2374
				2375	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2376	struct page *p;
				2377	void *parity;
				2378	/* first collect one page from each data stripe */
				2379	for (stripe = 0; stripe < nr_data; stripe++) {
				2380	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2381	pointers[stripe] = kmap(p);
				2382	}
				2383
				2384	/* then add the parity stripe */
				2385	pointers[stripe++] = kmap(p_page);
				2386
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2387	if (has_qstripe) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2388	/*
				2389	* raid6, add the qstripe and call the
				2390	* library function to fill in our p/q
				2391	*/
				2392	pointers[stripe++] = kmap(q_page);
				2393
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2394	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2395	pointers);
				2396	} else {
				2397	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2398	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2399	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2400	}
				2401
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2402	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2403	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2404	parity = kmap(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2405	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2406	copy_page(parity, pointers[rbio->scrubp]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2407	else
				2408	/* Parity is right, needn't writeback */
				2409	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2410	kunmap(p);
				2411
Andrea Righi	3897b6f	2019-03-14 08:56:28 +0100	[diff] [blame]	2412	for (stripe = 0; stripe < nr_data; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2413	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
Andrea Righi	3897b6f	2019-03-14 08:56:28 +0100	[diff] [blame]	2414	kunmap(p_page);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2415	}
				2416
				2417	__free_page(p_page);
				2418	if (q_page)
				2419	__free_page(q_page);
				2420
				2421	writeback:
				2422	/*
				2423	* time to start writing. Make bios for everything from the
				2424	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2425	* everything else.
				2426	*/
				2427	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2428	struct page *page;
				2429
				2430	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2431	ret = rbio_add_io_page(rbio, &bio_list,
				2432	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2433	if (ret)
				2434	goto cleanup;
				2435	}
				2436
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2437	if (!is_replace)
				2438	goto submit_write;
				2439
				2440	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2441	struct page *page;
				2442
				2443	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2444	ret = rbio_add_io_page(rbio, &bio_list, page,
				2445	bbio->tgtdev_map[rbio->scrubp],
				2446	pagenr, rbio->stripe_len);
				2447	if (ret)
				2448	goto cleanup;
				2449	}
				2450
				2451	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2452	nr_data = bio_list_size(&bio_list);
				2453	if (!nr_data) {
				2454	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2455	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2456	return;
				2457	}
				2458
				2459	atomic_set(&rbio->stripes_pending, nr_data);
				2460
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2461	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2462	bio->bi_private = rbio;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	2463	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2464	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2465
				2466	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2467	}
				2468	return;
				2469
				2470	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2471	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2472
				2473	while ((bio = bio_list_pop(&bio_list)))
				2474	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2475	}
				2476
				2477	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2478	{
				2479	if (stripe >= 0 && stripe < rbio->nr_data)
				2480	return 1;
				2481	return 0;
				2482	}
				2483
				2484	/*
				2485	* While we're doing the parity check and repair, we could have errors
				2486	* in reading pages off the disk. This checks for errors and if we're
				2487	* not able to read the page it'll trigger parity reconstruction. The
				2488	* parity scrub will be finished after we've reconstructed the failed
				2489	* stripes
				2490	*/
				2491	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2492	{
				2493	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2494	goto cleanup;
				2495
				2496	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2497	int dfail = 0, failp = -1;
				2498
				2499	if (is_data_stripe(rbio, rbio->faila))
				2500	dfail++;
				2501	else if (is_parity_stripe(rbio->faila))
				2502	failp = rbio->faila;
				2503
				2504	if (is_data_stripe(rbio, rbio->failb))
				2505	dfail++;
				2506	else if (is_parity_stripe(rbio->failb))
				2507	failp = rbio->failb;
				2508
				2509	/*
				2510	* Because we can not use a scrubbing parity to repair
				2511	* the data, so the capability of the repair is declined.
				2512	* (In the case of RAID5, we can not repair anything)
				2513	*/
				2514	if (dfail > rbio->bbio->max_errors - 1)
				2515	goto cleanup;
				2516
				2517	/*
				2518	* If all data is good, only parity is correctly, just
				2519	* repair the parity.
				2520	*/
				2521	if (dfail == 0) {
				2522	finish_parity_scrub(rbio, 0);
				2523	return;
				2524	}
				2525
				2526	/*
				2527	* Here means we got one corrupted data stripe and one
				2528	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2529	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2530	* the data, or we can not repair the data stripe.
				2531	*/
				2532	if (failp != rbio->scrubp)
				2533	goto cleanup;
				2534
				2535	__raid_recover_end_io(rbio);
				2536	} else {
				2537	finish_parity_scrub(rbio, 1);
				2538	}
				2539	return;
				2540
				2541	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2542	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2543	}
				2544
				2545	/*
				2546	* end io for the read phase of the rmw cycle. All the bios here are physical
				2547	* stripe bios we've read from the disk so we can recalculate the parity of the
				2548	* stripe.
				2549	*
				2550	* This will usually kick off finish_rmw once all the bios are read in, but it
				2551	* may trigger parity reconstruction if we had any errors along the way
				2552	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2553	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2554	{
				2555	struct btrfs_raid_bio *rbio = bio->bi_private;
				2556
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2557	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2558	fail_bio_stripe(rbio, bio);
				2559	else
				2560	set_bio_pages_uptodate(bio);
				2561
				2562	bio_put(bio);
				2563
				2564	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2565	return;
				2566
				2567	/*
				2568	* this will normally call finish_rmw to start our write
				2569	* but if there are any failed stripes we'll reconstruct
				2570	* from parity first
				2571	*/
				2572	validate_rbio_for_parity_scrub(rbio);
				2573	}
				2574
				2575	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2576	{
				2577	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2578	struct bio_list bio_list;
				2579	int ret;
				2580	int pagenr;
				2581	int stripe;
				2582	struct bio *bio;
				2583
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2584	bio_list_init(&bio_list);
				2585
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2586	ret = alloc_rbio_essential_pages(rbio);
				2587	if (ret)
				2588	goto cleanup;
				2589
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2590	atomic_set(&rbio->error, 0);
				2591	/*
				2592	* build a list of bios to read all the missing parts of this
				2593	* stripe
				2594	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2595	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2596	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2597	struct page *page;
				2598	/*
				2599	* we want to find all the pages missing from
				2600	* the rbio and read them from the disk. If
				2601	* page_in_rbio finds a page in the bio list
				2602	* we don't need to read it off the stripe.
				2603	*/
				2604	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2605	if (page)
				2606	continue;
				2607
				2608	page = rbio_stripe_page(rbio, stripe, pagenr);
				2609	/*
				2610	* the bio cache may have handed us an uptodate
				2611	* page. If so, be happy and use it
				2612	*/
				2613	if (PageUptodate(page))
				2614	continue;
				2615
				2616	ret = rbio_add_io_page(rbio, &bio_list, page,
				2617	stripe, pagenr, rbio->stripe_len);
				2618	if (ret)
				2619	goto cleanup;
				2620	}
				2621	}
				2622
				2623	bios_to_read = bio_list_size(&bio_list);
				2624	if (!bios_to_read) {
				2625	/*
				2626	* this can happen if others have merged with
				2627	* us, it means there is nothing left to read.
				2628	* But if there are missing devices it may not be
				2629	* safe to do the full stripe write yet.
				2630	*/
				2631	goto finish;
				2632	}
				2633
				2634	/*
				2635	* the bbio may be freed once we submit the last bio. Make sure
				2636	* not to touch it after that
				2637	*/
				2638	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2639	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2640	bio->bi_private = rbio;
				2641	bio->bi_end_io = raid56_parity_scrub_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2642	bio->bi_opf = REQ_OP_READ;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2643
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2644	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2645
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2646	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2647	}
				2648	/* the actual write will happen once the reads are done */
				2649	return;
				2650
				2651	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2652	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2653
				2654	while ((bio = bio_list_pop(&bio_list)))
				2655	bio_put(bio);
				2656
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2657	return;
				2658
				2659	finish:
				2660	validate_rbio_for_parity_scrub(rbio);
				2661	}
				2662
				2663	static void scrub_parity_work(struct btrfs_work *work)
				2664	{
				2665	struct btrfs_raid_bio *rbio;
				2666
				2667	rbio = container_of(work, struct btrfs_raid_bio, work);
				2668	raid56_parity_scrub_stripe(rbio);
				2669	}
				2670
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2671	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2672	{
				2673	if (!lock_stripe_add(rbio))
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	2674	start_async_work(rbio, scrub_parity_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2675	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2676
				2677	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2678
				2679	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2680	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2681	struct btrfs_bio *bbio, u64 length)
				2682	{
				2683	struct btrfs_raid_bio *rbio;
				2684
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2685	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2686	if (IS_ERR(rbio))
				2687	return NULL;
				2688
				2689	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2690	bio_list_add(&rbio->bio_list, bio);
				2691	/*
				2692	* This is a special bio which is used to hold the completion handler
				2693	* and make the scrub rbio is similar to the other types
				2694	*/
				2695	ASSERT(!bio->bi_iter.bi_size);
				2696
				2697	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2698	if (rbio->faila == -1) {
				2699	BUG();
				2700	kfree(rbio);
				2701	return NULL;
				2702	}
				2703
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2704	/*
				2705	* When we get bbio, we have already increased bio_counter, record it
				2706	* so we can free it at rbio_orig_end_io()
				2707	*/
				2708	rbio->generic_bio_cnt = 1;
				2709
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2710	return rbio;
				2711	}
				2712
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2713	void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
				2714	{
				2715	if (!lock_stripe_add(rbio))
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	2716	start_async_work(rbio, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2717	}