Blame - fs/btrfs/raid56.c - SHIFTPHONES/kernel/shift/mainline

blob: fcfc20de2df395bfd70aa5541ad34689e72f8deb [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 02111-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	34	#include <linux/mm.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	35	#include <asm/div64.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	61	#define RBIO_CACHE_SIZE 1024
				62
/*
 * The operation a raid bio carries out.  Stored in
 * btrfs_raid_bio::operation; rbios with different operations are never
 * merged together.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
				69
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	70	struct btrfs_raid_bio {
				71	struct btrfs_fs_info *fs_info;
				72	struct btrfs_bio *bbio;
				73
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	74	/* while we're doing rmw on a stripe
				75	* we put it into a hash table so we can
				76	* lock the stripe and merge more rbios
				77	* into it.
				78	*/
				79	struct list_head hash_list;
				80
				81	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	82	* LRU list for the stripe cache
				83	*/
				84	struct list_head stripe_cache;
				85
				86	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	87	* for scheduling work in the helper threads
				88	*/
				89	struct btrfs_work work;
				90
				91	/*
				92	* bio list and bio_list_lock are used
				93	* to add more bios into the stripe
				94	* in hopes of avoiding the full rmw
				95	*/
				96	struct bio_list bio_list;
				97	spinlock_t bio_list_lock;
				98
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	99	/* also protected by the bio_list_lock, the
				100	* plug list is used by the plugging code
				101	* to collect partial bios while plugged. The
				102	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	103	* the stripe lock to the next pending IO
				104	*/
				105	struct list_head plug_list;
				106
				107	/*
				108	* flags that tell us if it is safe to
				109	* merge with this bio
				110	*/
				111	unsigned long flags;
				112
				113	/* size of each individual stripe on disk */
				114	int stripe_len;
				115
				116	/* number of data stripes (no p/q) */
				117	int nr_data;
				118
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	119	int real_stripes;
				120
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	121	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	122	/*
				123	* set if we're doing a parity rebuild
				124	* for a read from higher up, which is handled
				125	* differently from a parity rebuild as part of
				126	* rmw
				127	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	128	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	129
				130	/* first bad stripe */
				131	int faila;
				132
				133	/* second bad stripe (for raid6 use) */
				134	int failb;
				135
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	136	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	137	/*
				138	* number of pages needed to represent the full
				139	* stripe
				140	*/
				141	int nr_pages;
				142
				143	/*
				144	* size of all the bios in the bio_list. This
				145	* helps us decide if the rbio maps to a full
				146	* stripe or not
				147	*/
				148	int bio_list_bytes;
				149
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	150	int generic_bio_cnt;
				151
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	152	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	153
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	154	atomic_t stripes_pending;
				155
				156	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	157	/*
				158	* these are two arrays of pointers. We allocate the
				159	* rbio big enough to hold them both and setup their
				160	* locations when the rbio is allocated
				161	*/
				162
				163	/* pointers to pages that we allocated for
				164	* reading/writing stripes directly from the disk (including P/Q)
				165	*/
				166	struct page **stripe_pages;
				167
				168	/*
				169	* pointers to the pages in the bio_list. Stored
				170	* here for faster lookup
				171	*/
				172	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	173
				174	/*
				175	* bitmap to record which horizontal stripe has data
				176	*/
				177	unsigned long *dbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	178	};
				179
				180	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				181	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				182	static void rmw_work(struct btrfs_work *work);
				183	static void read_rebuild_work(struct btrfs_work *work);
				184	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				185	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				186	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				187	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				188	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				189	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				190	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				191
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	192	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				193	int need_check);
				194	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				195
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	196	/*
				197	* the stripe hash table is used for locking, and to collect
				198	* bios in hopes of making a full stripe
				199	*/
				200	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				201	{
				202	struct btrfs_stripe_hash_table *table;
				203	struct btrfs_stripe_hash_table *x;
				204	struct btrfs_stripe_hash *cur;
				205	struct btrfs_stripe_hash *h;
				206	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				207	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	208	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	209
				210	if (info->stripe_hash_table)
				211	return 0;
				212
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	213	/*
				214	* The table is large, starting with order 4 and can go as high as
				215	* order 7 in case lock debugging is turned on.
				216	*
				217	* Try harder to allocate and fallback to vmalloc to lower the chance
				218	* of a failing mount.
				219	*/
				220	table_size = sizeof(table) + sizeof(h) * num_entries;
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	221	table = kvzalloc(table_size, GFP_KERNEL);
				222	if (!table)
				223	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	224
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	225	spin_lock_init(&table->cache_lock);
				226	INIT_LIST_HEAD(&table->stripe_cache);
				227
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	228	h = table->table;
				229
				230	for (i = 0; i < num_entries; i++) {
				231	cur = h + i;
				232	INIT_LIST_HEAD(&cur->hash_list);
				233	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	234	}
				235
				236	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	237	if (x)
				238	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	239	return 0;
				240	}
				241
				242	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	243	* caching an rbio means to copy anything from the
				244	* bio_pages array into the stripe_pages array. We
				245	* use the page uptodate bit in the stripe cache array
				246	* to indicate if it has valid data
				247	*
				248	* once the caching is done, we set the cache ready
				249	* bit.
				250	*/
				251	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				252	{
				253	int i;
				254	char *s;
				255	char *d;
				256	int ret;
				257
				258	ret = alloc_rbio_pages(rbio);
				259	if (ret)
				260	return;
				261
				262	for (i = 0; i < rbio->nr_pages; i++) {
				263	if (!rbio->bio_pages[i])
				264	continue;
				265
				266	s = kmap(rbio->bio_pages[i]);
				267	d = kmap(rbio->stripe_pages[i]);
				268
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	269	memcpy(d, s, PAGE_SIZE);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	270
				271	kunmap(rbio->bio_pages[i]);
				272	kunmap(rbio->stripe_pages[i]);
				273	SetPageUptodate(rbio->stripe_pages[i]);
				274	}
				275	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				276	}
				277
				278	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	279	* we hash on the first logical address of the stripe
				280	*/
				281	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				282	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	283	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	284
				285	/*
				286	* we shift down quite a bit. We're using byte
				287	* addressing, and most of the lower bits are zeros.
				288	* This tends to upset hash_64, and it consistently
				289	* returns just one or two different values.
				290	*
				291	* shifting off the lower bits fixes things.
				292	*/
				293	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				294	}
				295
				296	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	297	* stealing an rbio means taking all the uptodate pages from the stripe
				298	* array in the source rbio and putting them into the destination rbio
				299	*/
				300	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				301	{
				302	int i;
				303	struct page *s;
				304	struct page *d;
				305
				306	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				307	return;
				308
				309	for (i = 0; i < dest->nr_pages; i++) {
				310	s = src->stripe_pages[i];
				311	if (!s \|\| !PageUptodate(s)) {
				312	continue;
				313	}
				314
				315	d = dest->stripe_pages[i];
				316	if (d)
				317	__free_page(d);
				318
				319	dest->stripe_pages[i] = s;
				320	src->stripe_pages[i] = NULL;
				321	}
				322	}
				323
				324	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	325	* merging means we take the bio_list from the victim and
				326	* splice it into the destination. The victim should
				327	* be discarded afterwards.
				328	*
				329	* must be called with dest->rbio_list_lock held
				330	*/
				331	static void merge_rbio(struct btrfs_raid_bio *dest,
				332	struct btrfs_raid_bio *victim)
				333	{
				334	bio_list_merge(&dest->bio_list, &victim->bio_list);
				335	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	336	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	337	bio_list_init(&victim->bio_list);
				338	}
				339
				340	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	341	* used to prune items that are in the cache. The caller
				342	* must hold the hash table lock.
				343	*/
				344	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				345	{
				346	int bucket = rbio_bucket(rbio);
				347	struct btrfs_stripe_hash_table *table;
				348	struct btrfs_stripe_hash *h;
				349	int freeit = 0;
				350
				351	/*
				352	* check the bit again under the hash table lock.
				353	*/
				354	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				355	return;
				356
				357	table = rbio->fs_info->stripe_hash_table;
				358	h = table->table + bucket;
				359
				360	/* hold the lock for the bucket because we may be
				361	* removing it from the hash table
				362	*/
				363	spin_lock(&h->lock);
				364
				365	/*
				366	* hold the lock for the bio list because we need
				367	* to make sure the bio list is empty
				368	*/
				369	spin_lock(&rbio->bio_list_lock);
				370
				371	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				372	list_del_init(&rbio->stripe_cache);
				373	table->cache_size -= 1;
				374	freeit = 1;
				375
				376	/* if the bio list isn't empty, this rbio is
				377	* still involved in an IO. We take it out
				378	* of the cache list, and drop the ref that
				379	* was held for the list.
				380	*
				381	* If the bio_list was empty, we also remove
				382	* the rbio from the hash_table, and drop
				383	* the corresponding ref
				384	*/
				385	if (bio_list_empty(&rbio->bio_list)) {
				386	if (!list_empty(&rbio->hash_list)) {
				387	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	388	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	389	BUG_ON(!list_empty(&rbio->plug_list));
				390	}
				391	}
				392	}
				393
				394	spin_unlock(&rbio->bio_list_lock);
				395	spin_unlock(&h->lock);
				396
				397	if (freeit)
				398	__free_raid_bio(rbio);
				399	}
				400
				401	/*
				402	* prune a given rbio from the cache
				403	*/
				404	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				405	{
				406	struct btrfs_stripe_hash_table *table;
				407	unsigned long flags;
				408
				409	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				410	return;
				411
				412	table = rbio->fs_info->stripe_hash_table;
				413
				414	spin_lock_irqsave(&table->cache_lock, flags);
				415	__remove_rbio_from_cache(rbio);
				416	spin_unlock_irqrestore(&table->cache_lock, flags);
				417	}
				418
				419	/*
				420	* remove everything in the cache
				421	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	422	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	423	{
				424	struct btrfs_stripe_hash_table *table;
				425	unsigned long flags;
				426	struct btrfs_raid_bio *rbio;
				427
				428	table = info->stripe_hash_table;
				429
				430	spin_lock_irqsave(&table->cache_lock, flags);
				431	while (!list_empty(&table->stripe_cache)) {
				432	rbio = list_entry(table->stripe_cache.next,
				433	struct btrfs_raid_bio,
				434	stripe_cache);
				435	__remove_rbio_from_cache(rbio);
				436	}
				437	spin_unlock_irqrestore(&table->cache_lock, flags);
				438	}
				439
				440	/*
				441	* remove all cached entries and free the hash table
				442	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	443	*/
				444	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				445	{
				446	if (!info->stripe_hash_table)
				447	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	448	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	449	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	450	info->stripe_hash_table = NULL;
				451	}
				452
				453	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	454	* insert an rbio into the stripe cache. It
				455	* must have already been prepared by calling
				456	* cache_rbio_pages
				457	*
				458	* If this rbio was already cached, it gets
				459	* moved to the front of the lru.
				460	*
				461	* If the size of the rbio cache is too big, we
				462	* prune an item.
				463	*/
				464	static void cache_rbio(struct btrfs_raid_bio *rbio)
				465	{
				466	struct btrfs_stripe_hash_table *table;
				467	unsigned long flags;
				468
				469	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				470	return;
				471
				472	table = rbio->fs_info->stripe_hash_table;
				473
				474	spin_lock_irqsave(&table->cache_lock, flags);
				475	spin_lock(&rbio->bio_list_lock);
				476
				477	/* bump our ref if we were not in the list before */
				478	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	479	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	480
				481	if (!list_empty(&rbio->stripe_cache)){
				482	list_move(&rbio->stripe_cache, &table->stripe_cache);
				483	} else {
				484	list_add(&rbio->stripe_cache, &table->stripe_cache);
				485	table->cache_size += 1;
				486	}
				487
				488	spin_unlock(&rbio->bio_list_lock);
				489
				490	if (table->cache_size > RBIO_CACHE_SIZE) {
				491	struct btrfs_raid_bio *found;
				492
				493	found = list_entry(table->stripe_cache.prev,
				494	struct btrfs_raid_bio,
				495	stripe_cache);
				496
				497	if (found != rbio)
				498	__remove_rbio_from_cache(found);
				499	}
				500
				501	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	502	}
				503
				504	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	505	* helper function to run the xor_blocks api. It is only
				506	* able to do MAX_XOR_BLOCKS at a time, so we need to
				507	* loop through.
				508	*/
				509	static void run_xor(void **pages, int src_cnt, ssize_t len)
				510	{
				511	int src_off = 0;
				512	int xor_src_cnt = 0;
				513	void *dest = pages[src_cnt];
				514
				515	while(src_cnt > 0) {
				516	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				517	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				518
				519	src_cnt -= xor_src_cnt;
				520	src_off += xor_src_cnt;
				521	}
				522	}
				523
				524	/*
				525	* returns true if the bio list inside this rbio
				526	* covers an entire stripe (no rmw required).
				527	* Must be called with the bio list lock held, or
				528	* at a time when you know it is impossible to add
				529	* new bios into the list
				530	*/
				531	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				532	{
				533	unsigned long size = rbio->bio_list_bytes;
				534	int ret = 1;
				535
				536	if (size != rbio->nr_data * rbio->stripe_len)
				537	ret = 0;
				538
				539	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				540	return ret;
				541	}
				542
				543	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				544	{
				545	unsigned long flags;
				546	int ret;
				547
				548	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				549	ret = __rbio_is_full(rbio);
				550	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				551	return ret;
				552	}
				553
				554	/*
				555	* returns 1 if it is safe to merge two rbios together.
				556	* The merging is safe if the two rbios correspond to
				557	* the same stripe and if they are both going in the same
				558	* direction (read vs write), and if neither one is
				559	* locked for final IO
				560	*
				561	* The caller is responsible for locking such that
				562	* rmw_locked is safe to test
				563	*/
				564	static int rbio_can_merge(struct btrfs_raid_bio *last,
				565	struct btrfs_raid_bio *cur)
				566	{
				567	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				568	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				569	return 0;
				570
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	571	/*
				572	* we can't merge with cached rbios, since the
				573	* idea is that when we merge the destination
				574	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	575	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	576	* handle that.
				577	*/
				578	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				579	test_bit(RBIO_CACHE_BIT, &cur->flags))
				580	return 0;
				581
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	582	if (last->bbio->raid_map[0] !=
				583	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	584	return 0;
				585
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	586	/* we can't merge with different operations */
				587	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	588	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	589	/*
				590	* We've need read the full stripe from the drive.
				591	* check and repair the parity and write the new results.
				592	*
				593	* We're not allowed to add any new bios to the
				594	* bio list here, anyone else that wants to
				595	* change this stripe needs to do their own rmw.
				596	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	597	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	598	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	599
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	600	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	601	return 0;
				602
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	603	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				604	int fa = last->faila;
				605	int fb = last->failb;
				606	int cur_fa = cur->faila;
				607	int cur_fb = cur->failb;
				608
				609	if (last->faila >= last->failb) {
				610	fa = last->failb;
				611	fb = last->faila;
				612	}
				613
				614	if (cur->faila >= cur->failb) {
				615	cur_fa = cur->failb;
				616	cur_fb = cur->faila;
				617	}
				618
				619	if (fa != cur_fa \|\| fb != cur_fb)
				620	return 0;
				621	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	622	return 1;
				623	}
				624
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	625	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				626	int index)
				627	{
				628	return stripe * rbio->stripe_npages + index;
				629	}
				630
				631	/*
				632	* these are just the pages from the rbio array, not from anything
				633	* the FS sent down to us
				634	*/
				635	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				636	int index)
				637	{
				638	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				639	}
				640
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	641	/*
				642	* helper to index into the pstripe
				643	*/
				644	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				645	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	646	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	647	}
				648
				649	/*
				650	* helper to index into the qstripe, returns null
				651	* if there is no qstripe
				652	*/
				653	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				654	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	655	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	656	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	657	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	658	}
				659
				660	/*
				661	* The first stripe in the table for a logical address
				662	* has the lock. rbios are added in one of three ways:
				663	*
				664	* 1) Nobody has the stripe locked yet. The rbio is given
				665	* the lock and 0 is returned. The caller must start the IO
				666	* themselves.
				667	*
				668	* 2) Someone has the stripe locked, but we're able to merge
				669	* with the lock owner. The rbio is freed and the IO will
				670	* start automatically along with the existing rbio. 1 is returned.
				671	*
				672	* 3) Someone has the stripe locked, but we're not able to merge.
				673	* The rbio is added to the lock owner's plug list, or merged into
				674	* an rbio already on the plug list. When the lock owner unlocks,
				675	* the next rbio on the list is run and the IO is started automatically.
				676	* 1 is returned
				677	*
				678	* If we return 0, the caller still owns the rbio and must continue with
				679	* IO submission. If we return 1, the caller must assume the rbio has
				680	* already been freed.
				681	*/
				682	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				683	{
				684	int bucket = rbio_bucket(rbio);
				685	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				686	struct btrfs_raid_bio *cur;
				687	struct btrfs_raid_bio *pending;
				688	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	689	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	690	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	691	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	692
				693	spin_lock_irqsave(&h->lock, flags);
				694	list_for_each_entry(cur, &h->hash_list, hash_list) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	695	if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	696	spin_lock(&cur->bio_list_lock);
				697
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	698	/* can we steal this cached rbio's pages? */
				699	if (bio_list_empty(&cur->bio_list) &&
				700	list_empty(&cur->plug_list) &&
				701	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				702	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				703	list_del_init(&cur->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	704	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	705
				706	steal_rbio(cur, rbio);
				707	cache_drop = cur;
				708	spin_unlock(&cur->bio_list_lock);
				709
				710	goto lockit;
				711	}
				712
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	713	/* can we merge into the lock owner? */
				714	if (rbio_can_merge(cur, rbio)) {
				715	merge_rbio(cur, rbio);
				716	spin_unlock(&cur->bio_list_lock);
				717	freeit = rbio;
				718	ret = 1;
				719	goto out;
				720	}
				721
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	722
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	723	/*
				724	* we couldn't merge with the running
				725	* rbio, see if we can merge with the
				726	* pending ones. We don't have to
				727	* check for rmw_locked because there
				728	* is no way they are inside finish_rmw
				729	* right now
				730	*/
				731	list_for_each_entry(pending, &cur->plug_list,
				732	plug_list) {
				733	if (rbio_can_merge(pending, rbio)) {
				734	merge_rbio(pending, rbio);
				735	spin_unlock(&cur->bio_list_lock);
				736	freeit = rbio;
				737	ret = 1;
				738	goto out;
				739	}
				740	}
				741
				742	/* no merging, put us on the tail of the plug list,
				743	* our rbio will be started with the currently
				744	* running rbio unlocks
				745	*/
				746	list_add_tail(&rbio->plug_list, &cur->plug_list);
				747	spin_unlock(&cur->bio_list_lock);
				748	ret = 1;
				749	goto out;
				750	}
				751	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	752	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	753	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	754	list_add(&rbio->hash_list, &h->hash_list);
				755	out:
				756	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	757	if (cache_drop)
				758	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	759	if (freeit)
				760	__free_raid_bio(freeit);
				761	return ret;
				762	}
				763
				764	/*
				765	* called as rmw or parity rebuild is completed. If the plug list has more
				766	* rbios waiting for this stripe, the next one on the list will be started
				767	*/
				768	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				769	{
				770	int bucket;
				771	struct btrfs_stripe_hash *h;
				772	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	773	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	774
				775	bucket = rbio_bucket(rbio);
				776	h = rbio->fs_info->stripe_hash_table->table + bucket;
				777
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	778	if (list_empty(&rbio->plug_list))
				779	cache_rbio(rbio);
				780
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	781	spin_lock_irqsave(&h->lock, flags);
				782	spin_lock(&rbio->bio_list_lock);
				783
				784	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	785	/*
				786	* if we're still cached and there is no other IO
				787	* to perform, just leave this rbio here for others
				788	* to steal from later
				789	*/
				790	if (list_empty(&rbio->plug_list) &&
				791	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				792	keep_cache = 1;
				793	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				794	BUG_ON(!bio_list_empty(&rbio->bio_list));
				795	goto done;
				796	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	797
				798	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	799	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	800
				801	/*
				802	* we use the plug list to hold all the rbios
				803	* waiting for the chance to lock this stripe.
				804	* hand the lock over to one of them.
				805	*/
				806	if (!list_empty(&rbio->plug_list)) {
				807	struct btrfs_raid_bio *next;
				808	struct list_head *head = rbio->plug_list.next;
				809
				810	next = list_entry(head, struct btrfs_raid_bio,
				811	plug_list);
				812
				813	list_del_init(&rbio->plug_list);
				814
				815	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	816	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	817	spin_unlock(&rbio->bio_list_lock);
				818	spin_unlock_irqrestore(&h->lock, flags);
				819
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	820	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821	async_read_rebuild(next);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	822	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				823	steal_rbio(rbio, next);
				824	async_read_rebuild(next);
				825	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	826	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	827	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	828	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				829	steal_rbio(rbio, next);
				830	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	831	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	832
				833	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	834	}
				835	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	836	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	837	spin_unlock(&rbio->bio_list_lock);
				838	spin_unlock_irqrestore(&h->lock, flags);
				839
				840	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	841	if (!keep_cache)
				842	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	843	}
				844
				845	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				846	{
				847	int i;
				848
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	849	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	850	return;
				851
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	852	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	853	WARN_ON(!list_empty(&rbio->hash_list));
				854	WARN_ON(!bio_list_empty(&rbio->bio_list));
				855
				856	for (i = 0; i < rbio->nr_pages; i++) {
				857	if (rbio->stripe_pages[i]) {
				858	__free_page(rbio->stripe_pages[i]);
				859	rbio->stripe_pages[i] = NULL;
				860	}
				861	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	862
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	863	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	864	kfree(rbio);
				865	}
				866
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	867	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	868	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	869	struct bio *next;
				870
				871	while (cur) {
				872	next = cur->bi_next;
				873	cur->bi_next = NULL;
				874	cur->bi_status = err;
				875	bio_endio(cur);
				876	cur = next;
				877	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	878	}
				879
				880	/*
				881	* this frees the rbio and runs through all the bios in the
				882	* bio_list and calls end_io on them
				883	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	884	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	885	{
				886	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	887	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	888
				889	if (rbio->generic_bio_cnt)
				890	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				891
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	892	/*
				893	* At this moment, rbio->bio_list is empty, however since rbio does not
				894	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				895	* hash list, rbio may be merged with others so that rbio->bio_list
				896	* becomes non-empty.
				897	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				898	* more and we can call bio_endio() on all queued bios.
				899	*/
				900	unlock_stripe(rbio);
				901	extra = bio_list_get(&rbio->bio_list);
				902	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	903
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	904	rbio_endio_bio_list(cur, err);
				905	if (extra)
				906	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	907	}
				908
				909	/*
				910	* end io function used by finish_rmw. When we finally
				911	* get here, we've written a full stripe
				912	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	913	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	914	{
				915	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	916	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	917	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	918
				919	if (err)
				920	fail_bio_stripe(rbio, bio);
				921
				922	bio_put(bio);
				923
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	924	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	925	return;
				926
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	927	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	928
				929	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	930	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				931	0 : rbio->bbio->max_errors;
				932	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	933	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	934
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	935	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	936	}
				937
				938	/*
				939	* the read/modify/write code wants to use the original bio for
				940	* any pages it included, and then use the rbio for everything
				941	* else. This function decides if a given index (stripe number)
				942	* and page number in that stripe fall inside the original bio
				943	* or the rbio.
				944	*
				945	* if you set bio_list_only, you'll get a NULL back for any ranges
				946	* that are outside the bio_list
				947	*
				948	* This doesn't take any refs on anything, you get a bare page pointer
				949	* and the caller must bump refs as required.
				950	*
				951	* You must call index_rbio_pages once before you can trust
				952	* the answers from this function.
				953	*/
				954	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				955	int index, int pagenr, int bio_list_only)
				956	{
				957	int chunk_page;
				958	struct page *p = NULL;
				959
				960	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				961
				962	spin_lock_irq(&rbio->bio_list_lock);
				963	p = rbio->bio_pages[chunk_page];
				964	spin_unlock_irq(&rbio->bio_list_lock);
				965
				966	if (p \|\| bio_list_only)
				967	return p;
				968
				969	return rbio->stripe_pages[chunk_page];
				970	}
				971
				972	/*
				973	* number of pages we need for the entire stripe across all the
				974	* drives
				975	*/
				976	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				977	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	978	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	979	}
				980
				981	/*
				982	* allocation and initial setup for the btrfs_raid_bio. Not
				983	* this does not allocate any pages for rbio->pages.
				984	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	985	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				986	struct btrfs_bio *bbio,
				987	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	988	{
				989	struct btrfs_raid_bio *rbio;
				990	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	991	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				992	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	993	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	994	void *p;
				995
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	996	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2 +
Zhao Lei	bfca9a6	2014-12-08 19:55:57 +0800	[diff] [blame]	997	DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
				998	sizeof(long), GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	999	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1000	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1001
				1002	bio_list_init(&rbio->bio_list);
				1003	INIT_LIST_HEAD(&rbio->plug_list);
				1004	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1005	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1006	INIT_LIST_HEAD(&rbio->hash_list);
				1007	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1008	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1009	rbio->stripe_len = stripe_len;
				1010	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1011	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1012	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1013	rbio->faila = -1;
				1014	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	1015	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1016	atomic_set(&rbio->error, 0);
				1017	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1018
				1019	/*
				1020	* the stripe_pages and bio_pages array point to the extra
				1021	* memory we allocated past the end of the rbio
				1022	*/
				1023	p = rbio + 1;
				1024	rbio->stripe_pages = p;
				1025	rbio->bio_pages = p + sizeof(struct page ) num_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1026	rbio->dbitmap = p + sizeof(struct page ) num_pages * 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1027
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1028	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1029	nr_data = real_stripes - 1;
				1030	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1031	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1032	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1033	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1034
				1035	rbio->nr_data = nr_data;
				1036	return rbio;
				1037	}
				1038
				1039	/* allocate pages for all the stripes in the bio, including parity */
				1040	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1041	{
				1042	int i;
				1043	struct page *page;
				1044
				1045	for (i = 0; i < rbio->nr_pages; i++) {
				1046	if (rbio->stripe_pages[i])
				1047	continue;
				1048	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1049	if (!page)
				1050	return -ENOMEM;
				1051	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1052	}
				1053	return 0;
				1054	}
				1055
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1056	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1057	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1058	{
				1059	int i;
				1060	struct page *page;
				1061
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1062	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1063
				1064	for (; i < rbio->nr_pages; i++) {
				1065	if (rbio->stripe_pages[i])
				1066	continue;
				1067	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1068	if (!page)
				1069	return -ENOMEM;
				1070	rbio->stripe_pages[i] = page;
				1071	}
				1072	return 0;
				1073	}
				1074
				1075	/*
				1076	* add a single page from a specific stripe into our list of bios for IO
				1077	* this will try to merge into existing bios if possible, and returns
				1078	* zero if all went well.
				1079	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1080	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1081	struct bio_list *bio_list,
				1082	struct page *page,
				1083	int stripe_nr,
				1084	unsigned long page_index,
				1085	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1086	{
				1087	struct bio *last = bio_list->tail;
				1088	u64 last_end = 0;
				1089	int ret;
				1090	struct bio *bio;
				1091	struct btrfs_bio_stripe *stripe;
				1092	u64 disk_start;
				1093
				1094	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1095	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1096
				1097	/* if the device is missing, just fail this stripe */
				1098	if (!stripe->dev->bdev)
				1099	return fail_rbio_index(rbio, stripe_nr);
				1100
				1101	/* see if we can add this page onto our existing bio */
				1102	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1103	last_end = (u64)last->bi_iter.bi_sector << 9;
				1104	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1105
				1106	/*
				1107	* we can't merge these if they are from different
				1108	* devices or if they are not contiguous
				1109	*/
				1110	if (last_end == disk_start && stripe->dev->bdev &&
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1111	!last->bi_status &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1112	last->bi_disk == stripe->dev->bdev->bd_disk &&
				1113	last->bi_partno == stripe->dev->bdev->bd_partno) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1114	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1115	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1116	return 0;
				1117	}
				1118	}
				1119
				1120	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1121	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1122	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1123	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1124	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1125
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1126	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1127	bio_list_add(bio_list, bio);
				1128	return 0;
				1129	}
				1130
				1131	/*
				1132	* while we're doing the read/modify/write cycle, we could
				1133	* have errors in reading pages off the disk. This checks
				1134	* for errors and if we're not able to read the page it'll
				1135	* trigger parity reconstruction. The rmw will be finished
				1136	* after we've reconstructed the failed stripes
				1137	*/
				1138	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1139	{
				1140	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1141	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1142	__raid56_parity_recover(rbio);
				1143	} else {
				1144	finish_rmw(rbio);
				1145	}
				1146	}
				1147
				1148	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1149	* helper function to walk our bio list and populate the bio_pages array with
				1150	* the result. This seems expensive, but it is faster than constantly
				1151	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1152	* reconstruction.
				1153	*
				1154	* This must be called before you trust the answers from page_in_rbio
				1155	*/
				1156	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1157	{
				1158	struct bio *bio;
				1159	u64 start;
				1160	unsigned long stripe_offset;
				1161	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1162
				1163	spin_lock_irq(&rbio->bio_list_lock);
				1164	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1165	struct bio_vec bvec;
				1166	struct bvec_iter iter;
				1167	int i = 0;
				1168
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1169	start = (u64)bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1170	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1171	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1172
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1173	if (bio_flagged(bio, BIO_CLONED))
				1174	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1175
				1176	bio_for_each_segment(bvec, bio, iter) {
				1177	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1178	i++;
				1179	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1180	}
				1181	spin_unlock_irq(&rbio->bio_list_lock);
				1182	}
				1183
				1184	/*
				1185	* this is called from one of two situations. We either
				1186	* have a full stripe from the higher layers, or we've read all
				1187	* the missing bits off disk.
				1188	*
				1189	* This will calculate the parity and then send down any
				1190	* changed blocks.
				1191	*/
				1192	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1193	{
				1194	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1195	void *pointers[rbio->real_stripes];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1196	int nr_data = rbio->nr_data;
				1197	int stripe;
				1198	int pagenr;
				1199	int p_stripe = -1;
				1200	int q_stripe = -1;
				1201	struct bio_list bio_list;
				1202	struct bio *bio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1203	int ret;
				1204
				1205	bio_list_init(&bio_list);
				1206
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1207	if (rbio->real_stripes - rbio->nr_data == 1) {
				1208	p_stripe = rbio->real_stripes - 1;
				1209	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1210	p_stripe = rbio->real_stripes - 2;
				1211	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1212	} else {
				1213	BUG();
				1214	}
				1215
				1216	/* at this point we either have a full stripe,
				1217	* or we've read the full stripe from the drive.
				1218	* recalculate the parity and write the new results.
				1219	*
				1220	* We're not allowed to add any new bios to the
				1221	* bio list here, anyone else that wants to
				1222	* change this stripe needs to do their own rmw.
				1223	*/
				1224	spin_lock_irq(&rbio->bio_list_lock);
				1225	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1226	spin_unlock_irq(&rbio->bio_list_lock);
				1227
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1228	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1229
				1230	/*
				1231	* now that we've set rmw_locked, run through the
				1232	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1233	*
				1234	* We don't cache full rbios because we're assuming
				1235	* the higher layers are unlikely to use this area of
				1236	* the disk again soon. If they do use it again,
				1237	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1238	*/
				1239	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1240	if (!rbio_is_full(rbio))
				1241	cache_rbio_pages(rbio);
				1242	else
				1243	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1244
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1245	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1246	struct page *p;
				1247	/* first collect one page from each data stripe */
				1248	for (stripe = 0; stripe < nr_data; stripe++) {
				1249	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1250	pointers[stripe] = kmap(p);
				1251	}
				1252
				1253	/* then add the parity stripe */
				1254	p = rbio_pstripe_page(rbio, pagenr);
				1255	SetPageUptodate(p);
				1256	pointers[stripe++] = kmap(p);
				1257
				1258	if (q_stripe != -1) {
				1259
				1260	/*
				1261	* raid6, add the qstripe and call the
				1262	* library function to fill in our p/q
				1263	*/
				1264	p = rbio_qstripe_page(rbio, pagenr);
				1265	SetPageUptodate(p);
				1266	pointers[stripe++] = kmap(p);
				1267
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1268	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1269	pointers);
				1270	} else {
				1271	/* raid5 */
				1272	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1273	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1274	}
				1275
				1276
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1277	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1278	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1279	}
				1280
				1281	/*
				1282	* time to start writing. Make bios for everything from the
				1283	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1284	* everything else.
				1285	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1286	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1287	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1288	struct page *page;
				1289	if (stripe < rbio->nr_data) {
				1290	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1291	if (!page)
				1292	continue;
				1293	} else {
				1294	page = rbio_stripe_page(rbio, stripe, pagenr);
				1295	}
				1296
				1297	ret = rbio_add_io_page(rbio, &bio_list,
				1298	page, stripe, pagenr, rbio->stripe_len);
				1299	if (ret)
				1300	goto cleanup;
				1301	}
				1302	}
				1303
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1304	if (likely(!bbio->num_tgtdevs))
				1305	goto write_data;
				1306
				1307	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1308	if (!bbio->tgtdev_map[stripe])
				1309	continue;
				1310
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1311	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1312	struct page *page;
				1313	if (stripe < rbio->nr_data) {
				1314	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1315	if (!page)
				1316	continue;
				1317	} else {
				1318	page = rbio_stripe_page(rbio, stripe, pagenr);
				1319	}
				1320
				1321	ret = rbio_add_io_page(rbio, &bio_list, page,
				1322	rbio->bbio->tgtdev_map[stripe],
				1323	pagenr, rbio->stripe_len);
				1324	if (ret)
				1325	goto cleanup;
				1326	}
				1327	}
				1328
				1329	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1330	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1331	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1332
				1333	while (1) {
				1334	bio = bio_list_pop(&bio_list);
				1335	if (!bio)
				1336	break;
				1337
				1338	bio->bi_private = rbio;
				1339	bio->bi_end_io = raid_write_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	1340	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1341
				1342	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1343	}
				1344	return;
				1345
				1346	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1347	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1348
				1349	while ((bio = bio_list_pop(&bio_list)))
				1350	bio_put(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1351	}
				1352
				1353	/*
				1354	* helper to find the stripe number for a given bio. Used to figure out which
				1355	* stripe has failed. This expects the bio to correspond to a physical disk,
				1356	* so it looks up based on physical sector numbers.
				1357	*/
				1358	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1359	struct bio *bio)
				1360	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1361	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1362	u64 stripe_start;
				1363	int i;
				1364	struct btrfs_bio_stripe *stripe;
				1365
				1366	physical <<= 9;
				1367
				1368	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1369	stripe = &rbio->bbio->stripes[i];
				1370	stripe_start = stripe->physical;
				1371	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1372	physical < stripe_start + rbio->stripe_len &&
Dmitriy Gorokh	047fdea	2018-02-16 19:51:38 +0000	[diff] [blame^]	1373	stripe->dev->bdev &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1374	bio->bi_disk == stripe->dev->bdev->bd_disk &&
				1375	bio->bi_partno == stripe->dev->bdev->bd_partno) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1376	return i;
				1377	}
				1378	}
				1379	return -1;
				1380	}
				1381
				1382	/*
				1383	* helper to find the stripe number for a given
				1384	* bio (before mapping). Used to figure out which stripe has
				1385	* failed. This looks up based on logical block numbers.
				1386	*/
				1387	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1388	struct bio *bio)
				1389	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1390	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1391	u64 stripe_start;
				1392	int i;
				1393
				1394	logical <<= 9;
				1395
				1396	for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1397	stripe_start = rbio->bbio->raid_map[i];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1398	if (logical >= stripe_start &&
				1399	logical < stripe_start + rbio->stripe_len) {
				1400	return i;
				1401	}
				1402	}
				1403	return -1;
				1404	}
				1405
				1406	/*
				1407	* returns -EIO if we had too many failures
				1408	*/
				1409	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1410	{
				1411	unsigned long flags;
				1412	int ret = 0;
				1413
				1414	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1415
				1416	/* we already know this stripe is bad, move on */
				1417	if (rbio->faila == failed \|\| rbio->failb == failed)
				1418	goto out;
				1419
				1420	if (rbio->faila == -1) {
				1421	/* first failure on this rbio */
				1422	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1423	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1424	} else if (rbio->failb == -1) {
				1425	/* second failure on this rbio */
				1426	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1427	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1428	} else {
				1429	ret = -EIO;
				1430	}
				1431	out:
				1432	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1433
				1434	return ret;
				1435	}
				1436
				1437	/*
				1438	* helper to fail a stripe based on a physical disk
				1439	* bio.
				1440	*/
				1441	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1442	struct bio *bio)
				1443	{
				1444	int failed = find_bio_stripe(rbio, bio);
				1445
				1446	if (failed < 0)
				1447	return -EIO;
				1448
				1449	return fail_rbio_index(rbio, failed);
				1450	}
				1451
				1452	/*
				1453	* this sets each page in the bio uptodate. It should only be used on private
				1454	* rbio pages, nothing that comes in from the higher layers
				1455	*/
				1456	static void set_bio_pages_uptodate(struct bio *bio)
				1457	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1458	struct bio_vec *bvec;
				1459	int i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1460
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1461	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1462
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1463	bio_for_each_segment_all(bvec, bio, i)
				1464	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1465	}
				1466
				1467	/*
				1468	* end io for the read phase of the rmw cycle. All the bios here are physical
				1469	* stripe bios we've read from the disk so we can recalculate the parity of the
				1470	* stripe.
				1471	*
				1472	* This will usually kick off finish_rmw once all the bios are read in, but it
				1473	* may trigger parity reconstruction if we had any errors along the way
				1474	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1475	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1476	{
				1477	struct btrfs_raid_bio *rbio = bio->bi_private;
				1478
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1479	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1480	fail_bio_stripe(rbio, bio);
				1481	else
				1482	set_bio_pages_uptodate(bio);
				1483
				1484	bio_put(bio);
				1485
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1486	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1487	return;
				1488
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1489	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1490	goto cleanup;
				1491
				1492	/*
				1493	* this will normally call finish_rmw to start our write
				1494	* but if there are any failed stripes we'll reconstruct
				1495	* from parity first
				1496	*/
				1497	validate_rbio_for_rmw(rbio);
				1498	return;
				1499
				1500	cleanup:
				1501
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1502	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1503	}
				1504
				1505	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1506	{
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1507	btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL);
				1508	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1509	}
				1510
				1511	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1512	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1513	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1514	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1515
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1516	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1517	}
				1518
				1519	/*
				1520	* the stripe must be locked by the caller. It will
				1521	* unlock after all the writes are done
				1522	*/
				1523	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1524	{
				1525	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1526	struct bio_list bio_list;
				1527	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1528	int pagenr;
				1529	int stripe;
				1530	struct bio *bio;
				1531
				1532	bio_list_init(&bio_list);
				1533
				1534	ret = alloc_rbio_pages(rbio);
				1535	if (ret)
				1536	goto cleanup;
				1537
				1538	index_rbio_pages(rbio);
				1539
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1540	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1541	/*
				1542	* build a list of bios to read all the missing parts of this
				1543	* stripe
				1544	*/
				1545	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1546	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1547	struct page *page;
				1548	/*
				1549	* we want to find all the pages missing from
				1550	* the rbio and read them from the disk. If
				1551	* page_in_rbio finds a page in the bio list
				1552	* we don't need to read it off the stripe.
				1553	*/
				1554	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1555	if (page)
				1556	continue;
				1557
				1558	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1559	/*
				1560	* the bio cache may have handed us an uptodate
				1561	* page. If so, be happy and use it
				1562	*/
				1563	if (PageUptodate(page))
				1564	continue;
				1565
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1566	ret = rbio_add_io_page(rbio, &bio_list, page,
				1567	stripe, pagenr, rbio->stripe_len);
				1568	if (ret)
				1569	goto cleanup;
				1570	}
				1571	}
				1572
				1573	bios_to_read = bio_list_size(&bio_list);
				1574	if (!bios_to_read) {
				1575	/*
				1576	* this can happen if others have merged with
				1577	* us, it means there is nothing left to read.
				1578	* But if there are missing devices it may not be
				1579	* safe to do the full stripe write yet.
				1580	*/
				1581	goto finish;
				1582	}
				1583
				1584	/*
				1585	* the bbio may be freed once we submit the last bio. Make sure
				1586	* not to touch it after that
				1587	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1588	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1589	while (1) {
				1590	bio = bio_list_pop(&bio_list);
				1591	if (!bio)
				1592	break;
				1593
				1594	bio->bi_private = rbio;
				1595	bio->bi_end_io = raid_rmw_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	1596	bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1597
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1598	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1599
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1600	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1601	}
				1602	/* the actual write will happen once the reads are done */
				1603	return 0;
				1604
				1605	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1606	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1607
				1608	while ((bio = bio_list_pop(&bio_list)))
				1609	bio_put(bio);
				1610
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1611	return -EIO;
				1612
				1613	finish:
				1614	validate_rbio_for_rmw(rbio);
				1615	return 0;
				1616	}
				1617
				1618	/*
				1619	* if the upper layers pass in a full stripe, we thank them by only allocating
				1620	* enough pages to hold the parity, and sending it all down quickly.
				1621	*/
				1622	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1623	{
				1624	int ret;
				1625
				1626	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1627	if (ret) {
				1628	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1629	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1630	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1631
				1632	ret = lock_stripe_add(rbio);
				1633	if (ret == 0)
				1634	finish_rmw(rbio);
				1635	return 0;
				1636	}
				1637
				1638	/*
				1639	* partial stripe writes get handed over to async helpers.
				1640	* We're really hoping to merge a few more writes into this
				1641	* rbio before calculating new parity
				1642	*/
				1643	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1644	{
				1645	int ret;
				1646
				1647	ret = lock_stripe_add(rbio);
				1648	if (ret == 0)
				1649	async_rmw_stripe(rbio);
				1650	return 0;
				1651	}
				1652
				1653	/*
				1654	* sometimes while we were reading from the drive to
				1655	* recalculate parity, enough new bios come into create
				1656	* a full stripe. So we do a check here to see if we can
				1657	* go directly to finish_rmw
				1658	*/
				1659	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1660	{
				1661	/* head off into rmw land if we don't have a full stripe */
				1662	if (!rbio_is_full(rbio))
				1663	return partial_stripe_write(rbio);
				1664	return full_stripe_write(rbio);
				1665	}
				1666
/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything we can into the same rbios
 */
struct btrfs_plug_cb {
	/*
	 * block layer plug callback; the containing btrfs_plug_cb is
	 * recovered from it with container_of().
	 * NOTE(review): presumably has to stay the first member because
	 * blk_check_plugged() is asked for sizeof(struct btrfs_plug_cb) and
	 * returns a blk_plug_cb pointer -- confirm before reordering.
	 */
	struct blk_plug_cb cb;
	/* owning fs; raid56_parity_write() sets it when it finds it unset */
	struct btrfs_fs_info *info;
	/* rbios collected while plugged, linked through rbio->plug_list */
	struct list_head rbio_list;
	/* used to defer run_plug() to a worker when unplugged from schedule */
	struct btrfs_work work;
};
				1680
				1681	/*
				1682	* rbios on the plug list are sorted for easier merging.
				1683	*/
				1684	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1685	{
				1686	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1687	plug_list);
				1688	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1689	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1690	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1691	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1692
				1693	if (a_sector < b_sector)
				1694	return -1;
				1695	if (a_sector > b_sector)
				1696	return 1;
				1697	return 0;
				1698	}
				1699
				1700	static void run_plug(struct btrfs_plug_cb *plug)
				1701	{
				1702	struct btrfs_raid_bio *cur;
				1703	struct btrfs_raid_bio *last = NULL;
				1704
				1705	/*
				1706	* sort our plug list then try to merge
				1707	* everything we can in hopes of creating full
				1708	* stripes.
				1709	*/
				1710	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1711	while (!list_empty(&plug->rbio_list)) {
				1712	cur = list_entry(plug->rbio_list.next,
				1713	struct btrfs_raid_bio, plug_list);
				1714	list_del_init(&cur->plug_list);
				1715
				1716	if (rbio_is_full(cur)) {
				1717	/* we have a full stripe, send it down */
				1718	full_stripe_write(cur);
				1719	continue;
				1720	}
				1721	if (last) {
				1722	if (rbio_can_merge(last, cur)) {
				1723	merge_rbio(last, cur);
				1724	__free_raid_bio(cur);
				1725	continue;
				1726
				1727	}
				1728	__raid56_parity_write(last);
				1729	}
				1730	last = cur;
				1731	}
				1732	if (last) {
				1733	__raid56_parity_write(last);
				1734	}
				1735	kfree(plug);
				1736	}
				1737
				1738	/*
				1739	* if the unplug comes from schedule, we have to push the
				1740	* work off to a helper thread
				1741	*/
				1742	static void unplug_work(struct btrfs_work *work)
				1743	{
				1744	struct btrfs_plug_cb *plug;
				1745	plug = container_of(work, struct btrfs_plug_cb, work);
				1746	run_plug(plug);
				1747	}
				1748
				1749	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1750	{
				1751	struct btrfs_plug_cb *plug;
				1752	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1753
				1754	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1755	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1756	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1757	btrfs_queue_work(plug->info->rmw_workers,
				1758	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1759	return;
				1760	}
				1761	run_plug(plug);
				1762	}
				1763
				1764	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1765	* our main entry point for writes from the rest of the FS.
				1766	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1767	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1768	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1769	{
				1770	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1771	struct btrfs_plug_cb *plug = NULL;
				1772	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1773	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1774
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1775	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1776	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1777	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1778	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1779	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1780	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1781	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1782	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1783
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1784	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1785	rbio->generic_bio_cnt = 1;
				1786
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1787	/*
				1788	* don't plug on full rbios, just get them out the door
				1789	* as quickly as we can
				1790	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1791	if (rbio_is_full(rbio)) {
				1792	ret = full_stripe_write(rbio);
				1793	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1794	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1795	return ret;
				1796	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1797
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1798	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1799	if (cb) {
				1800	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1801	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1802	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1803	INIT_LIST_HEAD(&plug->rbio_list);
				1804	}
				1805	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1806	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1807	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1808	ret = __raid56_parity_write(rbio);
				1809	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1810	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1811	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1812	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1813	}
				1814
				1815	/*
				1816	* all parity reconstruction happens here. We've read in everything
				1817	* we can find from the drives and this does the heavy lifting of
				1818	* sorting the good from the bad.
				1819	*/
				1820	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1821	{
				1822	int pagenr, stripe;
				1823	void **pointers;
				1824	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1825	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1826	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1827	int i;
				1828
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1829	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1830	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1831	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1832	goto cleanup_io;
				1833	}
				1834
				1835	faila = rbio->faila;
				1836	failb = rbio->failb;
				1837
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1838	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1839	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1840	spin_lock_irq(&rbio->bio_list_lock);
				1841	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1842	spin_unlock_irq(&rbio->bio_list_lock);
				1843	}
				1844
				1845	index_rbio_pages(rbio);
				1846
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1847	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1848	/*
				1849	* Now we just use bitmap to mark the horizontal stripes in
				1850	* which we have data when doing parity scrub.
				1851	*/
				1852	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1853	!test_bit(pagenr, rbio->dbitmap))
				1854	continue;
				1855
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1856	/* setup our array of pointers with pages
				1857	* from each stripe
				1858	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1859	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1860	/*
				1861	* if we're rebuilding a read, we have to use
				1862	* pages from the bio list
				1863	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1864	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1865	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1866	(stripe == faila \|\| stripe == failb)) {
				1867	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1868	} else {
				1869	page = rbio_stripe_page(rbio, stripe, pagenr);
				1870	}
				1871	pointers[stripe] = kmap(page);
				1872	}
				1873
				1874	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1875	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1876	/*
				1877	* single failure, rebuild from parity raid5
				1878	* style
				1879	*/
				1880	if (failb < 0) {
				1881	if (faila == rbio->nr_data) {
				1882	/*
				1883	* Just the P stripe has failed, without
				1884	* a bad data or Q stripe.
				1885	* TODO, we should redo the xor here.
				1886	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1887	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1888	goto cleanup;
				1889	}
				1890	/*
				1891	* a single failure in raid6 is rebuilt
				1892	* in the pstripe code below
				1893	*/
				1894	goto pstripe;
				1895	}
				1896
				1897	/* make sure our ps and qs are in order */
				1898	if (faila > failb) {
				1899	int tmp = failb;
				1900	failb = faila;
				1901	faila = tmp;
				1902	}
				1903
				1904	/* if the q stripe is failed, do a pstripe reconstruction
				1905	* from the xors.
				1906	* If both the q stripe and the P stripe are failed, we're
				1907	* here due to a crc mismatch and we can't give them the
				1908	* data they want
				1909	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1910	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1911	if (rbio->bbio->raid_map[faila] ==
				1912	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1913	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1914	goto cleanup;
				1915	}
				1916	/*
				1917	* otherwise we have one bad data stripe and
				1918	* a good P stripe. raid5!
				1919	*/
				1920	goto pstripe;
				1921	}
				1922
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1923	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1924	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1925	PAGE_SIZE, faila, pointers);
				1926	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1927	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1928	PAGE_SIZE, faila, failb,
				1929	pointers);
				1930	}
				1931	} else {
				1932	void *p;
				1933
				1934	/* rebuild from P stripe here (raid5 or raid6) */
				1935	BUG_ON(failb != -1);
				1936	pstripe:
				1937	/* Copy parity block into failed block to start with */
				1938	memcpy(pointers[faila],
				1939	pointers[rbio->nr_data],
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1940	PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1941
				1942	/* rearrange the pointer array */
				1943	p = pointers[faila];
				1944	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1945	pointers[stripe] = pointers[stripe + 1];
				1946	pointers[rbio->nr_data - 1] = p;
				1947
				1948	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1949	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1950	}
				1951	/* if we're doing this rebuild as part of an rmw, go through
				1952	* and set all of our private rbio pages in the
				1953	* failed stripes as uptodate. This way finish_rmw will
				1954	* know they can be trusted. If this was a read reconstruction,
				1955	* other endio functions will fiddle the uptodate bits
				1956	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1957	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1958	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1959	if (faila != -1) {
				1960	page = rbio_stripe_page(rbio, faila, i);
				1961	SetPageUptodate(page);
				1962	}
				1963	if (failb != -1) {
				1964	page = rbio_stripe_page(rbio, failb, i);
				1965	SetPageUptodate(page);
				1966	}
				1967	}
				1968	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1969	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1970	/*
				1971	* if we're rebuilding a read, we have to use
				1972	* pages from the bio list
				1973	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1974	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1975	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1976	(stripe == faila \|\| stripe == failb)) {
				1977	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1978	} else {
				1979	page = rbio_stripe_page(rbio, stripe, pagenr);
				1980	}
				1981	kunmap(page);
				1982	}
				1983	}
				1984
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1985	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1986	cleanup:
				1987	kfree(pointers);
				1988
				1989	cleanup_io:
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1990	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1991	/*
				1992	* - In case of two failures, where rbio->failb != -1:
				1993	*
				1994	* Do not cache this rbio since the above read reconstruction
				1995	* (raid6_datap_recov() or raid6_2data_recov()) may have
				1996	* changed some content of stripes which are not identical to
				1997	* on-disk content any more, otherwise, a later write/recover
				1998	* may steal stripe_pages from this rbio and end up with
				1999	* corruptions or rebuild failures.
				2000	*
				2001	* - In case of single failure, where rbio->failb == -1:
				2002	*
				2003	* Cache this rbio iff the above read reconstruction is
				2004	* excuted without problems.
				2005	*/
				2006	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2007	cache_rbio_pages(rbio);
				2008	else
				2009	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2010
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2011	rbio_orig_end_io(rbio, err);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2012	} else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Linus Torvalds	2236597	2015-09-05 15:14:43 -0700	[diff] [blame]	2013	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2014	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2015	rbio->faila = -1;
				2016	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2017
				2018	if (rbio->operation == BTRFS_RBIO_WRITE)
				2019	finish_rmw(rbio);
				2020	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				2021	finish_parity_scrub(rbio, 0);
				2022	else
				2023	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2024	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2025	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2026	}
				2027	}
				2028
				2029	/*
				2030	* This is called only for stripes we've read from disk to
				2031	* reconstruct the parity.
				2032	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2033	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2034	{
				2035	struct btrfs_raid_bio *rbio = bio->bi_private;
				2036
				2037	/*
				2038	* we only read stripe pages off the disk, set them
				2039	* up to date if there were no errors
				2040	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2041	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2042	fail_bio_stripe(rbio, bio);
				2043	else
				2044	set_bio_pages_uptodate(bio);
				2045	bio_put(bio);
				2046
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2047	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2048	return;
				2049
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2050	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2051	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2052	else
				2053	__raid_recover_end_io(rbio);
				2054	}
				2055
				2056	/*
				2057	* reads everything we need off the disk to reconstruct
				2058	* the parity. endio handlers trigger final reconstruction
				2059	* when the IO is done.
				2060	*
				2061	* This is used both for reads from the higher layers and for
				2062	* parity construction required to finish a rmw cycle.
				2063	*/
				2064	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2065	{
				2066	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2067	struct bio_list bio_list;
				2068	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2069	int pagenr;
				2070	int stripe;
				2071	struct bio *bio;
				2072
				2073	bio_list_init(&bio_list);
				2074
				2075	ret = alloc_rbio_pages(rbio);
				2076	if (ret)
				2077	goto cleanup;
				2078
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2079	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2080
				2081	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2082	* read everything that hasn't failed. Thanks to the
				2083	* stripe cache, it is possible that some or all of these
				2084	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2085	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2086	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2087	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2088	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2089	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2090	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2091
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2092	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2093	struct page *p;
				2094
				2095	/*
				2096	* the rmw code may have already read this
				2097	* page in
				2098	*/
				2099	p = rbio_stripe_page(rbio, stripe, pagenr);
				2100	if (PageUptodate(p))
				2101	continue;
				2102
				2103	ret = rbio_add_io_page(rbio, &bio_list,
				2104	rbio_stripe_page(rbio, stripe, pagenr),
				2105	stripe, pagenr, rbio->stripe_len);
				2106	if (ret < 0)
				2107	goto cleanup;
				2108	}
				2109	}
				2110
				2111	bios_to_read = bio_list_size(&bio_list);
				2112	if (!bios_to_read) {
				2113	/*
				2114	* we might have no bios to read just because the pages
				2115	* were up to date, or we might have no bios to read because
				2116	* the devices were gone.
				2117	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2118	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2119	__raid_recover_end_io(rbio);
				2120	goto out;
				2121	} else {
				2122	goto cleanup;
				2123	}
				2124	}
				2125
				2126	/*
				2127	* the bbio may be freed once we submit the last bio. Make sure
				2128	* not to touch it after that
				2129	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2130	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2131	while (1) {
				2132	bio = bio_list_pop(&bio_list);
				2133	if (!bio)
				2134	break;
				2135
				2136	bio->bi_private = rbio;
				2137	bio->bi_end_io = raid_recover_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2138	bio_set_op_attrs(bio, REQ_OP_READ, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2139
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2140	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2141
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2142	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2143	}
				2144	out:
				2145	return 0;
				2146
				2147	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2148	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2149	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2150	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2151
				2152	while ((bio = bio_list_pop(&bio_list)))
				2153	bio_put(bio);
				2154
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2155	return -EIO;
				2156	}
				2157
				2158	/*
				2159	* the main entry point for reads from the higher layers. This
				2160	* is really only called when the normal read path had a failure,
				2161	* so we assume the bio they send down corresponds to a failed part
				2162	* of the drive.
				2163	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2164	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2165	struct btrfs_bio *bbio, u64 stripe_len,
				2166	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2167	{
				2168	struct btrfs_raid_bio *rbio;
				2169	int ret;
				2170
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2171	if (generic_io) {
				2172	ASSERT(bbio->mirror_num == mirror_num);
				2173	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2174	}
				2175
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2176	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2177	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2178	if (generic_io)
				2179	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2180	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2181	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2182
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2183	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2184	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2185	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2186
				2187	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2188	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2189	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2190	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
				2191	__func__, (u64)bio->bi_iter.bi_sector << 9,
				2192	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2193	if (generic_io)
				2194	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2195	kfree(rbio);
				2196	return -EIO;
				2197	}
				2198
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2199	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2200	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2201	rbio->generic_bio_cnt = 1;
				2202	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2203	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2204	}
				2205
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2206	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2207	* Loop retry:
				2208	* for 'mirror == 2', reconstruct from all other stripes.
				2209	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2210	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2211	if (mirror_num > 2) {
				2212	/*
				2213	* 'mirror == 3' is to fail the p stripe and
				2214	* reconstruct from the q stripe. 'mirror > 3' is to
				2215	* fail a data stripe and reconstruct from p+q stripe.
				2216	*/
				2217	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2218	ASSERT(rbio->failb > 0);
				2219	if (rbio->failb <= rbio->faila)
				2220	rbio->failb--;
				2221	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2222
				2223	ret = lock_stripe_add(rbio);
				2224
				2225	/*
				2226	* __raid56_parity_recover will end the bio with
				2227	* any errors it hits. We don't want to return
				2228	* its error value up the stack because our caller
				2229	* will end up calling bio_endio with any nonzero
				2230	* return
				2231	*/
				2232	if (ret == 0)
				2233	__raid56_parity_recover(rbio);
				2234	/*
				2235	* our rbio has been added to the list of
				2236	* rbios that will be handled after the
				2237	* currently lock owner is done
				2238	*/
				2239	return 0;
				2240
				2241	}
				2242
				2243	static void rmw_work(struct btrfs_work *work)
				2244	{
				2245	struct btrfs_raid_bio *rbio;
				2246
				2247	rbio = container_of(work, struct btrfs_raid_bio, work);
				2248	raid56_rmw_stripe(rbio);
				2249	}
				2250
				2251	static void read_rebuild_work(struct btrfs_work *work)
				2252	{
				2253	struct btrfs_raid_bio *rbio;
				2254
				2255	rbio = container_of(work, struct btrfs_raid_bio, work);
				2256	__raid56_parity_recover(rbio);
				2257	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2258
				2259	/*
				2260	* The following code is used to scrub/replace the parity stripe
				2261	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2262	* Caller must have already increased bio_counter for getting @bbio.
				2263	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2264	* Note: We need make sure all the pages that add into the scrub/replace
				2265	* raid bio are correct and not be changed during the scrub/replace. That
				2266	* is those pages just hold metadata or file data with checksum.
				2267	*/
				2268
				2269	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2270	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2271	struct btrfs_bio *bbio, u64 stripe_len,
				2272	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2273	unsigned long *dbitmap, int stripe_nsectors)
				2274	{
				2275	struct btrfs_raid_bio *rbio;
				2276	int i;
				2277
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2278	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2279	if (IS_ERR(rbio))
				2280	return NULL;
				2281	bio_list_add(&rbio->bio_list, bio);
				2282	/*
				2283	* This is a special bio which is used to hold the completion handler
				2284	* and make the scrub rbio is similar to the other types
				2285	*/
				2286	ASSERT(!bio->bi_iter.bi_size);
				2287	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2288
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2289	/*
				2290	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2291	* to the end position, so this search can start from the first parity
				2292	* stripe.
				2293	*/
				2294	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2295	if (bbio->stripes[i].dev == scrub_dev) {
				2296	rbio->scrubp = i;
				2297	break;
				2298	}
				2299	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2300	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2301
				2302	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2303	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2304	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2305	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2306
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2307	/*
				2308	* We have already increased bio_counter when getting bbio, record it
				2309	* so we can free it at rbio_orig_end_io().
				2310	*/
				2311	rbio->generic_bio_cnt = 1;
				2312
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2313	return rbio;
				2314	}
				2315
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2316	/* Used for both parity scrub and missing. */
				2317	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2318	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2319	{
				2320	int stripe_offset;
				2321	int index;
				2322
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2323	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2324	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2325	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2326	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2327	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2328	rbio->bio_pages[index] = page;
				2329	}
				2330
				2331	/*
				2332	* We just scrub the parity that we have correct data on the same horizontal,
				2333	* so we needn't allocate all pages for all the stripes.
				2334	*/
				2335	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2336	{
				2337	int i;
				2338	int bit;
				2339	int index;
				2340	struct page *page;
				2341
				2342	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2343	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2344	index = i * rbio->stripe_npages + bit;
				2345	if (rbio->stripe_pages[index])
				2346	continue;
				2347
				2348	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2349	if (!page)
				2350	return -ENOMEM;
				2351	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2352	}
				2353	}
				2354	return 0;
				2355	}
				2356
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2357	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2358	int need_check)
				2359	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2360	struct btrfs_bio *bbio = rbio->bbio;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2361	void *pointers[rbio->real_stripes];
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2362	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2363	int nr_data = rbio->nr_data;
				2364	int stripe;
				2365	int pagenr;
				2366	int p_stripe = -1;
				2367	int q_stripe = -1;
				2368	struct page *p_page = NULL;
				2369	struct page *q_page = NULL;
				2370	struct bio_list bio_list;
				2371	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2372	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2373	int ret;
				2374
				2375	bio_list_init(&bio_list);
				2376
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2377	if (rbio->real_stripes - rbio->nr_data == 1) {
				2378	p_stripe = rbio->real_stripes - 1;
				2379	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2380	p_stripe = rbio->real_stripes - 2;
				2381	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2382	} else {
				2383	BUG();
				2384	}
				2385
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2386	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2387	is_replace = 1;
				2388	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2389	}
				2390
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2391	/*
				2392	* Because the higher layers(scrubber) are unlikely to
				2393	* use this area of the disk again soon, so don't cache
				2394	* it.
				2395	*/
				2396	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2397
				2398	if (!need_check)
				2399	goto writeback;
				2400
				2401	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2402	if (!p_page)
				2403	goto cleanup;
				2404	SetPageUptodate(p_page);
				2405
				2406	if (q_stripe != -1) {
				2407	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2408	if (!q_page) {
				2409	__free_page(p_page);
				2410	goto cleanup;
				2411	}
				2412	SetPageUptodate(q_page);
				2413	}
				2414
				2415	atomic_set(&rbio->error, 0);
				2416
				2417	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2418	struct page *p;
				2419	void *parity;
				2420	/* first collect one page from each data stripe */
				2421	for (stripe = 0; stripe < nr_data; stripe++) {
				2422	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2423	pointers[stripe] = kmap(p);
				2424	}
				2425
				2426	/* then add the parity stripe */
				2427	pointers[stripe++] = kmap(p_page);
				2428
				2429	if (q_stripe != -1) {
				2430
				2431	/*
				2432	* raid6, add the qstripe and call the
				2433	* library function to fill in our p/q
				2434	*/
				2435	pointers[stripe++] = kmap(q_page);
				2436
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2437	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2438	pointers);
				2439	} else {
				2440	/* raid5 */
				2441	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2442	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2443	}
				2444
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2445	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2446	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2447	parity = kmap(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2448	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
				2449	memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2450	else
				2451	/* Parity is right, needn't writeback */
				2452	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2453	kunmap(p);
				2454
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2455	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2456	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2457	}
				2458
				2459	__free_page(p_page);
				2460	if (q_page)
				2461	__free_page(q_page);
				2462
				2463	writeback:
				2464	/*
				2465	* time to start writing. Make bios for everything from the
				2466	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2467	* everything else.
				2468	*/
				2469	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2470	struct page *page;
				2471
				2472	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2473	ret = rbio_add_io_page(rbio, &bio_list,
				2474	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2475	if (ret)
				2476	goto cleanup;
				2477	}
				2478
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2479	if (!is_replace)
				2480	goto submit_write;
				2481
				2482	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2483	struct page *page;
				2484
				2485	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2486	ret = rbio_add_io_page(rbio, &bio_list, page,
				2487	bbio->tgtdev_map[rbio->scrubp],
				2488	pagenr, rbio->stripe_len);
				2489	if (ret)
				2490	goto cleanup;
				2491	}
				2492
				2493	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2494	nr_data = bio_list_size(&bio_list);
				2495	if (!nr_data) {
				2496	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2497	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2498	return;
				2499	}
				2500
				2501	atomic_set(&rbio->stripes_pending, nr_data);
				2502
				2503	while (1) {
				2504	bio = bio_list_pop(&bio_list);
				2505	if (!bio)
				2506	break;
				2507
				2508	bio->bi_private = rbio;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	2509	bio->bi_end_io = raid_write_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2510	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2511
				2512	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2513	}
				2514	return;
				2515
				2516	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2517	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2518
				2519	while ((bio = bio_list_pop(&bio_list)))
				2520	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2521	}
				2522
				2523	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2524	{
				2525	if (stripe >= 0 && stripe < rbio->nr_data)
				2526	return 1;
				2527	return 0;
				2528	}
				2529
				2530	/*
				2531	* While we're doing the parity check and repair, we could have errors
				2532	* in reading pages off the disk. This checks for errors and if we're
				2533	* not able to read the page it'll trigger parity reconstruction. The
				2534	* parity scrub will be finished after we've reconstructed the failed
				2535	* stripes
				2536	*/
				2537	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2538	{
				2539	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2540	goto cleanup;
				2541
				2542	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2543	int dfail = 0, failp = -1;
				2544
				2545	if (is_data_stripe(rbio, rbio->faila))
				2546	dfail++;
				2547	else if (is_parity_stripe(rbio->faila))
				2548	failp = rbio->faila;
				2549
				2550	if (is_data_stripe(rbio, rbio->failb))
				2551	dfail++;
				2552	else if (is_parity_stripe(rbio->failb))
				2553	failp = rbio->failb;
				2554
				2555	/*
				2556	* Because we can not use a scrubbing parity to repair
				2557	* the data, so the capability of the repair is declined.
				2558	* (In the case of RAID5, we can not repair anything)
				2559	*/
				2560	if (dfail > rbio->bbio->max_errors - 1)
				2561	goto cleanup;
				2562
				2563	/*
				2564	* If all data is good, only parity is correctly, just
				2565	* repair the parity.
				2566	*/
				2567	if (dfail == 0) {
				2568	finish_parity_scrub(rbio, 0);
				2569	return;
				2570	}
				2571
				2572	/*
				2573	* Here means we got one corrupted data stripe and one
				2574	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2575	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2576	* the data, or we can not repair the data stripe.
				2577	*/
				2578	if (failp != rbio->scrubp)
				2579	goto cleanup;
				2580
				2581	__raid_recover_end_io(rbio);
				2582	} else {
				2583	finish_parity_scrub(rbio, 1);
				2584	}
				2585	return;
				2586
				2587	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2588	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2589	}
				2590
				2591	/*
				2592	* end io for the read phase of the rmw cycle. All the bios here are physical
				2593	* stripe bios we've read from the disk so we can recalculate the parity of the
				2594	* stripe.
				2595	*
				2596	* This will usually kick off finish_rmw once all the bios are read in, but it
				2597	* may trigger parity reconstruction if we had any errors along the way
				2598	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2599	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2600	{
				2601	struct btrfs_raid_bio *rbio = bio->bi_private;
				2602
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2603	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2604	fail_bio_stripe(rbio, bio);
				2605	else
				2606	set_bio_pages_uptodate(bio);
				2607
				2608	bio_put(bio);
				2609
				2610	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2611	return;
				2612
				2613	/*
				2614	* this will normally call finish_rmw to start our write
				2615	* but if there are any failed stripes we'll reconstruct
				2616	* from parity first
				2617	*/
				2618	validate_rbio_for_parity_scrub(rbio);
				2619	}
				2620
				2621	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2622	{
				2623	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2624	struct bio_list bio_list;
				2625	int ret;
				2626	int pagenr;
				2627	int stripe;
				2628	struct bio *bio;
				2629
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2630	bio_list_init(&bio_list);
				2631
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2632	ret = alloc_rbio_essential_pages(rbio);
				2633	if (ret)
				2634	goto cleanup;
				2635
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2636	atomic_set(&rbio->error, 0);
				2637	/*
				2638	* build a list of bios to read all the missing parts of this
				2639	* stripe
				2640	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2641	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2642	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2643	struct page *page;
				2644	/*
				2645	* we want to find all the pages missing from
				2646	* the rbio and read them from the disk. If
				2647	* page_in_rbio finds a page in the bio list
				2648	* we don't need to read it off the stripe.
				2649	*/
				2650	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2651	if (page)
				2652	continue;
				2653
				2654	page = rbio_stripe_page(rbio, stripe, pagenr);
				2655	/*
				2656	* the bio cache may have handed us an uptodate
				2657	* page. If so, be happy and use it
				2658	*/
				2659	if (PageUptodate(page))
				2660	continue;
				2661
				2662	ret = rbio_add_io_page(rbio, &bio_list, page,
				2663	stripe, pagenr, rbio->stripe_len);
				2664	if (ret)
				2665	goto cleanup;
				2666	}
				2667	}
				2668
				2669	bios_to_read = bio_list_size(&bio_list);
				2670	if (!bios_to_read) {
				2671	/*
				2672	* this can happen if others have merged with
				2673	* us, it means there is nothing left to read.
				2674	* But if there are missing devices it may not be
				2675	* safe to do the full stripe write yet.
				2676	*/
				2677	goto finish;
				2678	}
				2679
				2680	/*
				2681	* the bbio may be freed once we submit the last bio. Make sure
				2682	* not to touch it after that
				2683	*/
				2684	atomic_set(&rbio->stripes_pending, bios_to_read);
				2685	while (1) {
				2686	bio = bio_list_pop(&bio_list);
				2687	if (!bio)
				2688	break;
				2689
				2690	bio->bi_private = rbio;
				2691	bio->bi_end_io = raid56_parity_scrub_end_io;
Mike Christie	37226b2	2016-06-05 14:31:52 -0500	[diff] [blame]	2692	bio_set_op_attrs(bio, REQ_OP_READ, 0);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2693
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2694	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2695
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2696	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2697	}
				2698	/* the actual write will happen once the reads are done */
				2699	return;
				2700
				2701	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2702	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2703
				2704	while ((bio = bio_list_pop(&bio_list)))
				2705	bio_put(bio);
				2706
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2707	return;
				2708
				2709	finish:
				2710	validate_rbio_for_parity_scrub(rbio);
				2711	}
				2712
				2713	static void scrub_parity_work(struct btrfs_work *work)
				2714	{
				2715	struct btrfs_raid_bio *rbio;
				2716
				2717	rbio = container_of(work, struct btrfs_raid_bio, work);
				2718	raid56_parity_scrub_stripe(rbio);
				2719	}
				2720
				2721	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2722	{
				2723	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2724	scrub_parity_work, NULL, NULL);
				2725
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2726	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2727	}
				2728
				2729	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2730	{
				2731	if (!lock_stripe_add(rbio))
				2732	async_scrub_parity(rbio);
				2733	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2734
				2735	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2736
				2737	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2738	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2739	struct btrfs_bio *bbio, u64 length)
				2740	{
				2741	struct btrfs_raid_bio *rbio;
				2742
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2743	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2744	if (IS_ERR(rbio))
				2745	return NULL;
				2746
				2747	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2748	bio_list_add(&rbio->bio_list, bio);
				2749	/*
				2750	* This is a special bio which is used to hold the completion handler
				2751	* and make the scrub rbio is similar to the other types
				2752	*/
				2753	ASSERT(!bio->bi_iter.bi_size);
				2754
				2755	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2756	if (rbio->faila == -1) {
				2757	BUG();
				2758	kfree(rbio);
				2759	return NULL;
				2760	}
				2761
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2762	/*
				2763	* When we get bbio, we have already increased bio_counter, record it
				2764	* so we can free it at rbio_orig_end_io()
				2765	*/
				2766	rbio->generic_bio_cnt = 1;
				2767
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2768	return rbio;
				2769	}
				2770
				2771	static void missing_raid56_work(struct btrfs_work *work)
				2772	{
				2773	struct btrfs_raid_bio *rbio;
				2774
				2775	rbio = container_of(work, struct btrfs_raid_bio, work);
				2776	__raid56_parity_recover(rbio);
				2777	}
				2778
				2779	static void async_missing_raid56(struct btrfs_raid_bio *rbio)
				2780	{
				2781	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2782	missing_raid56_work, NULL, NULL);
				2783
				2784	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				2785	}
				2786
				2787	void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
				2788	{
				2789	if (!lock_stripe_add(rbio))
				2790	async_missing_raid56(rbio);
				2791	}