Blame - fs/btrfs/raid56.c - SHIFTPHONES/mainline/linux

blob: f30d847baf07c5a1aaf3583b3ed6335831fb6419 [file] [log] [blame]

David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2	/*
				3	* Copyright (C) 2012 Fusion-io All rights reserved.
				4	* Copyright (C) 2012 Intel Corp. All rights reserved.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	5	*/
David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	6
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	7	#include <linux/sched.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	8	#include <linux/bio.h>
				9	#include <linux/slab.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	10	#include <linux/blkdev.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	11	#include <linux/raid/pq.h>
				12	#include <linux/hash.h>
				13	#include <linux/list_sort.h>
				14	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	15	#include <linux/mm.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	16	#include "ctree.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	17	#include "disk-io.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	18	#include "volumes.h"
				19	#include "raid56.h"
				20	#include "async-thread.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	21
				22	/* set when additional merges to this rbio are not allowed */
				23	#define RBIO_RMW_LOCKED_BIT 1
				24
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	25	/*
				26	* set when this rbio is sitting in the hash, but it is just a cache
				27	* of past RMW
				28	*/
				29	#define RBIO_CACHE_BIT 2
				30
				31	/*
				32	* set when it is safe to trust the stripe_pages for caching
				33	*/
				34	#define RBIO_CACHE_READY_BIT 3
				35
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	36	#define RBIO_CACHE_SIZE 1024
				37
/* The kind of work a btrfs_raid_bio was created to perform. */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE		= 0,
	BTRFS_RBIO_READ_REBUILD		= 1,
	BTRFS_RBIO_PARITY_SCRUB		= 2,
	BTRFS_RBIO_REBUILD_MISSING	= 3,
};
				44
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	45	struct btrfs_raid_bio {
				46	struct btrfs_fs_info *fs_info;
				47	struct btrfs_bio *bbio;
				48
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	49	/* while we're doing rmw on a stripe
				50	* we put it into a hash table so we can
				51	* lock the stripe and merge more rbios
				52	* into it.
				53	*/
				54	struct list_head hash_list;
				55
				56	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	57	* LRU list for the stripe cache
				58	*/
				59	struct list_head stripe_cache;
				60
				61	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	62	* for scheduling work in the helper threads
				63	*/
				64	struct btrfs_work work;
				65
				66	/*
				67	* bio list and bio_list_lock are used
				68	* to add more bios into the stripe
				69	* in hopes of avoiding the full rmw
				70	*/
				71	struct bio_list bio_list;
				72	spinlock_t bio_list_lock;
				73
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	74	/* also protected by the bio_list_lock, the
				75	* plug list is used by the plugging code
				76	* to collect partial bios while plugged. The
				77	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	78	* the stripe lock to the next pending IO
				79	*/
				80	struct list_head plug_list;
				81
				82	/*
				83	* flags that tell us if it is safe to
				84	* merge with this bio
				85	*/
				86	unsigned long flags;
				87
				88	/* size of each individual stripe on disk */
				89	int stripe_len;
				90
				91	/* number of data stripes (no p/q) */
				92	int nr_data;
				93
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	94	int real_stripes;
				95
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	96	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	97	/*
				98	* set if we're doing a parity rebuild
				99	* for a read from higher up, which is handled
				100	* differently from a parity rebuild as part of
				101	* rmw
				102	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	103	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	104
				105	/* first bad stripe */
				106	int faila;
				107
				108	/* second bad stripe (for raid6 use) */
				109	int failb;
				110
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	111	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	112	/*
				113	* number of pages needed to represent the full
				114	* stripe
				115	*/
				116	int nr_pages;
				117
				118	/*
				119	* size of all the bios in the bio_list. This
				120	* helps us decide if the rbio maps to a full
				121	* stripe or not
				122	*/
				123	int bio_list_bytes;
				124
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	125	int generic_bio_cnt;
				126
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	127	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	128
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	129	atomic_t stripes_pending;
				130
				131	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	132	/*
				133	* these are two arrays of pointers. We allocate the
				134	* rbio big enough to hold them both and setup their
				135	* locations when the rbio is allocated
				136	*/
				137
				138	/* pointers to pages that we allocated for
				139	* reading/writing stripes directly from the disk (including P/Q)
				140	*/
				141	struct page **stripe_pages;
				142
				143	/*
				144	* pointers to the pages in the bio_list. Stored
				145	* here for faster lookup
				146	*/
				147	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	148
				149	/*
				150	* bitmap to record which horizontal stripe has data
				151	*/
				152	unsigned long *dbitmap;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	153
				154	/* allocated with real_stripes-many pointers for finish_() calls /
				155	void **finish_pointers;
				156
				157	/* allocated with stripe_npages-many bits for finish_() calls /
				158	unsigned long *finish_pbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	159	};
				160
				161	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				162	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				163	static void rmw_work(struct btrfs_work *work);
				164	static void read_rebuild_work(struct btrfs_work *work);
				165	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				166	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				167	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				168	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				169	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				170	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				171	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				172
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	173	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				174	int need_check);
				175	static void async_scrub_parity(struct btrfs_raid_bio *rbio);
				176
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame^]	177	static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
				178	{
				179	btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
				180	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				181	}
				182
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	183	/*
				184	* the stripe hash table is used for locking, and to collect
				185	* bios in hopes of making a full stripe
				186	*/
				187	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				188	{
				189	struct btrfs_stripe_hash_table *table;
				190	struct btrfs_stripe_hash_table *x;
				191	struct btrfs_stripe_hash *cur;
				192	struct btrfs_stripe_hash *h;
				193	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				194	int i;
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	195	int table_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	196
				197	if (info->stripe_hash_table)
				198	return 0;
				199
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	200	/*
				201	* The table is large, starting with order 4 and can go as high as
				202	* order 7 in case lock debugging is turned on.
				203	*
				204	* Try harder to allocate and fallback to vmalloc to lower the chance
				205	* of a failing mount.
				206	*/
				207	table_size = sizeof(table) + sizeof(h) * num_entries;
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	208	table = kvzalloc(table_size, GFP_KERNEL);
				209	if (!table)
				210	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	211
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	212	spin_lock_init(&table->cache_lock);
				213	INIT_LIST_HEAD(&table->stripe_cache);
				214
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	215	h = table->table;
				216
				217	for (i = 0; i < num_entries; i++) {
				218	cur = h + i;
				219	INIT_LIST_HEAD(&cur->hash_list);
				220	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	221	}
				222
				223	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	224	if (x)
				225	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	226	return 0;
				227	}
				228
				229	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	230	* caching an rbio means to copy anything from the
				231	* bio_pages array into the stripe_pages array. We
				232	* use the page uptodate bit in the stripe cache array
				233	* to indicate if it has valid data
				234	*
				235	* once the caching is done, we set the cache ready
				236	* bit.
				237	*/
				238	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				239	{
				240	int i;
				241	char *s;
				242	char *d;
				243	int ret;
				244
				245	ret = alloc_rbio_pages(rbio);
				246	if (ret)
				247	return;
				248
				249	for (i = 0; i < rbio->nr_pages; i++) {
				250	if (!rbio->bio_pages[i])
				251	continue;
				252
				253	s = kmap(rbio->bio_pages[i]);
				254	d = kmap(rbio->stripe_pages[i]);
				255
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	256	copy_page(d, s);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	257
				258	kunmap(rbio->bio_pages[i]);
				259	kunmap(rbio->stripe_pages[i]);
				260	SetPageUptodate(rbio->stripe_pages[i]);
				261	}
				262	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				263	}
				264
				265	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	266	* we hash on the first logical address of the stripe
				267	*/
				268	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				269	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	270	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	271
				272	/*
				273	* we shift down quite a bit. We're using byte
				274	* addressing, and most of the lower bits are zeros.
				275	* This tends to upset hash_64, and it consistently
				276	* returns just one or two different values.
				277	*
				278	* shifting off the lower bits fixes things.
				279	*/
				280	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				281	}
				282
				283	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	284	* stealing an rbio means taking all the uptodate pages from the stripe
				285	* array in the source rbio and putting them into the destination rbio
				286	*/
				287	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				288	{
				289	int i;
				290	struct page *s;
				291	struct page *d;
				292
				293	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				294	return;
				295
				296	for (i = 0; i < dest->nr_pages; i++) {
				297	s = src->stripe_pages[i];
				298	if (!s \|\| !PageUptodate(s)) {
				299	continue;
				300	}
				301
				302	d = dest->stripe_pages[i];
				303	if (d)
				304	__free_page(d);
				305
				306	dest->stripe_pages[i] = s;
				307	src->stripe_pages[i] = NULL;
				308	}
				309	}
				310
				311	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	312	* merging means we take the bio_list from the victim and
				313	* splice it into the destination. The victim should
				314	* be discarded afterwards.
				315	*
				316	* must be called with dest->rbio_list_lock held
				317	*/
				318	static void merge_rbio(struct btrfs_raid_bio *dest,
				319	struct btrfs_raid_bio *victim)
				320	{
				321	bio_list_merge(&dest->bio_list, &victim->bio_list);
				322	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	323	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	324	bio_list_init(&victim->bio_list);
				325	}
				326
				327	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	328	* used to prune items that are in the cache. The caller
				329	* must hold the hash table lock.
				330	*/
				331	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				332	{
				333	int bucket = rbio_bucket(rbio);
				334	struct btrfs_stripe_hash_table *table;
				335	struct btrfs_stripe_hash *h;
				336	int freeit = 0;
				337
				338	/*
				339	* check the bit again under the hash table lock.
				340	*/
				341	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				342	return;
				343
				344	table = rbio->fs_info->stripe_hash_table;
				345	h = table->table + bucket;
				346
				347	/* hold the lock for the bucket because we may be
				348	* removing it from the hash table
				349	*/
				350	spin_lock(&h->lock);
				351
				352	/*
				353	* hold the lock for the bio list because we need
				354	* to make sure the bio list is empty
				355	*/
				356	spin_lock(&rbio->bio_list_lock);
				357
				358	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				359	list_del_init(&rbio->stripe_cache);
				360	table->cache_size -= 1;
				361	freeit = 1;
				362
				363	/* if the bio list isn't empty, this rbio is
				364	* still involved in an IO. We take it out
				365	* of the cache list, and drop the ref that
				366	* was held for the list.
				367	*
				368	* If the bio_list was empty, we also remove
				369	* the rbio from the hash_table, and drop
				370	* the corresponding ref
				371	*/
				372	if (bio_list_empty(&rbio->bio_list)) {
				373	if (!list_empty(&rbio->hash_list)) {
				374	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	375	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	376	BUG_ON(!list_empty(&rbio->plug_list));
				377	}
				378	}
				379	}
				380
				381	spin_unlock(&rbio->bio_list_lock);
				382	spin_unlock(&h->lock);
				383
				384	if (freeit)
				385	__free_raid_bio(rbio);
				386	}
				387
				388	/*
				389	* prune a given rbio from the cache
				390	*/
				391	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				392	{
				393	struct btrfs_stripe_hash_table *table;
				394	unsigned long flags;
				395
				396	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				397	return;
				398
				399	table = rbio->fs_info->stripe_hash_table;
				400
				401	spin_lock_irqsave(&table->cache_lock, flags);
				402	__remove_rbio_from_cache(rbio);
				403	spin_unlock_irqrestore(&table->cache_lock, flags);
				404	}
				405
				406	/*
				407	* remove everything in the cache
				408	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	409	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	410	{
				411	struct btrfs_stripe_hash_table *table;
				412	unsigned long flags;
				413	struct btrfs_raid_bio *rbio;
				414
				415	table = info->stripe_hash_table;
				416
				417	spin_lock_irqsave(&table->cache_lock, flags);
				418	while (!list_empty(&table->stripe_cache)) {
				419	rbio = list_entry(table->stripe_cache.next,
				420	struct btrfs_raid_bio,
				421	stripe_cache);
				422	__remove_rbio_from_cache(rbio);
				423	}
				424	spin_unlock_irqrestore(&table->cache_lock, flags);
				425	}
				426
				427	/*
				428	* remove all cached entries and free the hash table
				429	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	430	*/
				431	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				432	{
				433	if (!info->stripe_hash_table)
				434	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	435	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	436	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	437	info->stripe_hash_table = NULL;
				438	}
				439
				440	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	441	* insert an rbio into the stripe cache. It
				442	* must have already been prepared by calling
				443	* cache_rbio_pages
				444	*
				445	* If this rbio was already cached, it gets
				446	* moved to the front of the lru.
				447	*
				448	* If the size of the rbio cache is too big, we
				449	* prune an item.
				450	*/
				451	static void cache_rbio(struct btrfs_raid_bio *rbio)
				452	{
				453	struct btrfs_stripe_hash_table *table;
				454	unsigned long flags;
				455
				456	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				457	return;
				458
				459	table = rbio->fs_info->stripe_hash_table;
				460
				461	spin_lock_irqsave(&table->cache_lock, flags);
				462	spin_lock(&rbio->bio_list_lock);
				463
				464	/* bump our ref if we were not in the list before */
				465	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	466	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	467
				468	if (!list_empty(&rbio->stripe_cache)){
				469	list_move(&rbio->stripe_cache, &table->stripe_cache);
				470	} else {
				471	list_add(&rbio->stripe_cache, &table->stripe_cache);
				472	table->cache_size += 1;
				473	}
				474
				475	spin_unlock(&rbio->bio_list_lock);
				476
				477	if (table->cache_size > RBIO_CACHE_SIZE) {
				478	struct btrfs_raid_bio *found;
				479
				480	found = list_entry(table->stripe_cache.prev,
				481	struct btrfs_raid_bio,
				482	stripe_cache);
				483
				484	if (found != rbio)
				485	__remove_rbio_from_cache(found);
				486	}
				487
				488	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	489	}
				490
				491	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	492	* helper function to run the xor_blocks api. It is only
				493	* able to do MAX_XOR_BLOCKS at a time, so we need to
				494	* loop through.
				495	*/
				496	static void run_xor(void **pages, int src_cnt, ssize_t len)
				497	{
				498	int src_off = 0;
				499	int xor_src_cnt = 0;
				500	void *dest = pages[src_cnt];
				501
				502	while(src_cnt > 0) {
				503	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				504	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				505
				506	src_cnt -= xor_src_cnt;
				507	src_off += xor_src_cnt;
				508	}
				509	}
				510
				511	/*
				512	* returns true if the bio list inside this rbio
				513	* covers an entire stripe (no rmw required).
				514	* Must be called with the bio list lock held, or
				515	* at a time when you know it is impossible to add
				516	* new bios into the list
				517	*/
				518	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				519	{
				520	unsigned long size = rbio->bio_list_bytes;
				521	int ret = 1;
				522
				523	if (size != rbio->nr_data * rbio->stripe_len)
				524	ret = 0;
				525
				526	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				527	return ret;
				528	}
				529
				530	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				531	{
				532	unsigned long flags;
				533	int ret;
				534
				535	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				536	ret = __rbio_is_full(rbio);
				537	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				538	return ret;
				539	}
				540
				541	/*
				542	* returns 1 if it is safe to merge two rbios together.
				543	* The merging is safe if the two rbios correspond to
				544	* the same stripe and if they are both going in the same
				545	* direction (read vs write), and if neither one is
				546	* locked for final IO
				547	*
				548	* The caller is responsible for locking such that
				549	* rmw_locked is safe to test
				550	*/
				551	static int rbio_can_merge(struct btrfs_raid_bio *last,
				552	struct btrfs_raid_bio *cur)
				553	{
				554	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				555	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				556	return 0;
				557
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	558	/*
				559	* we can't merge with cached rbios, since the
				560	* idea is that when we merge the destination
				561	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	562	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	563	* handle that.
				564	*/
				565	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				566	test_bit(RBIO_CACHE_BIT, &cur->flags))
				567	return 0;
				568
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	569	if (last->bbio->raid_map[0] !=
				570	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	571	return 0;
				572
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	573	/* we can't merge with different operations */
				574	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	575	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	576	/*
				577	* We've need read the full stripe from the drive.
				578	* check and repair the parity and write the new results.
				579	*
				580	* We're not allowed to add any new bios to the
				581	* bio list here, anyone else that wants to
				582	* change this stripe needs to do their own rmw.
				583	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	584	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	585	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	586
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	587	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	588	return 0;
				589
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	590	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				591	int fa = last->faila;
				592	int fb = last->failb;
				593	int cur_fa = cur->faila;
				594	int cur_fb = cur->failb;
				595
				596	if (last->faila >= last->failb) {
				597	fa = last->failb;
				598	fb = last->faila;
				599	}
				600
				601	if (cur->faila >= cur->failb) {
				602	cur_fa = cur->failb;
				603	cur_fb = cur->faila;
				604	}
				605
				606	if (fa != cur_fa \|\| fb != cur_fb)
				607	return 0;
				608	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	609	return 1;
				610	}
				611
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	612	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				613	int index)
				614	{
				615	return stripe * rbio->stripe_npages + index;
				616	}
				617
				618	/*
				619	* these are just the pages from the rbio array, not from anything
				620	* the FS sent down to us
				621	*/
				622	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				623	int index)
				624	{
				625	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				626	}
				627
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	628	/*
				629	* helper to index into the pstripe
				630	*/
				631	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				632	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	633	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	634	}
				635
				636	/*
				637	* helper to index into the qstripe, returns null
				638	* if there is no qstripe
				639	*/
				640	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				641	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	642	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	643	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	644	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	645	}
				646
				647	/*
				648	* The first stripe in the table for a logical address
				649	* has the lock. rbios are added in one of three ways:
				650	*
				651	* 1) Nobody has the stripe locked yet. The rbio is given
				652	* the lock and 0 is returned. The caller must start the IO
				653	* themselves.
				654	*
				655	* 2) Someone has the stripe locked, but we're able to merge
				656	* with the lock owner. The rbio is freed and the IO will
				657	* start automatically along with the existing rbio. 1 is returned.
				658	*
				659	* 3) Someone has the stripe locked, but we're not able to merge.
				660	* The rbio is added to the lock owner's plug list, or merged into
				661	* an rbio already on the plug list. When the lock owner unlocks,
				662	* the next rbio on the list is run and the IO is started automatically.
				663	* 1 is returned
				664	*
				665	* If we return 0, the caller still owns the rbio and must continue with
				666	* IO submission. If we return 1, the caller must assume the rbio has
				667	* already been freed.
				668	*/
				669	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				670	{
				671	int bucket = rbio_bucket(rbio);
				672	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				673	struct btrfs_raid_bio *cur;
				674	struct btrfs_raid_bio *pending;
				675	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	676	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	677	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	678	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	679
				680	spin_lock_irqsave(&h->lock, flags);
				681	list_for_each_entry(cur, &h->hash_list, hash_list) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	682	if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	683	spin_lock(&cur->bio_list_lock);
				684
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	685	/* can we steal this cached rbio's pages? */
				686	if (bio_list_empty(&cur->bio_list) &&
				687	list_empty(&cur->plug_list) &&
				688	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				689	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				690	list_del_init(&cur->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	691	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	692
				693	steal_rbio(cur, rbio);
				694	cache_drop = cur;
				695	spin_unlock(&cur->bio_list_lock);
				696
				697	goto lockit;
				698	}
				699
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	700	/* can we merge into the lock owner? */
				701	if (rbio_can_merge(cur, rbio)) {
				702	merge_rbio(cur, rbio);
				703	spin_unlock(&cur->bio_list_lock);
				704	freeit = rbio;
				705	ret = 1;
				706	goto out;
				707	}
				708
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	709
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	710	/*
				711	* we couldn't merge with the running
				712	* rbio, see if we can merge with the
				713	* pending ones. We don't have to
				714	* check for rmw_locked because there
				715	* is no way they are inside finish_rmw
				716	* right now
				717	*/
				718	list_for_each_entry(pending, &cur->plug_list,
				719	plug_list) {
				720	if (rbio_can_merge(pending, rbio)) {
				721	merge_rbio(pending, rbio);
				722	spin_unlock(&cur->bio_list_lock);
				723	freeit = rbio;
				724	ret = 1;
				725	goto out;
				726	}
				727	}
				728
				729	/* no merging, put us on the tail of the plug list,
				730	* our rbio will be started with the currently
				731	* running rbio unlocks
				732	*/
				733	list_add_tail(&rbio->plug_list, &cur->plug_list);
				734	spin_unlock(&cur->bio_list_lock);
				735	ret = 1;
				736	goto out;
				737	}
				738	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	739	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	740	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	741	list_add(&rbio->hash_list, &h->hash_list);
				742	out:
				743	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	744	if (cache_drop)
				745	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	746	if (freeit)
				747	__free_raid_bio(freeit);
				748	return ret;
				749	}
				750
				751	/*
				752	* called as rmw or parity rebuild is completed. If the plug list has more
				753	* rbios waiting for this stripe, the next one on the list will be started
				754	*/
				755	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				756	{
				757	int bucket;
				758	struct btrfs_stripe_hash *h;
				759	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	760	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	761
				762	bucket = rbio_bucket(rbio);
				763	h = rbio->fs_info->stripe_hash_table->table + bucket;
				764
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	765	if (list_empty(&rbio->plug_list))
				766	cache_rbio(rbio);
				767
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	768	spin_lock_irqsave(&h->lock, flags);
				769	spin_lock(&rbio->bio_list_lock);
				770
				771	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	772	/*
				773	* if we're still cached and there is no other IO
				774	* to perform, just leave this rbio here for others
				775	* to steal from later
				776	*/
				777	if (list_empty(&rbio->plug_list) &&
				778	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				779	keep_cache = 1;
				780	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				781	BUG_ON(!bio_list_empty(&rbio->bio_list));
				782	goto done;
				783	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	784
				785	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	786	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	787
				788	/*
				789	* we use the plug list to hold all the rbios
				790	* waiting for the chance to lock this stripe.
				791	* hand the lock over to one of them.
				792	*/
				793	if (!list_empty(&rbio->plug_list)) {
				794	struct btrfs_raid_bio *next;
				795	struct list_head *head = rbio->plug_list.next;
				796
				797	next = list_entry(head, struct btrfs_raid_bio,
				798	plug_list);
				799
				800	list_del_init(&rbio->plug_list);
				801
				802	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	803	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	804	spin_unlock(&rbio->bio_list_lock);
				805	spin_unlock_irqrestore(&h->lock, flags);
				806
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	807	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	808	async_read_rebuild(next);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	809	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				810	steal_rbio(rbio, next);
				811	async_read_rebuild(next);
				812	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	813	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	814	async_rmw_stripe(next);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	815	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				816	steal_rbio(rbio, next);
				817	async_scrub_parity(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	818	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	819
				820	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821	}
				822	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	823	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	824	spin_unlock(&rbio->bio_list_lock);
				825	spin_unlock_irqrestore(&h->lock, flags);
				826
				827	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	828	if (!keep_cache)
				829	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	830	}
				831
				832	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				833	{
				834	int i;
				835
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	836	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	837	return;
				838
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	839	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	840	WARN_ON(!list_empty(&rbio->hash_list));
				841	WARN_ON(!bio_list_empty(&rbio->bio_list));
				842
				843	for (i = 0; i < rbio->nr_pages; i++) {
				844	if (rbio->stripe_pages[i]) {
				845	__free_page(rbio->stripe_pages[i]);
				846	rbio->stripe_pages[i] = NULL;
				847	}
				848	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	849
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	850	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	851	kfree(rbio);
				852	}
				853
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	854	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	855	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	856	struct bio *next;
				857
				858	while (cur) {
				859	next = cur->bi_next;
				860	cur->bi_next = NULL;
				861	cur->bi_status = err;
				862	bio_endio(cur);
				863	cur = next;
				864	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	865	}
				866
				867	/*
				868	* this frees the rbio and runs through all the bios in the
				869	* bio_list and calls end_io on them
				870	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	871	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	872	{
				873	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	874	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	875
				876	if (rbio->generic_bio_cnt)
				877	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				878
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	879	/*
				880	* At this moment, rbio->bio_list is empty, however since rbio does not
				881	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				882	* hash list, rbio may be merged with others so that rbio->bio_list
				883	* becomes non-empty.
				884	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				885	* more and we can call bio_endio() on all queued bios.
				886	*/
				887	unlock_stripe(rbio);
				888	extra = bio_list_get(&rbio->bio_list);
				889	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	890
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	891	rbio_endio_bio_list(cur, err);
				892	if (extra)
				893	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	894	}
				895
				896	/*
				897	* end io function used by finish_rmw. When we finally
				898	* get here, we've written a full stripe
				899	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	900	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	901	{
				902	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	903	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	904	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	905
				906	if (err)
				907	fail_bio_stripe(rbio, bio);
				908
				909	bio_put(bio);
				910
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	911	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	912	return;
				913
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	914	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	915
				916	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	917	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				918	0 : rbio->bbio->max_errors;
				919	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	920	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	921
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	922	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	923	}
				924
				925	/*
				926	* the read/modify/write code wants to use the original bio for
				927	* any pages it included, and then use the rbio for everything
				928	* else. This function decides if a given index (stripe number)
				929	* and page number in that stripe fall inside the original bio
				930	* or the rbio.
				931	*
				932	* if you set bio_list_only, you'll get a NULL back for any ranges
				933	* that are outside the bio_list
				934	*
				935	* This doesn't take any refs on anything, you get a bare page pointer
				936	* and the caller must bump refs as required.
				937	*
				938	* You must call index_rbio_pages once before you can trust
				939	* the answers from this function.
				940	*/
				941	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				942	int index, int pagenr, int bio_list_only)
				943	{
				944	int chunk_page;
				945	struct page *p = NULL;
				946
				947	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				948
				949	spin_lock_irq(&rbio->bio_list_lock);
				950	p = rbio->bio_pages[chunk_page];
				951	spin_unlock_irq(&rbio->bio_list_lock);
				952
				953	if (p \|\| bio_list_only)
				954	return p;
				955
				956	return rbio->stripe_pages[chunk_page];
				957	}
				958
				959	/*
				960	* number of pages we need for the entire stripe across all the
				961	* drives
				962	*/
				963	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				964	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	965	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	966	}
				967
				968	/*
				969	* allocation and initial setup for the btrfs_raid_bio. Not
				970	* this does not allocate any pages for rbio->pages.
				971	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	972	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				973	struct btrfs_bio *bbio,
				974	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	975	{
				976	struct btrfs_raid_bio *rbio;
				977	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	978	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				979	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	980	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	981	void *p;
				982
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	983	rbio = kzalloc(sizeof(*rbio) +
				984	sizeof(rbio->stripe_pages) num_pages +
				985	sizeof(rbio->bio_pages) num_pages +
				986	sizeof(rbio->finish_pointers) real_stripes +
				987	sizeof(rbio->dbitmap) BITS_TO_LONGS(stripe_npages) +
				988	sizeof(rbio->finish_pbitmap)
				989	BITS_TO_LONGS(stripe_npages),
				990	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	991	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	992	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	993
				994	bio_list_init(&rbio->bio_list);
				995	INIT_LIST_HEAD(&rbio->plug_list);
				996	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	997	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	998	INIT_LIST_HEAD(&rbio->hash_list);
				999	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1000	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1001	rbio->stripe_len = stripe_len;
				1002	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1003	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1004	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1005	rbio->faila = -1;
				1006	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	1007	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1008	atomic_set(&rbio->error, 0);
				1009	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1010
				1011	/*
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1012	* the stripe_pages, bio_pages, etc arrays point to the extra
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1013	* memory we allocated past the end of the rbio
				1014	*/
				1015	p = rbio + 1;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1016	#define CONSUME_ALLOC(ptr, count) do { \
				1017	ptr = p; \
				1018	p = (unsigned char )p + sizeof((ptr)) * (count); \
				1019	} while (0)
				1020	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
				1021	CONSUME_ALLOC(rbio->bio_pages, num_pages);
				1022	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
				1023	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
				1024	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
				1025	#undef CONSUME_ALLOC
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1026
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1027	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1028	nr_data = real_stripes - 1;
				1029	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1030	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1031	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1032	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1033
				1034	rbio->nr_data = nr_data;
				1035	return rbio;
				1036	}
				1037
				1038	/* allocate pages for all the stripes in the bio, including parity */
				1039	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1040	{
				1041	int i;
				1042	struct page *page;
				1043
				1044	for (i = 0; i < rbio->nr_pages; i++) {
				1045	if (rbio->stripe_pages[i])
				1046	continue;
				1047	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1048	if (!page)
				1049	return -ENOMEM;
				1050	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1051	}
				1052	return 0;
				1053	}
				1054
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1055	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1056	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1057	{
				1058	int i;
				1059	struct page *page;
				1060
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1061	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1062
				1063	for (; i < rbio->nr_pages; i++) {
				1064	if (rbio->stripe_pages[i])
				1065	continue;
				1066	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1067	if (!page)
				1068	return -ENOMEM;
				1069	rbio->stripe_pages[i] = page;
				1070	}
				1071	return 0;
				1072	}
				1073
				1074	/*
				1075	* add a single page from a specific stripe into our list of bios for IO
				1076	* this will try to merge into existing bios if possible, and returns
				1077	* zero if all went well.
				1078	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1079	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1080	struct bio_list *bio_list,
				1081	struct page *page,
				1082	int stripe_nr,
				1083	unsigned long page_index,
				1084	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1085	{
				1086	struct bio *last = bio_list->tail;
				1087	u64 last_end = 0;
				1088	int ret;
				1089	struct bio *bio;
				1090	struct btrfs_bio_stripe *stripe;
				1091	u64 disk_start;
				1092
				1093	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1094	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1095
				1096	/* if the device is missing, just fail this stripe */
				1097	if (!stripe->dev->bdev)
				1098	return fail_rbio_index(rbio, stripe_nr);
				1099
				1100	/* see if we can add this page onto our existing bio */
				1101	if (last) {
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1102	last_end = (u64)last->bi_iter.bi_sector << 9;
				1103	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1104
				1105	/*
				1106	* we can't merge these if they are from different
				1107	* devices or if they are not contiguous
				1108	*/
				1109	if (last_end == disk_start && stripe->dev->bdev &&
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1110	!last->bi_status &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1111	last->bi_disk == stripe->dev->bdev->bd_disk &&
				1112	last->bi_partno == stripe->dev->bdev->bd_partno) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1113	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1114	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1115	return 0;
				1116	}
				1117	}
				1118
				1119	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1120	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1121	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1122	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1123	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1124
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1125	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1126	bio_list_add(bio_list, bio);
				1127	return 0;
				1128	}
				1129
				1130	/*
				1131	* while we're doing the read/modify/write cycle, we could
				1132	* have errors in reading pages off the disk. This checks
				1133	* for errors and if we're not able to read the page it'll
				1134	* trigger parity reconstruction. The rmw will be finished
				1135	* after we've reconstructed the failed stripes
				1136	*/
				1137	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1138	{
				1139	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1140	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1141	__raid56_parity_recover(rbio);
				1142	} else {
				1143	finish_rmw(rbio);
				1144	}
				1145	}
				1146
				1147	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1148	* helper function to walk our bio list and populate the bio_pages array with
				1149	* the result. This seems expensive, but it is faster than constantly
				1150	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1151	* reconstruction.
				1152	*
				1153	* This must be called before you trust the answers from page_in_rbio
				1154	*/
				1155	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1156	{
				1157	struct bio *bio;
				1158	u64 start;
				1159	unsigned long stripe_offset;
				1160	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1161
				1162	spin_lock_irq(&rbio->bio_list_lock);
				1163	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1164	struct bio_vec bvec;
				1165	struct bvec_iter iter;
				1166	int i = 0;
				1167
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1168	start = (u64)bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1169	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1170	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1171
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1172	if (bio_flagged(bio, BIO_CLONED))
				1173	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1174
				1175	bio_for_each_segment(bvec, bio, iter) {
				1176	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1177	i++;
				1178	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1179	}
				1180	spin_unlock_irq(&rbio->bio_list_lock);
				1181	}
				1182
				1183	/*
				1184	* this is called from one of two situations. We either
				1185	* have a full stripe from the higher layers, or we've read all
				1186	* the missing bits off disk.
				1187	*
				1188	* This will calculate the parity and then send down any
				1189	* changed blocks.
				1190	*/
				1191	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1192	{
				1193	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1194	void **pointers = rbio->finish_pointers;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1195	int nr_data = rbio->nr_data;
				1196	int stripe;
				1197	int pagenr;
				1198	int p_stripe = -1;
				1199	int q_stripe = -1;
				1200	struct bio_list bio_list;
				1201	struct bio *bio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1202	int ret;
				1203
				1204	bio_list_init(&bio_list);
				1205
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1206	if (rbio->real_stripes - rbio->nr_data == 1) {
				1207	p_stripe = rbio->real_stripes - 1;
				1208	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				1209	p_stripe = rbio->real_stripes - 2;
				1210	q_stripe = rbio->real_stripes - 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1211	} else {
				1212	BUG();
				1213	}
				1214
				1215	/* at this point we either have a full stripe,
				1216	* or we've read the full stripe from the drive.
				1217	* recalculate the parity and write the new results.
				1218	*
				1219	* We're not allowed to add any new bios to the
				1220	* bio list here, anyone else that wants to
				1221	* change this stripe needs to do their own rmw.
				1222	*/
				1223	spin_lock_irq(&rbio->bio_list_lock);
				1224	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1225	spin_unlock_irq(&rbio->bio_list_lock);
				1226
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1227	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1228
				1229	/*
				1230	* now that we've set rmw_locked, run through the
				1231	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1232	*
				1233	* We don't cache full rbios because we're assuming
				1234	* the higher layers are unlikely to use this area of
				1235	* the disk again soon. If they do use it again,
				1236	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1237	*/
				1238	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1239	if (!rbio_is_full(rbio))
				1240	cache_rbio_pages(rbio);
				1241	else
				1242	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1243
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1244	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1245	struct page *p;
				1246	/* first collect one page from each data stripe */
				1247	for (stripe = 0; stripe < nr_data; stripe++) {
				1248	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1249	pointers[stripe] = kmap(p);
				1250	}
				1251
				1252	/* then add the parity stripe */
				1253	p = rbio_pstripe_page(rbio, pagenr);
				1254	SetPageUptodate(p);
				1255	pointers[stripe++] = kmap(p);
				1256
				1257	if (q_stripe != -1) {
				1258
				1259	/*
				1260	* raid6, add the qstripe and call the
				1261	* library function to fill in our p/q
				1262	*/
				1263	p = rbio_qstripe_page(rbio, pagenr);
				1264	SetPageUptodate(p);
				1265	pointers[stripe++] = kmap(p);
				1266
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1267	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1268	pointers);
				1269	} else {
				1270	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1271	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1272	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1273	}
				1274
				1275
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1276	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1277	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1278	}
				1279
				1280	/*
				1281	* time to start writing. Make bios for everything from the
				1282	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1283	* everything else.
				1284	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1285	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1286	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1287	struct page *page;
				1288	if (stripe < rbio->nr_data) {
				1289	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1290	if (!page)
				1291	continue;
				1292	} else {
				1293	page = rbio_stripe_page(rbio, stripe, pagenr);
				1294	}
				1295
				1296	ret = rbio_add_io_page(rbio, &bio_list,
				1297	page, stripe, pagenr, rbio->stripe_len);
				1298	if (ret)
				1299	goto cleanup;
				1300	}
				1301	}
				1302
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1303	if (likely(!bbio->num_tgtdevs))
				1304	goto write_data;
				1305
				1306	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1307	if (!bbio->tgtdev_map[stripe])
				1308	continue;
				1309
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1310	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1311	struct page *page;
				1312	if (stripe < rbio->nr_data) {
				1313	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1314	if (!page)
				1315	continue;
				1316	} else {
				1317	page = rbio_stripe_page(rbio, stripe, pagenr);
				1318	}
				1319
				1320	ret = rbio_add_io_page(rbio, &bio_list, page,
				1321	rbio->bbio->tgtdev_map[stripe],
				1322	pagenr, rbio->stripe_len);
				1323	if (ret)
				1324	goto cleanup;
				1325	}
				1326	}
				1327
				1328	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1329	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1330	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1331
				1332	while (1) {
				1333	bio = bio_list_pop(&bio_list);
				1334	if (!bio)
				1335	break;
				1336
				1337	bio->bi_private = rbio;
				1338	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1339	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1340
				1341	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1342	}
				1343	return;
				1344
				1345	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1346	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1347
				1348	while ((bio = bio_list_pop(&bio_list)))
				1349	bio_put(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1350	}
				1351
				1352	/*
				1353	* helper to find the stripe number for a given bio. Used to figure out which
				1354	* stripe has failed. This expects the bio to correspond to a physical disk,
				1355	* so it looks up based on physical sector numbers.
				1356	*/
				1357	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1358	struct bio *bio)
				1359	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1360	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1361	u64 stripe_start;
				1362	int i;
				1363	struct btrfs_bio_stripe *stripe;
				1364
				1365	physical <<= 9;
				1366
				1367	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1368	stripe = &rbio->bbio->stripes[i];
				1369	stripe_start = stripe->physical;
				1370	if (physical >= stripe_start &&
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1371	physical < stripe_start + rbio->stripe_len &&
Dmitriy Gorokh	047fdea	2018-02-16 19:51:38 +0000	[diff] [blame]	1372	stripe->dev->bdev &&
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1373	bio->bi_disk == stripe->dev->bdev->bd_disk &&
				1374	bio->bi_partno == stripe->dev->bdev->bd_partno) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1375	return i;
				1376	}
				1377	}
				1378	return -1;
				1379	}
				1380
				1381	/*
				1382	* helper to find the stripe number for a given
				1383	* bio (before mapping). Used to figure out which stripe has
				1384	* failed. This looks up based on logical block numbers.
				1385	*/
				1386	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1387	struct bio *bio)
				1388	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1389	u64 logical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1390	u64 stripe_start;
				1391	int i;
				1392
				1393	logical <<= 9;
				1394
				1395	for (i = 0; i < rbio->nr_data; i++) {
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1396	stripe_start = rbio->bbio->raid_map[i];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1397	if (logical >= stripe_start &&
				1398	logical < stripe_start + rbio->stripe_len) {
				1399	return i;
				1400	}
				1401	}
				1402	return -1;
				1403	}
				1404
				1405	/*
				1406	* returns -EIO if we had too many failures
				1407	*/
				1408	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1409	{
				1410	unsigned long flags;
				1411	int ret = 0;
				1412
				1413	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1414
				1415	/* we already know this stripe is bad, move on */
				1416	if (rbio->faila == failed \|\| rbio->failb == failed)
				1417	goto out;
				1418
				1419	if (rbio->faila == -1) {
				1420	/* first failure on this rbio */
				1421	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1422	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1423	} else if (rbio->failb == -1) {
				1424	/* second failure on this rbio */
				1425	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1426	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1427	} else {
				1428	ret = -EIO;
				1429	}
				1430	out:
				1431	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1432
				1433	return ret;
				1434	}
				1435
				1436	/*
				1437	* helper to fail a stripe based on a physical disk
				1438	* bio.
				1439	*/
				1440	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1441	struct bio *bio)
				1442	{
				1443	int failed = find_bio_stripe(rbio, bio);
				1444
				1445	if (failed < 0)
				1446	return -EIO;
				1447
				1448	return fail_rbio_index(rbio, failed);
				1449	}
				1450
				1451	/*
				1452	* this sets each page in the bio uptodate. It should only be used on private
				1453	* rbio pages, nothing that comes in from the higher layers
				1454	*/
				1455	static void set_bio_pages_uptodate(struct bio *bio)
				1456	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1457	struct bio_vec *bvec;
				1458	int i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1459
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1460	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1461
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1462	bio_for_each_segment_all(bvec, bio, i)
				1463	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1464	}
				1465
				1466	/*
				1467	* end io for the read phase of the rmw cycle. All the bios here are physical
				1468	* stripe bios we've read from the disk so we can recalculate the parity of the
				1469	* stripe.
				1470	*
				1471	* This will usually kick off finish_rmw once all the bios are read in, but it
				1472	* may trigger parity reconstruction if we had any errors along the way
				1473	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1474	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1475	{
				1476	struct btrfs_raid_bio *rbio = bio->bi_private;
				1477
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1478	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1479	fail_bio_stripe(rbio, bio);
				1480	else
				1481	set_bio_pages_uptodate(bio);
				1482
				1483	bio_put(bio);
				1484
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1485	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1486	return;
				1487
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1488	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1489	goto cleanup;
				1490
				1491	/*
				1492	* this will normally call finish_rmw to start our write
				1493	* but if there are any failed stripes we'll reconstruct
				1494	* from parity first
				1495	*/
				1496	validate_rbio_for_rmw(rbio);
				1497	return;
				1498
				1499	cleanup:
				1500
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1501	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1502	}
				1503
				1504	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1505	{
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1506	btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL);
				1507	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1508	}
				1509
				1510	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1511	{
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1512	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				1513	read_rebuild_work, NULL, NULL);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1514
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1515	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1516	}
				1517
				1518	/*
				1519	* the stripe must be locked by the caller. It will
				1520	* unlock after all the writes are done
				1521	*/
				1522	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1523	{
				1524	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1525	struct bio_list bio_list;
				1526	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1527	int pagenr;
				1528	int stripe;
				1529	struct bio *bio;
				1530
				1531	bio_list_init(&bio_list);
				1532
				1533	ret = alloc_rbio_pages(rbio);
				1534	if (ret)
				1535	goto cleanup;
				1536
				1537	index_rbio_pages(rbio);
				1538
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1539	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1540	/*
				1541	* build a list of bios to read all the missing parts of this
				1542	* stripe
				1543	*/
				1544	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1545	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1546	struct page *page;
				1547	/*
				1548	* we want to find all the pages missing from
				1549	* the rbio and read them from the disk. If
				1550	* page_in_rbio finds a page in the bio list
				1551	* we don't need to read it off the stripe.
				1552	*/
				1553	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1554	if (page)
				1555	continue;
				1556
				1557	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1558	/*
				1559	* the bio cache may have handed us an uptodate
				1560	* page. If so, be happy and use it
				1561	*/
				1562	if (PageUptodate(page))
				1563	continue;
				1564
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1565	ret = rbio_add_io_page(rbio, &bio_list, page,
				1566	stripe, pagenr, rbio->stripe_len);
				1567	if (ret)
				1568	goto cleanup;
				1569	}
				1570	}
				1571
				1572	bios_to_read = bio_list_size(&bio_list);
				1573	if (!bios_to_read) {
				1574	/*
				1575	* this can happen if others have merged with
				1576	* us, it means there is nothing left to read.
				1577	* But if there are missing devices it may not be
				1578	* safe to do the full stripe write yet.
				1579	*/
				1580	goto finish;
				1581	}
				1582
				1583	/*
				1584	* the bbio may be freed once we submit the last bio. Make sure
				1585	* not to touch it after that
				1586	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1587	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1588	while (1) {
				1589	bio = bio_list_pop(&bio_list);
				1590	if (!bio)
				1591	break;
				1592
				1593	bio->bi_private = rbio;
				1594	bio->bi_end_io = raid_rmw_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1595	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1596
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1597	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1598
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1599	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1600	}
				1601	/* the actual write will happen once the reads are done */
				1602	return 0;
				1603
				1604	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1605	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1606
				1607	while ((bio = bio_list_pop(&bio_list)))
				1608	bio_put(bio);
				1609
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1610	return -EIO;
				1611
				1612	finish:
				1613	validate_rbio_for_rmw(rbio);
				1614	return 0;
				1615	}
				1616
				1617	/*
				1618	* if the upper layers pass in a full stripe, we thank them by only allocating
				1619	* enough pages to hold the parity, and sending it all down quickly.
				1620	*/
				1621	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1622	{
				1623	int ret;
				1624
				1625	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1626	if (ret) {
				1627	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1628	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1629	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1630
				1631	ret = lock_stripe_add(rbio);
				1632	if (ret == 0)
				1633	finish_rmw(rbio);
				1634	return 0;
				1635	}
				1636
				1637	/*
				1638	* partial stripe writes get handed over to async helpers.
				1639	* We're really hoping to merge a few more writes into this
				1640	* rbio before calculating new parity
				1641	*/
				1642	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1643	{
				1644	int ret;
				1645
				1646	ret = lock_stripe_add(rbio);
				1647	if (ret == 0)
				1648	async_rmw_stripe(rbio);
				1649	return 0;
				1650	}
				1651
				1652	/*
				1653	* sometimes while we were reading from the drive to
				1654	* recalculate parity, enough new bios come into create
				1655	* a full stripe. So we do a check here to see if we can
				1656	* go directly to finish_rmw
				1657	*/
				1658	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1659	{
				1660	/* head off into rmw land if we don't have a full stripe */
				1661	if (!rbio_is_full(rbio))
				1662	return partial_stripe_write(rbio);
				1663	return full_stripe_write(rbio);
				1664	}
				1665
				1666	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1667	* We use plugging call backs to collect full stripes.
				1668	* Any time we get a partial stripe write while plugged
				1669	* we collect it into a list. When the unplug comes down,
				1670	* we sort the list by logical block number and merge
				1671	* everything we can into the same rbios
				1672	*/
				1673	struct btrfs_plug_cb {
				1674	struct blk_plug_cb cb;
				1675	struct btrfs_fs_info *info;
				1676	struct list_head rbio_list;
				1677	struct btrfs_work work;
				1678	};
				1679
				1680	/*
				1681	* rbios on the plug list are sorted for easier merging.
				1682	*/
				1683	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1684	{
				1685	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1686	plug_list);
				1687	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1688	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1689	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1690	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1691
				1692	if (a_sector < b_sector)
				1693	return -1;
				1694	if (a_sector > b_sector)
				1695	return 1;
				1696	return 0;
				1697	}
				1698
				1699	static void run_plug(struct btrfs_plug_cb *plug)
				1700	{
				1701	struct btrfs_raid_bio *cur;
				1702	struct btrfs_raid_bio *last = NULL;
				1703
				1704	/*
				1705	* sort our plug list then try to merge
				1706	* everything we can in hopes of creating full
				1707	* stripes.
				1708	*/
				1709	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1710	while (!list_empty(&plug->rbio_list)) {
				1711	cur = list_entry(plug->rbio_list.next,
				1712	struct btrfs_raid_bio, plug_list);
				1713	list_del_init(&cur->plug_list);
				1714
				1715	if (rbio_is_full(cur)) {
				1716	/* we have a full stripe, send it down */
				1717	full_stripe_write(cur);
				1718	continue;
				1719	}
				1720	if (last) {
				1721	if (rbio_can_merge(last, cur)) {
				1722	merge_rbio(last, cur);
				1723	__free_raid_bio(cur);
				1724	continue;
				1725
				1726	}
				1727	__raid56_parity_write(last);
				1728	}
				1729	last = cur;
				1730	}
				1731	if (last) {
				1732	__raid56_parity_write(last);
				1733	}
				1734	kfree(plug);
				1735	}
				1736
				1737	/*
				1738	* if the unplug comes from schedule, we have to push the
				1739	* work off to a helper thread
				1740	*/
				1741	static void unplug_work(struct btrfs_work *work)
				1742	{
				1743	struct btrfs_plug_cb *plug;
				1744	plug = container_of(work, struct btrfs_plug_cb, work);
				1745	run_plug(plug);
				1746	}
				1747
				1748	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1749	{
				1750	struct btrfs_plug_cb *plug;
				1751	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1752
				1753	if (from_schedule) {
Liu Bo	9e0af23	2014-08-15 23:36:53 +0800	[diff] [blame]	1754	btrfs_init_work(&plug->work, btrfs_rmw_helper,
				1755	unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1756	btrfs_queue_work(plug->info->rmw_workers,
				1757	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1758	return;
				1759	}
				1760	run_plug(plug);
				1761	}
				1762
				1763	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1764	* our main entry point for writes from the rest of the FS.
				1765	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1766	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1767	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1768	{
				1769	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1770	struct btrfs_plug_cb *plug = NULL;
				1771	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1772	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1773
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1774	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1775	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1776	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1777	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1778	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1779	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1780	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1781	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1782
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1783	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1784	rbio->generic_bio_cnt = 1;
				1785
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1786	/*
				1787	* don't plug on full rbios, just get them out the door
				1788	* as quickly as we can
				1789	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1790	if (rbio_is_full(rbio)) {
				1791	ret = full_stripe_write(rbio);
				1792	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1793	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1794	return ret;
				1795	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1796
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1797	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1798	if (cb) {
				1799	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1800	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1801	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1802	INIT_LIST_HEAD(&plug->rbio_list);
				1803	}
				1804	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1805	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1806	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1807	ret = __raid56_parity_write(rbio);
				1808	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1809	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1810	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1811	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1812	}
				1813
				1814	/*
				1815	* all parity reconstruction happens here. We've read in everything
				1816	* we can find from the drives and this does the heavy lifting of
				1817	* sorting the good from the bad.
				1818	*/
				1819	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1820	{
				1821	int pagenr, stripe;
				1822	void **pointers;
				1823	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1824	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1825	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1826	int i;
				1827
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1828	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1829	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1830	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1831	goto cleanup_io;
				1832	}
				1833
				1834	faila = rbio->faila;
				1835	failb = rbio->failb;
				1836
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1837	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1838	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1839	spin_lock_irq(&rbio->bio_list_lock);
				1840	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1841	spin_unlock_irq(&rbio->bio_list_lock);
				1842	}
				1843
				1844	index_rbio_pages(rbio);
				1845
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1846	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1847	/*
				1848	* Now we just use bitmap to mark the horizontal stripes in
				1849	* which we have data when doing parity scrub.
				1850	*/
				1851	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1852	!test_bit(pagenr, rbio->dbitmap))
				1853	continue;
				1854
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1855	/* setup our array of pointers with pages
				1856	* from each stripe
				1857	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1858	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1859	/*
				1860	* if we're rebuilding a read, we have to use
				1861	* pages from the bio list
				1862	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1863	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1864	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1865	(stripe == faila \|\| stripe == failb)) {
				1866	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1867	} else {
				1868	page = rbio_stripe_page(rbio, stripe, pagenr);
				1869	}
				1870	pointers[stripe] = kmap(page);
				1871	}
				1872
				1873	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1874	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1875	/*
				1876	* single failure, rebuild from parity raid5
				1877	* style
				1878	*/
				1879	if (failb < 0) {
				1880	if (faila == rbio->nr_data) {
				1881	/*
				1882	* Just the P stripe has failed, without
				1883	* a bad data or Q stripe.
				1884	* TODO, we should redo the xor here.
				1885	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1886	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1887	goto cleanup;
				1888	}
				1889	/*
				1890	* a single failure in raid6 is rebuilt
				1891	* in the pstripe code below
				1892	*/
				1893	goto pstripe;
				1894	}
				1895
				1896	/* make sure our ps and qs are in order */
				1897	if (faila > failb) {
				1898	int tmp = failb;
				1899	failb = faila;
				1900	faila = tmp;
				1901	}
				1902
				1903	/* if the q stripe is failed, do a pstripe reconstruction
				1904	* from the xors.
				1905	* If both the q stripe and the P stripe are failed, we're
				1906	* here due to a crc mismatch and we can't give them the
				1907	* data they want
				1908	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1909	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1910	if (rbio->bbio->raid_map[faila] ==
				1911	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1912	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1913	goto cleanup;
				1914	}
				1915	/*
				1916	* otherwise we have one bad data stripe and
				1917	* a good P stripe. raid5!
				1918	*/
				1919	goto pstripe;
				1920	}
				1921
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1922	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1923	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1924	PAGE_SIZE, faila, pointers);
				1925	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1926	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1927	PAGE_SIZE, faila, failb,
				1928	pointers);
				1929	}
				1930	} else {
				1931	void *p;
				1932
				1933	/* rebuild from P stripe here (raid5 or raid6) */
				1934	BUG_ON(failb != -1);
				1935	pstripe:
				1936	/* Copy parity block into failed block to start with */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1937	copy_page(pointers[faila], pointers[rbio->nr_data]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1938
				1939	/* rearrange the pointer array */
				1940	p = pointers[faila];
				1941	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1942	pointers[stripe] = pointers[stripe + 1];
				1943	pointers[rbio->nr_data - 1] = p;
				1944
				1945	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1946	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1947	}
				1948	/* if we're doing this rebuild as part of an rmw, go through
				1949	* and set all of our private rbio pages in the
				1950	* failed stripes as uptodate. This way finish_rmw will
				1951	* know they can be trusted. If this was a read reconstruction,
				1952	* other endio functions will fiddle the uptodate bits
				1953	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1954	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1955	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1956	if (faila != -1) {
				1957	page = rbio_stripe_page(rbio, faila, i);
				1958	SetPageUptodate(page);
				1959	}
				1960	if (failb != -1) {
				1961	page = rbio_stripe_page(rbio, failb, i);
				1962	SetPageUptodate(page);
				1963	}
				1964	}
				1965	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1966	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1967	/*
				1968	* if we're rebuilding a read, we have to use
				1969	* pages from the bio list
				1970	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1971	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1972	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1973	(stripe == faila \|\| stripe == failb)) {
				1974	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1975	} else {
				1976	page = rbio_stripe_page(rbio, stripe, pagenr);
				1977	}
				1978	kunmap(page);
				1979	}
				1980	}
				1981
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1982	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1983	cleanup:
				1984	kfree(pointers);
				1985
				1986	cleanup_io:
Liu Bo	580c6ef	2018-03-22 09:20:11 +0800	[diff] [blame]	1987	/*
				1988	* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
				1989	* valid rbio which is consistent with ondisk content, thus such a
				1990	* valid rbio can be cached to avoid further disk reads.
				1991	*/
				1992	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1993	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1994	/*
				1995	* - In case of two failures, where rbio->failb != -1:
				1996	*
				1997	* Do not cache this rbio since the above read reconstruction
				1998	* (raid6_datap_recov() or raid6_2data_recov()) may have
				1999	* changed some content of stripes which are not identical to
				2000	* on-disk content any more, otherwise, a later write/recover
				2001	* may steal stripe_pages from this rbio and end up with
				2002	* corruptions or rebuild failures.
				2003	*
				2004	* - In case of single failure, where rbio->failb == -1:
				2005	*
				2006	* Cache this rbio iff the above read reconstruction is
				2007	* excuted without problems.
				2008	*/
				2009	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2010	cache_rbio_pages(rbio);
				2011	else
				2012	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2013
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2014	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2015	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2016	rbio->faila = -1;
				2017	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2018
				2019	if (rbio->operation == BTRFS_RBIO_WRITE)
				2020	finish_rmw(rbio);
				2021	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				2022	finish_parity_scrub(rbio, 0);
				2023	else
				2024	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2025	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2026	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2027	}
				2028	}
				2029
				2030	/*
				2031	* This is called only for stripes we've read from disk to
				2032	* reconstruct the parity.
				2033	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2034	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2035	{
				2036	struct btrfs_raid_bio *rbio = bio->bi_private;
				2037
				2038	/*
				2039	* we only read stripe pages off the disk, set them
				2040	* up to date if there were no errors
				2041	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2042	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2043	fail_bio_stripe(rbio, bio);
				2044	else
				2045	set_bio_pages_uptodate(bio);
				2046	bio_put(bio);
				2047
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2048	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2049	return;
				2050
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2051	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2052	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2053	else
				2054	__raid_recover_end_io(rbio);
				2055	}
				2056
				2057	/*
				2058	* reads everything we need off the disk to reconstruct
				2059	* the parity. endio handlers trigger final reconstruction
				2060	* when the IO is done.
				2061	*
				2062	* This is used both for reads from the higher layers and for
				2063	* parity construction required to finish a rmw cycle.
				2064	*/
				2065	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2066	{
				2067	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2068	struct bio_list bio_list;
				2069	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2070	int pagenr;
				2071	int stripe;
				2072	struct bio *bio;
				2073
				2074	bio_list_init(&bio_list);
				2075
				2076	ret = alloc_rbio_pages(rbio);
				2077	if (ret)
				2078	goto cleanup;
				2079
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2080	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2081
				2082	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2083	* read everything that hasn't failed. Thanks to the
				2084	* stripe cache, it is possible that some or all of these
				2085	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2086	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2087	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2088	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2089	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2090	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2091	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2092
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2093	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2094	struct page *p;
				2095
				2096	/*
				2097	* the rmw code may have already read this
				2098	* page in
				2099	*/
				2100	p = rbio_stripe_page(rbio, stripe, pagenr);
				2101	if (PageUptodate(p))
				2102	continue;
				2103
				2104	ret = rbio_add_io_page(rbio, &bio_list,
				2105	rbio_stripe_page(rbio, stripe, pagenr),
				2106	stripe, pagenr, rbio->stripe_len);
				2107	if (ret < 0)
				2108	goto cleanup;
				2109	}
				2110	}
				2111
				2112	bios_to_read = bio_list_size(&bio_list);
				2113	if (!bios_to_read) {
				2114	/*
				2115	* we might have no bios to read just because the pages
				2116	* were up to date, or we might have no bios to read because
				2117	* the devices were gone.
				2118	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2119	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2120	__raid_recover_end_io(rbio);
				2121	goto out;
				2122	} else {
				2123	goto cleanup;
				2124	}
				2125	}
				2126
				2127	/*
				2128	* the bbio may be freed once we submit the last bio. Make sure
				2129	* not to touch it after that
				2130	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2131	atomic_set(&rbio->stripes_pending, bios_to_read);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2132	while (1) {
				2133	bio = bio_list_pop(&bio_list);
				2134	if (!bio)
				2135	break;
				2136
				2137	bio->bi_private = rbio;
				2138	bio->bi_end_io = raid_recover_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2139	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2140
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2141	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2142
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2143	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2144	}
				2145	out:
				2146	return 0;
				2147
				2148	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2149	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2150	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2151	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2152
				2153	while ((bio = bio_list_pop(&bio_list)))
				2154	bio_put(bio);
				2155
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2156	return -EIO;
				2157	}
				2158
				2159	/*
				2160	* the main entry point for reads from the higher layers. This
				2161	* is really only called when the normal read path had a failure,
				2162	* so we assume the bio they send down corresponds to a failed part
				2163	* of the drive.
				2164	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2165	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2166	struct btrfs_bio *bbio, u64 stripe_len,
				2167	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2168	{
				2169	struct btrfs_raid_bio *rbio;
				2170	int ret;
				2171
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2172	if (generic_io) {
				2173	ASSERT(bbio->mirror_num == mirror_num);
				2174	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2175	}
				2176
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2177	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2178	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2179	if (generic_io)
				2180	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2181	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2182	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2183
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2184	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2185	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2186	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2187
				2188	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2189	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2190	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2191	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
				2192	__func__, (u64)bio->bi_iter.bi_sector << 9,
				2193	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2194	if (generic_io)
				2195	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2196	kfree(rbio);
				2197	return -EIO;
				2198	}
				2199
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2200	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2201	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2202	rbio->generic_bio_cnt = 1;
				2203	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2204	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2205	}
				2206
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2207	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2208	* Loop retry:
				2209	* for 'mirror == 2', reconstruct from all other stripes.
				2210	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2211	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2212	if (mirror_num > 2) {
				2213	/*
				2214	* 'mirror == 3' is to fail the p stripe and
				2215	* reconstruct from the q stripe. 'mirror > 3' is to
				2216	* fail a data stripe and reconstruct from p+q stripe.
				2217	*/
				2218	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2219	ASSERT(rbio->failb > 0);
				2220	if (rbio->failb <= rbio->faila)
				2221	rbio->failb--;
				2222	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2223
				2224	ret = lock_stripe_add(rbio);
				2225
				2226	/*
				2227	* __raid56_parity_recover will end the bio with
				2228	* any errors it hits. We don't want to return
				2229	* its error value up the stack because our caller
				2230	* will end up calling bio_endio with any nonzero
				2231	* return
				2232	*/
				2233	if (ret == 0)
				2234	__raid56_parity_recover(rbio);
				2235	/*
				2236	* our rbio has been added to the list of
				2237	* rbios that will be handled after the
				2238	* currently lock owner is done
				2239	*/
				2240	return 0;
				2241
				2242	}
				2243
				2244	static void rmw_work(struct btrfs_work *work)
				2245	{
				2246	struct btrfs_raid_bio *rbio;
				2247
				2248	rbio = container_of(work, struct btrfs_raid_bio, work);
				2249	raid56_rmw_stripe(rbio);
				2250	}
				2251
				2252	static void read_rebuild_work(struct btrfs_work *work)
				2253	{
				2254	struct btrfs_raid_bio *rbio;
				2255
				2256	rbio = container_of(work, struct btrfs_raid_bio, work);
				2257	__raid56_parity_recover(rbio);
				2258	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2259
				2260	/*
				2261	* The following code is used to scrub/replace the parity stripe
				2262	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2263	* Caller must have already increased bio_counter for getting @bbio.
				2264	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2265	* Note: We need make sure all the pages that add into the scrub/replace
				2266	* raid bio are correct and not be changed during the scrub/replace. That
				2267	* is those pages just hold metadata or file data with checksum.
				2268	*/
				2269
				2270	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2271	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2272	struct btrfs_bio *bbio, u64 stripe_len,
				2273	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2274	unsigned long *dbitmap, int stripe_nsectors)
				2275	{
				2276	struct btrfs_raid_bio *rbio;
				2277	int i;
				2278
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2279	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2280	if (IS_ERR(rbio))
				2281	return NULL;
				2282	bio_list_add(&rbio->bio_list, bio);
				2283	/*
				2284	* This is a special bio which is used to hold the completion handler
				2285	* and make the scrub rbio is similar to the other types
				2286	*/
				2287	ASSERT(!bio->bi_iter.bi_size);
				2288	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2289
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2290	/*
				2291	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2292	* to the end position, so this search can start from the first parity
				2293	* stripe.
				2294	*/
				2295	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2296	if (bbio->stripes[i].dev == scrub_dev) {
				2297	rbio->scrubp = i;
				2298	break;
				2299	}
				2300	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2301	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2302
				2303	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2304	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2305	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2306	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2307
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2308	/*
				2309	* We have already increased bio_counter when getting bbio, record it
				2310	* so we can free it at rbio_orig_end_io().
				2311	*/
				2312	rbio->generic_bio_cnt = 1;
				2313
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2314	return rbio;
				2315	}
				2316
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2317	/* Used for both parity scrub and missing. */
				2318	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2319	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2320	{
				2321	int stripe_offset;
				2322	int index;
				2323
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2324	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2325	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2326	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2327	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2328	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2329	rbio->bio_pages[index] = page;
				2330	}
				2331
				2332	/*
				2333	* We just scrub the parity that we have correct data on the same horizontal,
				2334	* so we needn't allocate all pages for all the stripes.
				2335	*/
				2336	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2337	{
				2338	int i;
				2339	int bit;
				2340	int index;
				2341	struct page *page;
				2342
				2343	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2344	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2345	index = i * rbio->stripe_npages + bit;
				2346	if (rbio->stripe_pages[index])
				2347	continue;
				2348
				2349	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2350	if (!page)
				2351	return -ENOMEM;
				2352	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2353	}
				2354	}
				2355	return 0;
				2356	}
				2357
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2358	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2359	int need_check)
				2360	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2361	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	2362	void **pointers = rbio->finish_pointers;
				2363	unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2364	int nr_data = rbio->nr_data;
				2365	int stripe;
				2366	int pagenr;
				2367	int p_stripe = -1;
				2368	int q_stripe = -1;
				2369	struct page *p_page = NULL;
				2370	struct page *q_page = NULL;
				2371	struct bio_list bio_list;
				2372	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2373	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2374	int ret;
				2375
				2376	bio_list_init(&bio_list);
				2377
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2378	if (rbio->real_stripes - rbio->nr_data == 1) {
				2379	p_stripe = rbio->real_stripes - 1;
				2380	} else if (rbio->real_stripes - rbio->nr_data == 2) {
				2381	p_stripe = rbio->real_stripes - 2;
				2382	q_stripe = rbio->real_stripes - 1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2383	} else {
				2384	BUG();
				2385	}
				2386
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2387	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2388	is_replace = 1;
				2389	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2390	}
				2391
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2392	/*
				2393	* Because the higher layers(scrubber) are unlikely to
				2394	* use this area of the disk again soon, so don't cache
				2395	* it.
				2396	*/
				2397	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2398
				2399	if (!need_check)
				2400	goto writeback;
				2401
				2402	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2403	if (!p_page)
				2404	goto cleanup;
				2405	SetPageUptodate(p_page);
				2406
				2407	if (q_stripe != -1) {
				2408	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2409	if (!q_page) {
				2410	__free_page(p_page);
				2411	goto cleanup;
				2412	}
				2413	SetPageUptodate(q_page);
				2414	}
				2415
				2416	atomic_set(&rbio->error, 0);
				2417
				2418	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2419	struct page *p;
				2420	void *parity;
				2421	/* first collect one page from each data stripe */
				2422	for (stripe = 0; stripe < nr_data; stripe++) {
				2423	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2424	pointers[stripe] = kmap(p);
				2425	}
				2426
				2427	/* then add the parity stripe */
				2428	pointers[stripe++] = kmap(p_page);
				2429
				2430	if (q_stripe != -1) {
				2431
				2432	/*
				2433	* raid6, add the qstripe and call the
				2434	* library function to fill in our p/q
				2435	*/
				2436	pointers[stripe++] = kmap(q_page);
				2437
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2438	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2439	pointers);
				2440	} else {
				2441	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2442	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2443	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2444	}
				2445
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2446	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2447	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2448	parity = kmap(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2449	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2450	copy_page(parity, pointers[rbio->scrubp]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2451	else
				2452	/* Parity is right, needn't writeback */
				2453	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2454	kunmap(p);
				2455
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2456	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2457	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2458	}
				2459
				2460	__free_page(p_page);
				2461	if (q_page)
				2462	__free_page(q_page);
				2463
				2464	writeback:
				2465	/*
				2466	* time to start writing. Make bios for everything from the
				2467	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2468	* everything else.
				2469	*/
				2470	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2471	struct page *page;
				2472
				2473	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2474	ret = rbio_add_io_page(rbio, &bio_list,
				2475	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2476	if (ret)
				2477	goto cleanup;
				2478	}
				2479
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2480	if (!is_replace)
				2481	goto submit_write;
				2482
				2483	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2484	struct page *page;
				2485
				2486	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2487	ret = rbio_add_io_page(rbio, &bio_list, page,
				2488	bbio->tgtdev_map[rbio->scrubp],
				2489	pagenr, rbio->stripe_len);
				2490	if (ret)
				2491	goto cleanup;
				2492	}
				2493
				2494	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2495	nr_data = bio_list_size(&bio_list);
				2496	if (!nr_data) {
				2497	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2498	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2499	return;
				2500	}
				2501
				2502	atomic_set(&rbio->stripes_pending, nr_data);
				2503
				2504	while (1) {
				2505	bio = bio_list_pop(&bio_list);
				2506	if (!bio)
				2507	break;
				2508
				2509	bio->bi_private = rbio;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	2510	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2511	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2512
				2513	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2514	}
				2515	return;
				2516
				2517	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2518	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2519
				2520	while ((bio = bio_list_pop(&bio_list)))
				2521	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2522	}
				2523
				2524	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2525	{
				2526	if (stripe >= 0 && stripe < rbio->nr_data)
				2527	return 1;
				2528	return 0;
				2529	}
				2530
				2531	/*
				2532	* While we're doing the parity check and repair, we could have errors
				2533	* in reading pages off the disk. This checks for errors and if we're
				2534	* not able to read the page it'll trigger parity reconstruction. The
				2535	* parity scrub will be finished after we've reconstructed the failed
				2536	* stripes
				2537	*/
				2538	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2539	{
				2540	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2541	goto cleanup;
				2542
				2543	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2544	int dfail = 0, failp = -1;
				2545
				2546	if (is_data_stripe(rbio, rbio->faila))
				2547	dfail++;
				2548	else if (is_parity_stripe(rbio->faila))
				2549	failp = rbio->faila;
				2550
				2551	if (is_data_stripe(rbio, rbio->failb))
				2552	dfail++;
				2553	else if (is_parity_stripe(rbio->failb))
				2554	failp = rbio->failb;
				2555
				2556	/*
				2557	* Because we can not use a scrubbing parity to repair
				2558	* the data, so the capability of the repair is declined.
				2559	* (In the case of RAID5, we can not repair anything)
				2560	*/
				2561	if (dfail > rbio->bbio->max_errors - 1)
				2562	goto cleanup;
				2563
				2564	/*
				2565	* If all data is good, only parity is correctly, just
				2566	* repair the parity.
				2567	*/
				2568	if (dfail == 0) {
				2569	finish_parity_scrub(rbio, 0);
				2570	return;
				2571	}
				2572
				2573	/*
				2574	* Here means we got one corrupted data stripe and one
				2575	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2576	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2577	* the data, or we can not repair the data stripe.
				2578	*/
				2579	if (failp != rbio->scrubp)
				2580	goto cleanup;
				2581
				2582	__raid_recover_end_io(rbio);
				2583	} else {
				2584	finish_parity_scrub(rbio, 1);
				2585	}
				2586	return;
				2587
				2588	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2589	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2590	}
				2591
				2592	/*
				2593	* end io for the read phase of the rmw cycle. All the bios here are physical
				2594	* stripe bios we've read from the disk so we can recalculate the parity of the
				2595	* stripe.
				2596	*
				2597	* This will usually kick off finish_rmw once all the bios are read in, but it
				2598	* may trigger parity reconstruction if we had any errors along the way
				2599	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2600	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2601	{
				2602	struct btrfs_raid_bio *rbio = bio->bi_private;
				2603
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2604	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2605	fail_bio_stripe(rbio, bio);
				2606	else
				2607	set_bio_pages_uptodate(bio);
				2608
				2609	bio_put(bio);
				2610
				2611	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2612	return;
				2613
				2614	/*
				2615	* this will normally call finish_rmw to start our write
				2616	* but if there are any failed stripes we'll reconstruct
				2617	* from parity first
				2618	*/
				2619	validate_rbio_for_parity_scrub(rbio);
				2620	}
				2621
				2622	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2623	{
				2624	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2625	struct bio_list bio_list;
				2626	int ret;
				2627	int pagenr;
				2628	int stripe;
				2629	struct bio *bio;
				2630
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2631	bio_list_init(&bio_list);
				2632
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2633	ret = alloc_rbio_essential_pages(rbio);
				2634	if (ret)
				2635	goto cleanup;
				2636
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2637	atomic_set(&rbio->error, 0);
				2638	/*
				2639	* build a list of bios to read all the missing parts of this
				2640	* stripe
				2641	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2642	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2643	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2644	struct page *page;
				2645	/*
				2646	* we want to find all the pages missing from
				2647	* the rbio and read them from the disk. If
				2648	* page_in_rbio finds a page in the bio list
				2649	* we don't need to read it off the stripe.
				2650	*/
				2651	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2652	if (page)
				2653	continue;
				2654
				2655	page = rbio_stripe_page(rbio, stripe, pagenr);
				2656	/*
				2657	* the bio cache may have handed us an uptodate
				2658	* page. If so, be happy and use it
				2659	*/
				2660	if (PageUptodate(page))
				2661	continue;
				2662
				2663	ret = rbio_add_io_page(rbio, &bio_list, page,
				2664	stripe, pagenr, rbio->stripe_len);
				2665	if (ret)
				2666	goto cleanup;
				2667	}
				2668	}
				2669
				2670	bios_to_read = bio_list_size(&bio_list);
				2671	if (!bios_to_read) {
				2672	/*
				2673	* this can happen if others have merged with
				2674	* us, it means there is nothing left to read.
				2675	* But if there are missing devices it may not be
				2676	* safe to do the full stripe write yet.
				2677	*/
				2678	goto finish;
				2679	}
				2680
				2681	/*
				2682	* the bbio may be freed once we submit the last bio. Make sure
				2683	* not to touch it after that
				2684	*/
				2685	atomic_set(&rbio->stripes_pending, bios_to_read);
				2686	while (1) {
				2687	bio = bio_list_pop(&bio_list);
				2688	if (!bio)
				2689	break;
				2690
				2691	bio->bi_private = rbio;
				2692	bio->bi_end_io = raid56_parity_scrub_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2693	bio->bi_opf = REQ_OP_READ;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2694
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2695	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2696
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2697	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2698	}
				2699	/* the actual write will happen once the reads are done */
				2700	return;
				2701
				2702	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2703	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2704
				2705	while ((bio = bio_list_pop(&bio_list)))
				2706	bio_put(bio);
				2707
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2708	return;
				2709
				2710	finish:
				2711	validate_rbio_for_parity_scrub(rbio);
				2712	}
				2713
				2714	static void scrub_parity_work(struct btrfs_work *work)
				2715	{
				2716	struct btrfs_raid_bio *rbio;
				2717
				2718	rbio = container_of(work, struct btrfs_raid_bio, work);
				2719	raid56_parity_scrub_stripe(rbio);
				2720	}
				2721
				2722	static void async_scrub_parity(struct btrfs_raid_bio *rbio)
				2723	{
				2724	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
				2725	scrub_parity_work, NULL, NULL);
				2726
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2727	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2728	}
				2729
				2730	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2731	{
				2732	if (!lock_stripe_add(rbio))
				2733	async_scrub_parity(rbio);
				2734	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2735
				2736	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2737
				2738	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2739	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2740	struct btrfs_bio *bbio, u64 length)
				2741	{
				2742	struct btrfs_raid_bio *rbio;
				2743
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2744	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2745	if (IS_ERR(rbio))
				2746	return NULL;
				2747
				2748	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2749	bio_list_add(&rbio->bio_list, bio);
				2750	/*
				2751	* This is a special bio which is used to hold the completion handler
				2752	* and make the scrub rbio is similar to the other types
				2753	*/
				2754	ASSERT(!bio->bi_iter.bi_size);
				2755
				2756	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2757	if (rbio->faila == -1) {
				2758	BUG();
				2759	kfree(rbio);
				2760	return NULL;
				2761	}
				2762
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2763	/*
				2764	* When we get bbio, we have already increased bio_counter, record it
				2765	* so we can free it at rbio_orig_end_io()
				2766	*/
				2767	rbio->generic_bio_cnt = 1;
				2768
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2769	return rbio;
				2770	}
				2771
/*
 * Submit a missing-device rebuild rbio.  The rebuild is only kicked off
 * here when lock_stripe_add() returns 0; on a non-zero return nothing more
 * is done in this function.
 */
void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
{
	if (lock_stripe_add(rbio))
		return;

	async_read_rebuild(rbio);
}