Blame - fs/btrfs/scrub.c - SHIFTPHONES/kernel/shift/mainline

blob: 567e148caca2689a162a17bc59f1135cd329ba5a [file] [log] [blame]

Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1	/*
				2	* Copyright (C) 2011 STRATO. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 02111-1307, USA.
				17	*/
				18
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	19	#include <linux/blkdev.h>
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	20	#include <linux/ratelimit.h>
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	21	#include "ctree.h"
				22	#include "volumes.h"
				23	#include "disk-io.h"
				24	#include "ordered-data.h"
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	25	#include "transaction.h"
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	26	#include "backref.h"
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	27	#include "extent_io.h"
Stefan Behrens	21adbd5	2011-11-09 13:44:05 +0100	[diff] [blame^]	28	#include "check-integrity.h"
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	29
				30	/*
				31	* This is only the first step towards a full-featured scrub. It reads all
				32	* extents and super blocks and verifies the checksums. In case a bad checksum
				33	* is found or the extent cannot be read, good data will be written back if
				34	* any can be found.
				35	*
				36	* Future enhancements:
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	37	* - In case an unrepairable extent is encountered, track which files are
				38	* affected and report them
				39	* - In case of a read error on files with nodatasum, map the file and read
				40	* the extent to trigger a writeback of the good copy
				41	* - track and record media errors, throw out bad devices
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	42	* - add a mode to also read unallocated space
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	43	*/
				44
				45	struct scrub_bio;
				46	struct scrub_page;
				47	struct scrub_dev;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	48	static void scrub_bio_end_io(struct bio *bio, int err);
				49	static void scrub_checksum(struct btrfs_work *work);
				50	static int scrub_checksum_data(struct scrub_dev *sdev,
				51	struct scrub_page spag, void buffer);
				52	static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				53	struct scrub_page *spag, u64 logical,
				54	void *buffer);
				55	static int scrub_checksum_super(struct scrub_bio sbio, void buffer);
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	56	static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
				57	static void scrub_fixup_end_io(struct bio *bio, int err);
				58	static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
				59	struct page *page);
				60	static void scrub_fixup(struct scrub_bio *sbio, int ix);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	61
				62	#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
				63	#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
				64
				65	struct scrub_page {
				66	u64 flags; /* extent flags */
				67	u64 generation;
Jan Schmidt	e12fa9c	2011-06-17 15:55:21 +0200	[diff] [blame]	68	int mirror_num;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	69	int have_csum;
				70	u8 csum[BTRFS_CSUM_SIZE];
				71	};
				72
				73	struct scrub_bio {
				74	int index;
				75	struct scrub_dev *sdev;
				76	struct bio *bio;
				77	int err;
				78	u64 logical;
				79	u64 physical;
				80	struct scrub_page spag[SCRUB_PAGES_PER_BIO];
				81	u64 count;
				82	int next_free;
				83	struct btrfs_work work;
				84	};
				85
				86	struct scrub_dev {
				87	struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
				88	struct btrfs_device *dev;
				89	int first_free;
				90	int curr;
				91	atomic_t in_flight;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	92	atomic_t fixup_cnt;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	93	spinlock_t list_lock;
				94	wait_queue_head_t list_wait;
				95	u16 csum_size;
				96	struct list_head csum_list;
				97	atomic_t cancel_req;
Arne Jansen	8628764	2011-03-23 16:34:19 +0100	[diff] [blame]	98	int readonly;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	99	/*
				100	* statistics
				101	*/
				102	struct btrfs_scrub_progress stat;
				103	spinlock_t stat_lock;
				104	};
				105
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	106	struct scrub_fixup_nodatasum {
				107	struct scrub_dev *sdev;
				108	u64 logical;
				109	struct btrfs_root *root;
				110	struct btrfs_work work;
				111	int mirror_num;
				112	};
				113
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	114	struct scrub_warning {
				115	struct btrfs_path *path;
				116	u64 extent_item_size;
				117	char *scratch_buf;
				118	char *msg_buf;
				119	const char *errstr;
				120	sector_t sector;
				121	u64 logical;
				122	struct btrfs_device *dev;
				123	int msg_bufsize;
				124	int scratch_bufsize;
				125	};
				126
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	127	static void scrub_free_csums(struct scrub_dev *sdev)
				128	{
				129	while (!list_empty(&sdev->csum_list)) {
				130	struct btrfs_ordered_sum *sum;
				131	sum = list_first_entry(&sdev->csum_list,
				132	struct btrfs_ordered_sum, list);
				133	list_del(&sum->list);
				134	kfree(sum);
				135	}
				136	}
				137
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	138	static void scrub_free_bio(struct bio *bio)
				139	{
				140	int i;
				141	struct page *last_page = NULL;
				142
				143	if (!bio)
				144	return;
				145
				146	for (i = 0; i < bio->bi_vcnt; ++i) {
				147	if (bio->bi_io_vec[i].bv_page == last_page)
				148	continue;
				149	last_page = bio->bi_io_vec[i].bv_page;
				150	__free_page(last_page);
				151	}
				152	bio_put(bio);
				153	}
				154
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	155	static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
				156	{
				157	int i;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	158
				159	if (!sdev)
				160	return;
				161
				162	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
				163	struct scrub_bio *sbio = sdev->bios[i];
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	164
				165	if (!sbio)
				166	break;
				167
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	168	scrub_free_bio(sbio->bio);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	169	kfree(sbio);
				170	}
				171
				172	scrub_free_csums(sdev);
				173	kfree(sdev);
				174	}
				175
				176	static noinline_for_stack
				177	struct scrub_dev scrub_setup_dev(struct btrfs_device dev)
				178	{
				179	struct scrub_dev *sdev;
				180	int i;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	181	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
				182
				183	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
				184	if (!sdev)
				185	goto nomem;
				186	sdev->dev = dev;
				187	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	188	struct scrub_bio *sbio;
				189
				190	sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
				191	if (!sbio)
				192	goto nomem;
				193	sdev->bios[i] = sbio;
				194
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	195	sbio->index = i;
				196	sbio->sdev = sdev;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	197	sbio->count = 0;
				198	sbio->work.func = scrub_checksum;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	199
				200	if (i != SCRUB_BIOS_PER_DEV-1)
				201	sdev->bios[i]->next_free = i + 1;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	202	else
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	203	sdev->bios[i]->next_free = -1;
				204	}
				205	sdev->first_free = 0;
				206	sdev->curr = -1;
				207	atomic_set(&sdev->in_flight, 0);
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	208	atomic_set(&sdev->fixup_cnt, 0);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	209	atomic_set(&sdev->cancel_req, 0);
David Sterba	6c41761	2011-04-13 15:41:04 +0200	[diff] [blame]	210	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	211	INIT_LIST_HEAD(&sdev->csum_list);
				212
				213	spin_lock_init(&sdev->list_lock);
				214	spin_lock_init(&sdev->stat_lock);
				215	init_waitqueue_head(&sdev->list_wait);
				216	return sdev;
				217
				218	nomem:
				219	scrub_free_dev(sdev);
				220	return ERR_PTR(-ENOMEM);
				221	}
				222
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	223	static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
				224	{
				225	u64 isize;
				226	u32 nlink;
				227	int ret;
				228	int i;
				229	struct extent_buffer *eb;
				230	struct btrfs_inode_item *inode_item;
				231	struct scrub_warning *swarn = ctx;
				232	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
				233	struct inode_fs_paths *ipath = NULL;
				234	struct btrfs_root *local_root;
				235	struct btrfs_key root_key;
				236
				237	root_key.objectid = root;
				238	root_key.type = BTRFS_ROOT_ITEM_KEY;
				239	root_key.offset = (u64)-1;
				240	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
				241	if (IS_ERR(local_root)) {
				242	ret = PTR_ERR(local_root);
				243	goto err;
				244	}
				245
				246	ret = inode_item_info(inum, 0, local_root, swarn->path);
				247	if (ret) {
				248	btrfs_release_path(swarn->path);
				249	goto err;
				250	}
				251
				252	eb = swarn->path->nodes[0];
				253	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
				254	struct btrfs_inode_item);
				255	isize = btrfs_inode_size(eb, inode_item);
				256	nlink = btrfs_inode_nlink(eb, inode_item);
				257	btrfs_release_path(swarn->path);
				258
				259	ipath = init_ipath(4096, local_root, swarn->path);
Dan Carpenter	26bdef5	2011-11-16 11:28:01 +0300	[diff] [blame]	260	if (IS_ERR(ipath)) {
				261	ret = PTR_ERR(ipath);
				262	ipath = NULL;
				263	goto err;
				264	}
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	265	ret = paths_from_inode(inum, ipath);
				266
				267	if (ret < 0)
				268	goto err;
				269
				270	/*
				271	* we deliberately ignore the bit ipath might have been too small to
				272	* hold all of the paths here
				273	*/
				274	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
				275	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
				276	"%s, sector %llu, root %llu, inode %llu, offset %llu, "
				277	"length %llu, links %u (path: %s)\n", swarn->errstr,
				278	swarn->logical, swarn->dev->name,
				279	(unsigned long long)swarn->sector, root, inum, offset,
				280	min(isize - offset, (u64)PAGE_SIZE), nlink,
Jeff Mahoney	745c4d8	2011-11-20 07:31:57 -0500	[diff] [blame]	281	(char *)(unsigned long)ipath->fspath->val[i]);
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	282
				283	free_ipath(ipath);
				284	return 0;
				285
				286	err:
				287	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
				288	"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
				289	"resolving failed with ret=%d\n", swarn->errstr,
				290	swarn->logical, swarn->dev->name,
				291	(unsigned long long)swarn->sector, root, inum, offset, ret);
				292
				293	free_ipath(ipath);
				294	return 0;
				295	}
				296
				297	static void scrub_print_warning(const char errstr, struct scrub_bio sbio,
				298	int ix)
				299	{
				300	struct btrfs_device *dev = sbio->sdev->dev;
				301	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
				302	struct btrfs_path *path;
				303	struct btrfs_key found_key;
				304	struct extent_buffer *eb;
				305	struct btrfs_extent_item *ei;
				306	struct scrub_warning swarn;
				307	u32 item_size;
				308	int ret;
				309	u64 ref_root;
				310	u8 ref_level;
				311	unsigned long ptr = 0;
				312	const int bufsize = 4096;
				313	u64 extent_offset;
				314
				315	path = btrfs_alloc_path();
				316
				317	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
				318	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
				319	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
				320	swarn.logical = sbio->logical + ix * PAGE_SIZE;
				321	swarn.errstr = errstr;
				322	swarn.dev = dev;
				323	swarn.msg_bufsize = bufsize;
				324	swarn.scratch_bufsize = bufsize;
				325
				326	if (!path \|\| !swarn.scratch_buf \|\| !swarn.msg_buf)
				327	goto out;
				328
				329	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
				330	if (ret < 0)
				331	goto out;
				332
				333	extent_offset = swarn.logical - found_key.objectid;
				334	swarn.extent_item_size = found_key.offset;
				335
				336	eb = path->nodes[0];
				337	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
				338	item_size = btrfs_item_size_nr(eb, path->slots[0]);
				339
				340	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				341	do {
				342	ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
				343	&ref_root, &ref_level);
				344	printk(KERN_WARNING "%s at logical %llu on dev %s, "
				345	"sector %llu: metadata %s (level %d) in tree "
				346	"%llu\n", errstr, swarn.logical, dev->name,
				347	(unsigned long long)swarn.sector,
				348	ref_level ? "node" : "leaf",
				349	ret < 0 ? -1 : ref_level,
				350	ret < 0 ? -1 : ref_root);
				351	} while (ret != 1);
				352	} else {
				353	swarn.path = path;
				354	iterate_extent_inodes(fs_info, path, found_key.objectid,
				355	extent_offset,
				356	scrub_print_warning_inode, &swarn);
				357	}
				358
				359	out:
				360	btrfs_free_path(path);
				361	kfree(swarn.scratch_buf);
				362	kfree(swarn.msg_buf);
				363	}
				364
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	365	static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
				366	{
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	367	struct page *page = NULL;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	368	unsigned long index;
				369	struct scrub_fixup_nodatasum *fixup = ctx;
				370	int ret;
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	371	int corrected = 0;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	372	struct btrfs_key key;
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	373	struct inode *inode = NULL;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	374	u64 end = offset + PAGE_SIZE - 1;
				375	struct btrfs_root *local_root;
				376
				377	key.objectid = root;
				378	key.type = BTRFS_ROOT_ITEM_KEY;
				379	key.offset = (u64)-1;
				380	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
				381	if (IS_ERR(local_root))
				382	return PTR_ERR(local_root);
				383
				384	key.type = BTRFS_INODE_ITEM_KEY;
				385	key.objectid = inum;
				386	key.offset = 0;
				387	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
				388	if (IS_ERR(inode))
				389	return PTR_ERR(inode);
				390
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	391	index = offset >> PAGE_CACHE_SHIFT;
				392
				393	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	394	if (!page) {
				395	ret = -ENOMEM;
				396	goto out;
				397	}
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	398
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	399	if (PageUptodate(page)) {
				400	struct btrfs_mapping_tree *map_tree;
				401	if (PageDirty(page)) {
				402	/*
				403	* we need to write the data to the defect sector. the
				404	* data that was in that sector is not in memory,
				405	* because the page was modified. we must not write the
				406	* modified page to that sector.
				407	*
				408	* TODO: what could be done here: wait for the delalloc
				409	* runner to write out that page (might involve
				410	* COW) and see whether the sector is still
				411	* referenced afterwards.
				412	*
				413	* For the meantime, we'll treat this error
				414	* incorrectable, although there is a chance that a
				415	* later scrub will find the bad sector again and that
				416	* there's no dirty page in memory, then.
				417	*/
				418	ret = -EIO;
				419	goto out;
				420	}
				421	map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
				422	ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
				423	fixup->logical, page,
				424	fixup->mirror_num);
				425	unlock_page(page);
				426	corrected = !ret;
				427	} else {
				428	/*
				429	* we need to get good data first. the general readpage path
				430	* will call repair_io_failure for us, we just have to make
				431	* sure we read the bad mirror.
				432	*/
				433	ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				434	EXTENT_DAMAGED, GFP_NOFS);
				435	if (ret) {
				436	/* set_extent_bits should give proper error */
				437	WARN_ON(ret > 0);
				438	if (ret > 0)
				439	ret = -EFAULT;
				440	goto out;
				441	}
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	442
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	443	ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
				444	btrfs_get_extent,
				445	fixup->mirror_num);
				446	wait_on_page_locked(page);
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	447
Jan Schmidt	5da6fcb	2011-08-04 18:11:04 +0200	[diff] [blame]	448	corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
				449	end, EXTENT_DAMAGED, 0, NULL);
				450	if (!corrected)
				451	clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
				452	EXTENT_DAMAGED, GFP_NOFS);
				453	}
				454
				455	out:
				456	if (page)
				457	put_page(page);
				458	if (inode)
				459	iput(inode);
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	460
				461	if (ret < 0)
				462	return ret;
				463
				464	if (ret == 0 && corrected) {
				465	/*
				466	* we only need to call readpage for one of the inodes belonging
				467	* to this extent. so make iterate_extent_inodes stop
				468	*/
				469	return 1;
				470	}
				471
				472	return -EIO;
				473	}
				474
				475	static void scrub_fixup_nodatasum(struct btrfs_work *work)
				476	{
				477	int ret;
				478	struct scrub_fixup_nodatasum *fixup;
				479	struct scrub_dev *sdev;
				480	struct btrfs_trans_handle *trans = NULL;
				481	struct btrfs_fs_info *fs_info;
				482	struct btrfs_path *path;
				483	int uncorrectable = 0;
				484
				485	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
				486	sdev = fixup->sdev;
				487	fs_info = fixup->root->fs_info;
				488
				489	path = btrfs_alloc_path();
				490	if (!path) {
				491	spin_lock(&sdev->stat_lock);
				492	++sdev->stat.malloc_errors;
				493	spin_unlock(&sdev->stat_lock);
				494	uncorrectable = 1;
				495	goto out;
				496	}
				497
				498	trans = btrfs_join_transaction(fixup->root);
				499	if (IS_ERR(trans)) {
				500	uncorrectable = 1;
				501	goto out;
				502	}
				503
				504	/*
				505	* the idea is to trigger a regular read through the standard path. we
				506	* read a page from the (failed) logical address by specifying the
				507	* corresponding copynum of the failed sector. thus, that readpage is
				508	* expected to fail.
				509	* that is the point where on-the-fly error correction will kick in
				510	* (once it's finished) and rewrite the failed sector if a good copy
				511	* can be found.
				512	*/
				513	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
				514	path, scrub_fixup_readpage,
				515	fixup);
				516	if (ret < 0) {
				517	uncorrectable = 1;
				518	goto out;
				519	}
				520	WARN_ON(ret != 1);
				521
				522	spin_lock(&sdev->stat_lock);
				523	++sdev->stat.corrected_errors;
				524	spin_unlock(&sdev->stat_lock);
				525
				526	out:
				527	if (trans && !IS_ERR(trans))
				528	btrfs_end_transaction(trans, fixup->root);
				529	if (uncorrectable) {
				530	spin_lock(&sdev->stat_lock);
				531	++sdev->stat.uncorrectable_errors;
				532	spin_unlock(&sdev->stat_lock);
				533	printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
				534	"(nodatasum) error at logical %llu\n",
				535	fixup->logical);
				536	}
				537
				538	btrfs_free_path(path);
				539	kfree(fixup);
				540
				541	/* see caller why we're pretending to be paused in the scrub counters */
				542	mutex_lock(&fs_info->scrub_lock);
				543	atomic_dec(&fs_info->scrubs_running);
				544	atomic_dec(&fs_info->scrubs_paused);
				545	mutex_unlock(&fs_info->scrub_lock);
				546	atomic_dec(&sdev->fixup_cnt);
				547	wake_up(&fs_info->scrub_pause_wait);
				548	wake_up(&sdev->list_wait);
				549	}
				550
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	551	/*
				552	* scrub_recheck_error gets called when either verification of the page
				553	* failed or the bio failed to read, e.g. with EIO. In the latter case,
				554	* recheck_error gets called for every page in the bio, even though only
				555	* one may be bad
				556	*/
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	557	static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	558	{
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	559	struct scrub_dev *sdev = sbio->sdev;
				560	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	561	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				562	DEFAULT_RATELIMIT_BURST);
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	563
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	564	if (sbio->err) {
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	565	if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	566	sbio->bio->bi_io_vec[ix].bv_page) == 0) {
				567	if (scrub_fixup_check(sbio, ix) == 0)
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	568	return 0;
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	569	}
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	570	if (__ratelimit(&_rs))
				571	scrub_print_warning("i/o error", sbio, ix);
				572	} else {
				573	if (__ratelimit(&_rs))
				574	scrub_print_warning("checksum error", sbio, ix);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	575	}
				576
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	577	spin_lock(&sdev->stat_lock);
				578	++sdev->stat.read_errors;
				579	spin_unlock(&sdev->stat_lock);
				580
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	581	scrub_fixup(sbio, ix);
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	582	return 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	583	}
				584
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	585	static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	586	{
				587	int ret = 1;
				588	struct page *page;
				589	void *buffer;
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	590	u64 flags = sbio->spag[ix].flags;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	591
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	592	page = sbio->bio->bi_io_vec[ix].bv_page;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	593	buffer = kmap_atomic(page, KM_USER0);
				594	if (flags & BTRFS_EXTENT_FLAG_DATA) {
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	595	ret = scrub_checksum_data(sbio->sdev,
				596	sbio->spag + ix, buffer);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	597	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	598	ret = scrub_checksum_tree_block(sbio->sdev,
				599	sbio->spag + ix,
				600	sbio->logical + ix * PAGE_SIZE,
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	601	buffer);
				602	} else {
				603	WARN_ON(1);
				604	}
				605	kunmap_atomic(buffer, KM_USER0);
				606
				607	return ret;
				608	}
				609
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	610	static void scrub_fixup_end_io(struct bio *bio, int err)
				611	{
				612	complete((struct completion *)bio->bi_private);
				613	}
				614
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	615	static void scrub_fixup(struct scrub_bio *sbio, int ix)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	616	{
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	617	struct scrub_dev *sdev = sbio->sdev;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	618	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
				619	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	620	struct btrfs_bio *bbio = NULL;
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	621	struct scrub_fixup_nodatasum *fixup;
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	622	u64 logical = sbio->logical + ix * PAGE_SIZE;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	623	u64 length;
				624	int i;
				625	int ret;
				626	DECLARE_COMPLETION_ONSTACK(complete);
				627
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	628	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
				629	(sbio->spag[ix].have_csum == 0)) {
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	630	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
				631	if (!fixup)
				632	goto uncorrectable;
				633	fixup->sdev = sdev;
				634	fixup->logical = logical;
				635	fixup->root = fs_info->extent_root;
				636	fixup->mirror_num = sbio->spag[ix].mirror_num;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	637	/*
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	638	* increment scrubs_running to prevent cancel requests from
				639	* completing as long as a fixup worker is running. we must also
				640	* increment scrubs_paused to prevent deadlocking on pause
				641	* requests used for transactions commits (as the worker uses a
				642	* transaction context). it is safe to regard the fixup worker
				643	* as paused for all matters practical. effectively, we only
				644	* avoid cancellation requests from completing.
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	645	*/
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	646	mutex_lock(&fs_info->scrub_lock);
				647	atomic_inc(&fs_info->scrubs_running);
				648	atomic_inc(&fs_info->scrubs_paused);
				649	mutex_unlock(&fs_info->scrub_lock);
				650	atomic_inc(&sdev->fixup_cnt);
				651	fixup->work.func = scrub_fixup_nodatasum;
				652	btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
				653	return;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	654	}
				655
				656	length = PAGE_SIZE;
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	657	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	658	&bbio, 0);
				659	if (ret \|\| !bbio \|\| length < PAGE_SIZE) {
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	660	printk(KERN_ERR
				661	"scrub_fixup: btrfs_map_block failed us for %llu\n",
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	662	(unsigned long long)logical);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	663	WARN_ON(1);
Ilya Dryomov	56d2a48	2011-11-04 09:41:02 -0400	[diff] [blame]	664	kfree(bbio);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	665	return;
				666	}
				667
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	668	if (bbio->num_stripes == 1)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	669	/* there aren't any replicas */
				670	goto uncorrectable;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	671
				672	/*
				673	* first find a good copy
				674	*/
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	675	for (i = 0; i < bbio->num_stripes; ++i) {
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	676	if (i + 1 == sbio->spag[ix].mirror_num)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	677	continue;
				678
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	679	if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
				680	bbio->stripes[i].physical >> 9,
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	681	sbio->bio->bi_io_vec[ix].bv_page)) {
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	682	/* I/O-error, this is not a good copy */
				683	continue;
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	684	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	685
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	686	if (scrub_fixup_check(sbio, ix) == 0)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	687	break;
				688	}
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	689	if (i == bbio->num_stripes)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	690	goto uncorrectable;
				691
Arne Jansen	8628764	2011-03-23 16:34:19 +0100	[diff] [blame]	692	if (!sdev->readonly) {
				693	/*
				694	* bi_io_vec[ix].bv_page now contains good data, write it back
				695	*/
				696	if (scrub_fixup_io(WRITE, sdev->dev->bdev,
				697	(sbio->physical + ix * PAGE_SIZE) >> 9,
				698	sbio->bio->bi_io_vec[ix].bv_page)) {
				699	/* I/O-error, writeback failed, give up */
				700	goto uncorrectable;
				701	}
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	702	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	703
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	704	kfree(bbio);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	705	spin_lock(&sdev->stat_lock);
				706	++sdev->stat.corrected_errors;
				707	spin_unlock(&sdev->stat_lock);
				708
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	709	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
				710	(unsigned long long)logical);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	711	return;
				712
				713	uncorrectable:
Jan Schmidt	a1d3c47	2011-08-04 17:15:33 +0200	[diff] [blame]	714	kfree(bbio);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	715	spin_lock(&sdev->stat_lock);
				716	++sdev->stat.uncorrectable_errors;
				717	spin_unlock(&sdev->stat_lock);
				718
Jan Schmidt	558540c	2011-06-13 19:59:12 +0200	[diff] [blame]	719	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
				720	"logical %llu\n", (unsigned long long)logical);
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	721	}
				722
				723	static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
				724	struct page *page)
				725	{
				726	struct bio *bio = NULL;
				727	int ret;
				728	DECLARE_COMPLETION_ONSTACK(complete);
				729
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	730	bio = bio_alloc(GFP_NOFS, 1);
				731	bio->bi_bdev = bdev;
				732	bio->bi_sector = sector;
				733	bio_add_page(bio, page, PAGE_SIZE, 0);
				734	bio->bi_end_io = scrub_fixup_end_io;
				735	bio->bi_private = &complete;
Stefan Behrens	21adbd5	2011-11-09 13:44:05 +0100	[diff] [blame^]	736	btrfsic_submit_bio(rw, bio);
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	737
Arne Jansen	e7786c3	2011-05-28 20:58:38 +0000	[diff] [blame]	738	/* this will also unplug the queue */
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	739	wait_for_completion(&complete);
				740
				741	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
				742	bio_put(bio);
				743	return ret;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	744	}
				745
				746	static void scrub_bio_end_io(struct bio *bio, int err)
				747	{
				748	struct scrub_bio *sbio = bio->bi_private;
				749	struct scrub_dev *sdev = sbio->sdev;
				750	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
				751
				752	sbio->err = err;
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	753	sbio->bio = bio;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	754
				755	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
				756	}
				757
				758	static void scrub_checksum(struct btrfs_work *work)
				759	{
				760	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
				761	struct scrub_dev *sdev = sbio->sdev;
				762	struct page *page;
				763	void *buffer;
				764	int i;
				765	u64 flags;
				766	u64 logical;
				767	int ret;
				768
				769	if (sbio->err) {
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	770	ret = 0;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	771	for (i = 0; i < sbio->count; ++i)
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	772	ret \|= scrub_recheck_error(sbio, i);
				773	if (!ret) {
				774	spin_lock(&sdev->stat_lock);
				775	++sdev->stat.unverified_errors;
				776	spin_unlock(&sdev->stat_lock);
				777	}
Ilya Dryomov	96e3692	2011-04-09 14:27:01 +0300	[diff] [blame]	778
				779	sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
				780	sbio->bio->bi_flags \|= 1 << BIO_UPTODATE;
				781	sbio->bio->bi_phys_segments = 0;
				782	sbio->bio->bi_idx = 0;
				783
				784	for (i = 0; i < sbio->count; i++) {
				785	struct bio_vec *bi;
				786	bi = &sbio->bio->bi_io_vec[i];
				787	bi->bv_offset = 0;
				788	bi->bv_len = PAGE_SIZE;
				789	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	790	goto out;
				791	}
				792	for (i = 0; i < sbio->count; ++i) {
				793	page = sbio->bio->bi_io_vec[i].bv_page;
				794	buffer = kmap_atomic(page, KM_USER0);
				795	flags = sbio->spag[i].flags;
				796	logical = sbio->logical + i * PAGE_SIZE;
				797	ret = 0;
				798	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				799	ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
				800	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
				801	ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
				802	logical, buffer);
				803	} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
				804	BUG_ON(i);
				805	(void)scrub_checksum_super(sbio, buffer);
				806	} else {
				807	WARN_ON(1);
				808	}
				809	kunmap_atomic(buffer, KM_USER0);
Jan Schmidt	13db62b	2011-06-13 19:56:13 +0200	[diff] [blame]	810	if (ret) {
				811	ret = scrub_recheck_error(sbio, i);
				812	if (!ret) {
				813	spin_lock(&sdev->stat_lock);
				814	++sdev->stat.unverified_errors;
				815	spin_unlock(&sdev->stat_lock);
				816	}
				817	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	818	}
				819
				820	out:
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	821	scrub_free_bio(sbio->bio);
				822	sbio->bio = NULL;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	823	spin_lock(&sdev->list_lock);
				824	sbio->next_free = sdev->first_free;
				825	sdev->first_free = sbio->index;
				826	spin_unlock(&sdev->list_lock);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	827	atomic_dec(&sdev->in_flight);
				828	wake_up(&sdev->list_wait);
				829	}
				830
				831	static int scrub_checksum_data(struct scrub_dev *sdev,
				832	struct scrub_page spag, void buffer)
				833	{
				834	u8 csum[BTRFS_CSUM_SIZE];
				835	u32 crc = ~(u32)0;
				836	int fail = 0;
				837	struct btrfs_root *root = sdev->dev->dev_root;
				838
				839	if (!spag->have_csum)
				840	return 0;
				841
				842	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
				843	btrfs_csum_final(crc, csum);
				844	if (memcmp(csum, spag->csum, sdev->csum_size))
				845	fail = 1;
				846
				847	spin_lock(&sdev->stat_lock);
				848	++sdev->stat.data_extents_scrubbed;
				849	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
				850	if (fail)
				851	++sdev->stat.csum_errors;
				852	spin_unlock(&sdev->stat_lock);
				853
				854	return fail;
				855	}
				856
				857	static int scrub_checksum_tree_block(struct scrub_dev *sdev,
				858	struct scrub_page *spag, u64 logical,
				859	void *buffer)
				860	{
				861	struct btrfs_header *h;
				862	struct btrfs_root *root = sdev->dev->dev_root;
				863	struct btrfs_fs_info *fs_info = root->fs_info;
				864	u8 csum[BTRFS_CSUM_SIZE];
				865	u32 crc = ~(u32)0;
				866	int fail = 0;
				867	int crc_fail = 0;
				868
				869	/*
				870	* we don't use the getter functions here, as we
				871	* a) don't have an extent buffer and
				872	* b) the page is already kmapped
				873	*/
				874	h = (struct btrfs_header *)buffer;
				875
				876	if (logical != le64_to_cpu(h->bytenr))
				877	++fail;
				878
				879	if (spag->generation != le64_to_cpu(h->generation))
				880	++fail;
				881
				882	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
				883	++fail;
				884
				885	if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
				886	BTRFS_UUID_SIZE))
				887	++fail;
				888
				889	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
				890	PAGE_SIZE - BTRFS_CSUM_SIZE);
				891	btrfs_csum_final(crc, csum);
				892	if (memcmp(csum, h->csum, sdev->csum_size))
				893	++crc_fail;
				894
				895	spin_lock(&sdev->stat_lock);
				896	++sdev->stat.tree_extents_scrubbed;
				897	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
				898	if (crc_fail)
				899	++sdev->stat.csum_errors;
				900	if (fail)
				901	++sdev->stat.verify_errors;
				902	spin_unlock(&sdev->stat_lock);
				903
				904	return fail \|\| crc_fail;
				905	}
				906
				907	static int scrub_checksum_super(struct scrub_bio sbio, void buffer)
				908	{
				909	struct btrfs_super_block *s;
				910	u64 logical;
				911	struct scrub_dev *sdev = sbio->sdev;
				912	struct btrfs_root *root = sdev->dev->dev_root;
				913	struct btrfs_fs_info *fs_info = root->fs_info;
				914	u8 csum[BTRFS_CSUM_SIZE];
				915	u32 crc = ~(u32)0;
				916	int fail = 0;
				917
				918	s = (struct btrfs_super_block *)buffer;
				919	logical = sbio->logical;
				920
				921	if (logical != le64_to_cpu(s->bytenr))
				922	++fail;
				923
				924	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
				925	++fail;
				926
				927	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
				928	++fail;
				929
				930	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
				931	PAGE_SIZE - BTRFS_CSUM_SIZE);
				932	btrfs_csum_final(crc, csum);
				933	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
				934	++fail;
				935
				936	if (fail) {
				937	/*
				938	* if we find an error in a super block, we just report it.
				939	* They will get written with the next transaction commit
				940	* anyway
				941	*/
				942	spin_lock(&sdev->stat_lock);
				943	++sdev->stat.super_errors;
				944	spin_unlock(&sdev->stat_lock);
				945	}
				946
				947	return fail;
				948	}
				949
				950	static int scrub_submit(struct scrub_dev *sdev)
				951	{
				952	struct scrub_bio *sbio;
				953
				954	if (sdev->curr == -1)
				955	return 0;
				956
				957	sbio = sdev->bios[sdev->curr];
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	958	sbio->err = 0;
				959	sdev->curr = -1;
				960	atomic_inc(&sdev->in_flight);
				961
Stefan Behrens	21adbd5	2011-11-09 13:44:05 +0100	[diff] [blame^]	962	btrfsic_submit_bio(READ, sbio->bio);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	963
				964	return 0;
				965	}
				966
				967	static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
Jan Schmidt	e12fa9c	2011-06-17 15:55:21 +0200	[diff] [blame]	968	u64 physical, u64 flags, u64 gen, int mirror_num,
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	969	u8 *csum, int force)
				970	{
				971	struct scrub_bio *sbio;
Arne Jansen	69f4cb5	2011-11-11 08:17:10 -0500	[diff] [blame]	972	struct page *page;
				973	int ret;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	974
				975	again:
				976	/*
				977	* grab a fresh bio or wait for one to become available
				978	*/
				979	while (sdev->curr == -1) {
				980	spin_lock(&sdev->list_lock);
				981	sdev->curr = sdev->first_free;
				982	if (sdev->curr != -1) {
				983	sdev->first_free = sdev->bios[sdev->curr]->next_free;
				984	sdev->bios[sdev->curr]->next_free = -1;
				985	sdev->bios[sdev->curr]->count = 0;
				986	spin_unlock(&sdev->list_lock);
				987	} else {
				988	spin_unlock(&sdev->list_lock);
				989	wait_event(sdev->list_wait, sdev->first_free != -1);
				990	}
				991	}
				992	sbio = sdev->bios[sdev->curr];
				993	if (sbio->count == 0) {
Arne Jansen	69f4cb5	2011-11-11 08:17:10 -0500	[diff] [blame]	994	struct bio *bio;
				995
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	996	sbio->physical = physical;
				997	sbio->logical = logical;
Arne Jansen	69f4cb5	2011-11-11 08:17:10 -0500	[diff] [blame]	998	bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
				999	if (!bio)
				1000	return -ENOMEM;
				1001
				1002	bio->bi_private = sbio;
				1003	bio->bi_end_io = scrub_bio_end_io;
				1004	bio->bi_bdev = sdev->dev->bdev;
				1005	bio->bi_sector = sbio->physical >> 9;
				1006	sbio->err = 0;
				1007	sbio->bio = bio;
Arne Jansen	00d01bc	2011-05-25 12:22:50 +0000	[diff] [blame]	1008	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical \|\|
				1009	sbio->logical + sbio->count * PAGE_SIZE != logical) {
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	1010	ret = scrub_submit(sdev);
				1011	if (ret)
				1012	return ret;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1013	goto again;
				1014	}
				1015	sbio->spag[sbio->count].flags = flags;
				1016	sbio->spag[sbio->count].generation = gen;
				1017	sbio->spag[sbio->count].have_csum = 0;
				1018	sbio->spag[sbio->count].mirror_num = mirror_num;
Arne Jansen	69f4cb5	2011-11-11 08:17:10 -0500	[diff] [blame]	1019
				1020	page = alloc_page(GFP_NOFS);
				1021	if (!page)
				1022	return -ENOMEM;
				1023
				1024	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
				1025	if (!ret) {
				1026	__free_page(page);
				1027	ret = scrub_submit(sdev);
				1028	if (ret)
				1029	return ret;
				1030	goto again;
				1031	}
				1032
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1033	if (csum) {
				1034	sbio->spag[sbio->count].have_csum = 1;
				1035	memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
				1036	}
				1037	++sbio->count;
Arne Jansen	1bc8779	2011-05-28 21:57:55 +0200	[diff] [blame]	1038	if (sbio->count == SCRUB_PAGES_PER_BIO \|\| force) {
				1039	int ret;
				1040
				1041	ret = scrub_submit(sdev);
				1042	if (ret)
				1043	return ret;
				1044	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1045
				1046	return 0;
				1047	}
				1048
				1049	static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
				1050	u8 *csum)
				1051	{
				1052	struct btrfs_ordered_sum *sum = NULL;
				1053	int ret = 0;
				1054	unsigned long i;
				1055	unsigned long num_sectors;
				1056	u32 sectorsize = sdev->dev->dev_root->sectorsize;
				1057
				1058	while (!list_empty(&sdev->csum_list)) {
				1059	sum = list_first_entry(&sdev->csum_list,
				1060	struct btrfs_ordered_sum, list);
				1061	if (sum->bytenr > logical)
				1062	return 0;
				1063	if (sum->bytenr + sum->len > logical)
				1064	break;
				1065
				1066	++sdev->stat.csum_discards;
				1067	list_del(&sum->list);
				1068	kfree(sum);
				1069	sum = NULL;
				1070	}
				1071	if (!sum)
				1072	return 0;
				1073
				1074	num_sectors = sum->len / sectorsize;
				1075	for (i = 0; i < num_sectors; ++i) {
				1076	if (sum->sums[i].bytenr == logical) {
				1077	memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
				1078	ret = 1;
				1079	break;
				1080	}
				1081	}
				1082	if (ret && i == num_sectors - 1) {
				1083	list_del(&sum->list);
				1084	kfree(sum);
				1085	}
				1086	return ret;
				1087	}
				1088
				1089	/* scrub extent tries to collect up to 64 kB for each bio */
				1090	static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
Jan Schmidt	e12fa9c	2011-06-17 15:55:21 +0200	[diff] [blame]	1091	u64 physical, u64 flags, u64 gen, int mirror_num)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1092	{
				1093	int ret;
				1094	u8 csum[BTRFS_CSUM_SIZE];
				1095
				1096	while (len) {
				1097	u64 l = min_t(u64, len, PAGE_SIZE);
				1098	int have_csum = 0;
				1099
				1100	if (flags & BTRFS_EXTENT_FLAG_DATA) {
				1101	/* push csums to sbio */
				1102	have_csum = scrub_find_csum(sdev, logical, l, csum);
				1103	if (have_csum == 0)
				1104	++sdev->stat.no_csum;
				1105	}
				1106	ret = scrub_page(sdev, logical, l, physical, flags, gen,
				1107	mirror_num, have_csum ? csum : NULL, 0);
				1108	if (ret)
				1109	return ret;
				1110	len -= l;
				1111	logical += l;
				1112	physical += l;
				1113	}
				1114	return 0;
				1115	}
				1116
				1117	static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
				1118	struct map_lookup *map, int num, u64 base, u64 length)
				1119	{
				1120	struct btrfs_path *path;
				1121	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
				1122	struct btrfs_root *root = fs_info->extent_root;
				1123	struct btrfs_root *csum_root = fs_info->csum_root;
				1124	struct btrfs_extent_item *extent;
Arne Jansen	e7786c3	2011-05-28 20:58:38 +0000	[diff] [blame]	1125	struct blk_plug plug;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1126	u64 flags;
				1127	int ret;
				1128	int slot;
				1129	int i;
				1130	u64 nstripes;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1131	struct extent_buffer *l;
				1132	struct btrfs_key key;
				1133	u64 physical;
				1134	u64 logical;
				1135	u64 generation;
Jan Schmidt	e12fa9c	2011-06-17 15:55:21 +0200	[diff] [blame]	1136	int mirror_num;
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1137	struct reada_control *reada1;
				1138	struct reada_control *reada2;
				1139	struct btrfs_key key_start;
				1140	struct btrfs_key key_end;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1141
				1142	u64 increment = map->stripe_len;
				1143	u64 offset;
				1144
				1145	nstripes = length;
				1146	offset = 0;
				1147	do_div(nstripes, map->stripe_len);
				1148	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				1149	offset = map->stripe_len * num;
				1150	increment = map->stripe_len * map->num_stripes;
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	1151	mirror_num = 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1152	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				1153	int factor = map->num_stripes / map->sub_stripes;
				1154	offset = map->stripe_len * (num / map->sub_stripes);
				1155	increment = map->stripe_len * factor;
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	1156	mirror_num = num % map->sub_stripes + 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1157	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
				1158	increment = map->stripe_len;
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	1159	mirror_num = num % map->num_stripes + 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1160	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
				1161	increment = map->stripe_len;
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	1162	mirror_num = num % map->num_stripes + 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1163	} else {
				1164	increment = map->stripe_len;
Jan Schmidt	193ea74	2011-06-13 19:56:54 +0200	[diff] [blame]	1165	mirror_num = 1;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1166	}
				1167
				1168	path = btrfs_alloc_path();
				1169	if (!path)
				1170	return -ENOMEM;
				1171
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1172	path->search_commit_root = 1;
				1173	path->skip_locking = 1;
				1174
				1175	/*
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1176	* trigger the readahead for extent tree csum tree and wait for
				1177	* completion. During readahead, the scrub is officially paused
				1178	* to not hold off transaction commits
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1179	*/
				1180	logical = base + offset;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1181
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1182	wait_event(sdev->list_wait,
				1183	atomic_read(&sdev->in_flight) == 0);
				1184	atomic_inc(&fs_info->scrubs_paused);
				1185	wake_up(&fs_info->scrub_pause_wait);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1186
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1187	/* FIXME it might be better to start readahead at commit root */
				1188	key_start.objectid = logical;
				1189	key_start.type = BTRFS_EXTENT_ITEM_KEY;
				1190	key_start.offset = (u64)0;
				1191	key_end.objectid = base + offset + nstripes * increment;
				1192	key_end.type = BTRFS_EXTENT_ITEM_KEY;
				1193	key_end.offset = (u64)0;
				1194	reada1 = btrfs_reada_add(root, &key_start, &key_end);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1195
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1196	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
				1197	key_start.type = BTRFS_EXTENT_CSUM_KEY;
				1198	key_start.offset = logical;
				1199	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
				1200	key_end.type = BTRFS_EXTENT_CSUM_KEY;
				1201	key_end.offset = base + offset + nstripes * increment;
				1202	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1203
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1204	if (!IS_ERR(reada1))
				1205	btrfs_reada_wait(reada1);
				1206	if (!IS_ERR(reada2))
				1207	btrfs_reada_wait(reada2);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1208
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1209	mutex_lock(&fs_info->scrub_lock);
				1210	while (atomic_read(&fs_info->scrub_pause_req)) {
				1211	mutex_unlock(&fs_info->scrub_lock);
				1212	wait_event(fs_info->scrub_pause_wait,
				1213	atomic_read(&fs_info->scrub_pause_req) == 0);
				1214	mutex_lock(&fs_info->scrub_lock);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1215	}
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1216	atomic_dec(&fs_info->scrubs_paused);
				1217	mutex_unlock(&fs_info->scrub_lock);
				1218	wake_up(&fs_info->scrub_pause_wait);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1219
				1220	/*
				1221	* collect all data csums for the stripe to avoid seeking during
				1222	* the scrub. This might currently (crc32) end up to be about 1MB
				1223	*/
Arne Jansen	e7786c3	2011-05-28 20:58:38 +0000	[diff] [blame]	1224	blk_start_plug(&plug);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1225
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1226	/*
				1227	* now find all extents for each stripe and scrub them
				1228	*/
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1229	logical = base + offset;
				1230	physical = map->stripes[num].physical;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1231	ret = 0;
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1232	for (i = 0; i < nstripes; ++i) {
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1233	/*
				1234	* canceled?
				1235	*/
				1236	if (atomic_read(&fs_info->scrub_cancel_req) \|\|
				1237	atomic_read(&sdev->cancel_req)) {
				1238	ret = -ECANCELED;
				1239	goto out;
				1240	}
				1241	/*
				1242	* check to see if we have to pause
				1243	*/
				1244	if (atomic_read(&fs_info->scrub_pause_req)) {
				1245	/* push queued extents */
				1246	scrub_submit(sdev);
				1247	wait_event(sdev->list_wait,
				1248	atomic_read(&sdev->in_flight) == 0);
				1249	atomic_inc(&fs_info->scrubs_paused);
				1250	wake_up(&fs_info->scrub_pause_wait);
				1251	mutex_lock(&fs_info->scrub_lock);
				1252	while (atomic_read(&fs_info->scrub_pause_req)) {
				1253	mutex_unlock(&fs_info->scrub_lock);
				1254	wait_event(fs_info->scrub_pause_wait,
				1255	atomic_read(&fs_info->scrub_pause_req) == 0);
				1256	mutex_lock(&fs_info->scrub_lock);
				1257	}
				1258	atomic_dec(&fs_info->scrubs_paused);
				1259	mutex_unlock(&fs_info->scrub_lock);
				1260	wake_up(&fs_info->scrub_pause_wait);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1261	}
				1262
Arne Jansen	7a26285	2011-06-10 12:39:23 +0200	[diff] [blame]	1263	ret = btrfs_lookup_csums_range(csum_root, logical,
				1264	logical + map->stripe_len - 1,
				1265	&sdev->csum_list, 1);
				1266	if (ret)
				1267	goto out;
				1268
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1269	key.objectid = logical;
				1270	key.type = BTRFS_EXTENT_ITEM_KEY;
				1271	key.offset = (u64)0;
				1272
				1273	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1274	if (ret < 0)
				1275	goto out;
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1276	if (ret > 0) {
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1277	ret = btrfs_previous_item(root, path, 0,
				1278	BTRFS_EXTENT_ITEM_KEY);
				1279	if (ret < 0)
				1280	goto out;
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1281	if (ret > 0) {
				1282	/* there's no smaller item, so stick with the
				1283	* larger one */
				1284	btrfs_release_path(path);
				1285	ret = btrfs_search_slot(NULL, root, &key,
				1286	path, 0, 0);
				1287	if (ret < 0)
				1288	goto out;
				1289	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1290	}
				1291
				1292	while (1) {
				1293	l = path->nodes[0];
				1294	slot = path->slots[0];
				1295	if (slot >= btrfs_header_nritems(l)) {
				1296	ret = btrfs_next_leaf(root, path);
				1297	if (ret == 0)
				1298	continue;
				1299	if (ret < 0)
				1300	goto out;
				1301
				1302	break;
				1303	}
				1304	btrfs_item_key_to_cpu(l, &key, slot);
				1305
				1306	if (key.objectid + key.offset <= logical)
				1307	goto next;
				1308
				1309	if (key.objectid >= logical + map->stripe_len)
				1310	break;
				1311
				1312	if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
				1313	goto next;
				1314
				1315	extent = btrfs_item_ptr(l, slot,
				1316	struct btrfs_extent_item);
				1317	flags = btrfs_extent_flags(l, extent);
				1318	generation = btrfs_extent_generation(l, extent);
				1319
				1320	if (key.objectid < logical &&
				1321	(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
				1322	printk(KERN_ERR
				1323	"btrfs scrub: tree block %llu spanning "
				1324	"stripes, ignored. logical=%llu\n",
				1325	(unsigned long long)key.objectid,
				1326	(unsigned long long)logical);
				1327	goto next;
				1328	}
				1329
				1330	/*
				1331	* trim extent to this stripe
				1332	*/
				1333	if (key.objectid < logical) {
				1334	key.offset -= logical - key.objectid;
				1335	key.objectid = logical;
				1336	}
				1337	if (key.objectid + key.offset >
				1338	logical + map->stripe_len) {
				1339	key.offset = logical + map->stripe_len -
				1340	key.objectid;
				1341	}
				1342
				1343	ret = scrub_extent(sdev, key.objectid, key.offset,
				1344	key.objectid - logical + physical,
				1345	flags, generation, mirror_num);
				1346	if (ret)
				1347	goto out;
				1348
				1349	next:
				1350	path->slots[0]++;
				1351	}
Chris Mason	7126733	2011-05-23 06:30:52 -0400	[diff] [blame]	1352	btrfs_release_path(path);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1353	logical += increment;
				1354	physical += map->stripe_len;
				1355	spin_lock(&sdev->stat_lock);
				1356	sdev->stat.last_physical = physical;
				1357	spin_unlock(&sdev->stat_lock);
				1358	}
				1359	/* push queued extents */
				1360	scrub_submit(sdev);
				1361
				1362	out:
Arne Jansen	e7786c3	2011-05-28 20:58:38 +0000	[diff] [blame]	1363	blk_finish_plug(&plug);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1364	btrfs_free_path(path);
				1365	return ret < 0 ? ret : 0;
				1366	}
				1367
				1368	static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
				1369	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
				1370	{
				1371	struct btrfs_mapping_tree *map_tree =
				1372	&sdev->dev->dev_root->fs_info->mapping_tree;
				1373	struct map_lookup *map;
				1374	struct extent_map *em;
				1375	int i;
				1376	int ret = -EINVAL;
				1377
				1378	read_lock(&map_tree->map_tree.lock);
				1379	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
				1380	read_unlock(&map_tree->map_tree.lock);
				1381
				1382	if (!em)
				1383	return -EINVAL;
				1384
				1385	map = (struct map_lookup *)em->bdev;
				1386	if (em->start != chunk_offset)
				1387	goto out;
				1388
				1389	if (em->len < length)
				1390	goto out;
				1391
				1392	for (i = 0; i < map->num_stripes; ++i) {
				1393	if (map->stripes[i].dev == sdev->dev) {
				1394	ret = scrub_stripe(sdev, map, i, chunk_offset, length);
				1395	if (ret)
				1396	goto out;
				1397	}
				1398	}
				1399	out:
				1400	free_extent_map(em);
				1401
				1402	return ret;
				1403	}
				1404
				1405	static noinline_for_stack
				1406	int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
				1407	{
				1408	struct btrfs_dev_extent *dev_extent = NULL;
				1409	struct btrfs_path *path;
				1410	struct btrfs_root *root = sdev->dev->dev_root;
				1411	struct btrfs_fs_info *fs_info = root->fs_info;
				1412	u64 length;
				1413	u64 chunk_tree;
				1414	u64 chunk_objectid;
				1415	u64 chunk_offset;
				1416	int ret;
				1417	int slot;
				1418	struct extent_buffer *l;
				1419	struct btrfs_key key;
				1420	struct btrfs_key found_key;
				1421	struct btrfs_block_group_cache *cache;
				1422
				1423	path = btrfs_alloc_path();
				1424	if (!path)
				1425	return -ENOMEM;
				1426
				1427	path->reada = 2;
				1428	path->search_commit_root = 1;
				1429	path->skip_locking = 1;
				1430
				1431	key.objectid = sdev->dev->devid;
				1432	key.offset = 0ull;
				1433	key.type = BTRFS_DEV_EXTENT_KEY;
				1434
				1435
				1436	while (1) {
				1437	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1438	if (ret < 0)
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1439	break;
				1440	if (ret > 0) {
				1441	if (path->slots[0] >=
				1442	btrfs_header_nritems(path->nodes[0])) {
				1443	ret = btrfs_next_leaf(root, path);
				1444	if (ret)
				1445	break;
				1446	}
				1447	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1448
				1449	l = path->nodes[0];
				1450	slot = path->slots[0];
				1451
				1452	btrfs_item_key_to_cpu(l, &found_key, slot);
				1453
				1454	if (found_key.objectid != sdev->dev->devid)
				1455	break;
				1456
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1457	if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1458	break;
				1459
				1460	if (found_key.offset >= end)
				1461	break;
				1462
				1463	if (found_key.offset < key.offset)
				1464	break;
				1465
				1466	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				1467	length = btrfs_dev_extent_length(l, dev_extent);
				1468
				1469	if (found_key.offset + length <= start) {
				1470	key.offset = found_key.offset + length;
Chris Mason	7126733	2011-05-23 06:30:52 -0400	[diff] [blame]	1471	btrfs_release_path(path);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1472	continue;
				1473	}
				1474
				1475	chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
				1476	chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
				1477	chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
				1478
				1479	/*
				1480	* get a reference on the corresponding block group to prevent
				1481	* the chunk from going away while we scrub it
				1482	*/
				1483	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				1484	if (!cache) {
				1485	ret = -ENOENT;
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1486	break;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1487	}
				1488	ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
				1489	chunk_offset, length);
				1490	btrfs_put_block_group(cache);
				1491	if (ret)
				1492	break;
				1493
				1494	key.offset = found_key.offset + length;
Chris Mason	7126733	2011-05-23 06:30:52 -0400	[diff] [blame]	1495	btrfs_release_path(path);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1496	}
				1497
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1498	btrfs_free_path(path);
Arne Jansen	8c51032	2011-06-03 10:09:26 +0200	[diff] [blame]	1499
				1500	/*
				1501	* ret can still be 1 from search_slot or next_leaf,
				1502	* that's not an error
				1503	*/
				1504	return ret < 0 ? ret : 0;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1505	}
				1506
				1507	static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
				1508	{
				1509	int i;
				1510	u64 bytenr;
				1511	u64 gen;
				1512	int ret;
				1513	struct btrfs_device *device = sdev->dev;
				1514	struct btrfs_root *root = device->dev_root;
				1515
				1516	gen = root->fs_info->last_trans_committed;
				1517
				1518	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
				1519	bytenr = btrfs_sb_offset(i);
				1520	if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
				1521	break;
				1522
				1523	ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
				1524	BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
				1525	if (ret)
				1526	return ret;
				1527	}
				1528	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
				1529
				1530	return 0;
				1531	}
				1532
				1533	/*
				1534	* get a reference count on fs_info->scrub_workers. start worker if necessary
				1535	*/
				1536	static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
				1537	{
				1538	struct btrfs_fs_info *fs_info = root->fs_info;
Josef Bacik	0dc3b84	2011-11-18 14:37:27 -0500	[diff] [blame]	1539	int ret = 0;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1540
				1541	mutex_lock(&fs_info->scrub_lock);
Arne Jansen	632dd77	2011-06-10 12:07:07 +0200	[diff] [blame]	1542	if (fs_info->scrub_workers_refcnt == 0) {
				1543	btrfs_init_workers(&fs_info->scrub_workers, "scrub",
				1544	fs_info->thread_pool_size, &fs_info->generic_worker);
				1545	fs_info->scrub_workers.idle_thresh = 4;
Josef Bacik	0dc3b84	2011-11-18 14:37:27 -0500	[diff] [blame]	1546	ret = btrfs_start_workers(&fs_info->scrub_workers);
				1547	if (ret)
				1548	goto out;
Arne Jansen	632dd77	2011-06-10 12:07:07 +0200	[diff] [blame]	1549	}
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1550	++fs_info->scrub_workers_refcnt;
Josef Bacik	0dc3b84	2011-11-18 14:37:27 -0500	[diff] [blame]	1551	out:
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1552	mutex_unlock(&fs_info->scrub_lock);
				1553
Josef Bacik	0dc3b84	2011-11-18 14:37:27 -0500	[diff] [blame]	1554	return ret;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1555	}
				1556
				1557	static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
				1558	{
				1559	struct btrfs_fs_info *fs_info = root->fs_info;
				1560
				1561	mutex_lock(&fs_info->scrub_lock);
				1562	if (--fs_info->scrub_workers_refcnt == 0)
				1563	btrfs_stop_workers(&fs_info->scrub_workers);
				1564	WARN_ON(fs_info->scrub_workers_refcnt < 0);
				1565	mutex_unlock(&fs_info->scrub_lock);
				1566	}
				1567
				1568
				1569	int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
Arne Jansen	8628764	2011-03-23 16:34:19 +0100	[diff] [blame]	1570	struct btrfs_scrub_progress *progress, int readonly)
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1571	{
				1572	struct scrub_dev *sdev;
				1573	struct btrfs_fs_info *fs_info = root->fs_info;
				1574	int ret;
				1575	struct btrfs_device *dev;
				1576
David Sterba	7841cb2	2011-05-31 18:07:27 +0200	[diff] [blame]	1577	if (btrfs_fs_closing(root->fs_info))
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1578	return -EINVAL;
				1579
				1580	/*
				1581	* check some assumptions
				1582	*/
				1583	if (root->sectorsize != PAGE_SIZE \|\|
				1584	root->sectorsize != root->leafsize \|\|
				1585	root->sectorsize != root->nodesize) {
				1586	printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
				1587	return -EINVAL;
				1588	}
				1589
				1590	ret = scrub_workers_get(root);
				1591	if (ret)
				1592	return ret;
				1593
				1594	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
				1595	dev = btrfs_find_device(root, devid, NULL, NULL);
				1596	if (!dev \|\| dev->missing) {
				1597	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1598	scrub_workers_put(root);
				1599	return -ENODEV;
				1600	}
				1601	mutex_lock(&fs_info->scrub_lock);
				1602
				1603	if (!dev->in_fs_metadata) {
				1604	mutex_unlock(&fs_info->scrub_lock);
				1605	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1606	scrub_workers_put(root);
				1607	return -ENODEV;
				1608	}
				1609
				1610	if (dev->scrub_device) {
				1611	mutex_unlock(&fs_info->scrub_lock);
				1612	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1613	scrub_workers_put(root);
				1614	return -EINPROGRESS;
				1615	}
				1616	sdev = scrub_setup_dev(dev);
				1617	if (IS_ERR(sdev)) {
				1618	mutex_unlock(&fs_info->scrub_lock);
				1619	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1620	scrub_workers_put(root);
				1621	return PTR_ERR(sdev);
				1622	}
Arne Jansen	8628764	2011-03-23 16:34:19 +0100	[diff] [blame]	1623	sdev->readonly = readonly;
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1624	dev->scrub_device = sdev;
				1625
				1626	atomic_inc(&fs_info->scrubs_running);
				1627	mutex_unlock(&fs_info->scrub_lock);
				1628	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1629
				1630	down_read(&fs_info->scrub_super_lock);
				1631	ret = scrub_supers(sdev);
				1632	up_read(&fs_info->scrub_super_lock);
				1633
				1634	if (!ret)
				1635	ret = scrub_enumerate_chunks(sdev, start, end);
				1636
				1637	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1638	atomic_dec(&fs_info->scrubs_running);
				1639	wake_up(&fs_info->scrub_pause_wait);
				1640
Jan Schmidt	0ef8e45	2011-06-13 20:04:15 +0200	[diff] [blame]	1641	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
				1642
Arne Jansen	a2de733	2011-03-08 14:14:00 +0100	[diff] [blame]	1643	if (progress)
				1644	memcpy(progress, &sdev->stat, sizeof(*progress));
				1645
				1646	mutex_lock(&fs_info->scrub_lock);
				1647	dev->scrub_device = NULL;
				1648	mutex_unlock(&fs_info->scrub_lock);
				1649
				1650	scrub_free_dev(sdev);
				1651	scrub_workers_put(root);
				1652
				1653	return ret;
				1654	}
				1655
				1656	int btrfs_scrub_pause(struct btrfs_root *root)
				1657	{
				1658	struct btrfs_fs_info *fs_info = root->fs_info;
				1659
				1660	mutex_lock(&fs_info->scrub_lock);
				1661	atomic_inc(&fs_info->scrub_pause_req);
				1662	while (atomic_read(&fs_info->scrubs_paused) !=
				1663	atomic_read(&fs_info->scrubs_running)) {
				1664	mutex_unlock(&fs_info->scrub_lock);
				1665	wait_event(fs_info->scrub_pause_wait,
				1666	atomic_read(&fs_info->scrubs_paused) ==
				1667	atomic_read(&fs_info->scrubs_running));
				1668	mutex_lock(&fs_info->scrub_lock);
				1669	}
				1670	mutex_unlock(&fs_info->scrub_lock);
				1671
				1672	return 0;
				1673	}
				1674
				1675	int btrfs_scrub_continue(struct btrfs_root *root)
				1676	{
				1677	struct btrfs_fs_info *fs_info = root->fs_info;
				1678
				1679	atomic_dec(&fs_info->scrub_pause_req);
				1680	wake_up(&fs_info->scrub_pause_wait);
				1681	return 0;
				1682	}
				1683
				1684	int btrfs_scrub_pause_super(struct btrfs_root *root)
				1685	{
				1686	down_write(&root->fs_info->scrub_super_lock);
				1687	return 0;
				1688	}
				1689
				1690	int btrfs_scrub_continue_super(struct btrfs_root *root)
				1691	{
				1692	up_write(&root->fs_info->scrub_super_lock);
				1693	return 0;
				1694	}
				1695
				1696	int btrfs_scrub_cancel(struct btrfs_root *root)
				1697	{
				1698	struct btrfs_fs_info *fs_info = root->fs_info;
				1699
				1700	mutex_lock(&fs_info->scrub_lock);
				1701	if (!atomic_read(&fs_info->scrubs_running)) {
				1702	mutex_unlock(&fs_info->scrub_lock);
				1703	return -ENOTCONN;
				1704	}
				1705
				1706	atomic_inc(&fs_info->scrub_cancel_req);
				1707	while (atomic_read(&fs_info->scrubs_running)) {
				1708	mutex_unlock(&fs_info->scrub_lock);
				1709	wait_event(fs_info->scrub_pause_wait,
				1710	atomic_read(&fs_info->scrubs_running) == 0);
				1711	mutex_lock(&fs_info->scrub_lock);
				1712	}
				1713	atomic_dec(&fs_info->scrub_cancel_req);
				1714	mutex_unlock(&fs_info->scrub_lock);
				1715
				1716	return 0;
				1717	}
				1718
				1719	int btrfs_scrub_cancel_dev(struct btrfs_root root, struct btrfs_device dev)
				1720	{
				1721	struct btrfs_fs_info *fs_info = root->fs_info;
				1722	struct scrub_dev *sdev;
				1723
				1724	mutex_lock(&fs_info->scrub_lock);
				1725	sdev = dev->scrub_device;
				1726	if (!sdev) {
				1727	mutex_unlock(&fs_info->scrub_lock);
				1728	return -ENOTCONN;
				1729	}
				1730	atomic_inc(&sdev->cancel_req);
				1731	while (dev->scrub_device) {
				1732	mutex_unlock(&fs_info->scrub_lock);
				1733	wait_event(fs_info->scrub_pause_wait,
				1734	dev->scrub_device == NULL);
				1735	mutex_lock(&fs_info->scrub_lock);
				1736	}
				1737	mutex_unlock(&fs_info->scrub_lock);
				1738
				1739	return 0;
				1740	}
				1741	int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
				1742	{
				1743	struct btrfs_fs_info *fs_info = root->fs_info;
				1744	struct btrfs_device *dev;
				1745	int ret;
				1746
				1747	/*
				1748	* we have to hold the device_list_mutex here so the device
				1749	* does not go away in cancel_dev. FIXME: find a better solution
				1750	*/
				1751	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				1752	dev = btrfs_find_device(root, devid, NULL, NULL);
				1753	if (!dev) {
				1754	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				1755	return -ENODEV;
				1756	}
				1757	ret = btrfs_scrub_cancel_dev(root, dev);
				1758	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				1759
				1760	return ret;
				1761	}
				1762
				1763	int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
				1764	struct btrfs_scrub_progress *progress)
				1765	{
				1766	struct btrfs_device *dev;
				1767	struct scrub_dev *sdev = NULL;
				1768
				1769	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
				1770	dev = btrfs_find_device(root, devid, NULL, NULL);
				1771	if (dev)
				1772	sdev = dev->scrub_device;
				1773	if (sdev)
				1774	memcpy(progress, &sdev->stat, sizeof(*progress));
				1775	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
				1776
				1777	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
				1778	}