Arne Jansena2de7332011-03-08 14:14:00 +01001/*
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
Arne Jansena2de7332011-03-08 14:14:00 +01003 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
Arne Jansena2de7332011-03-08 14:14:00 +010019#include <linux/blkdev.h>
Jan Schmidt558540c2011-06-13 19:59:12 +020020#include <linux/ratelimit.h>
Arne Jansena2de7332011-03-08 14:14:00 +010021#include "ctree.h"
22#include "volumes.h"
23#include "disk-io.h"
24#include "ordered-data.h"
Jan Schmidt0ef8e452011-06-13 20:04:15 +020025#include "transaction.h"
Jan Schmidt558540c2011-06-13 19:59:12 +020026#include "backref.h"
Jan Schmidt5da6fcb2011-08-04 18:11:04 +020027#include "extent_io.h"
Stefan Behrensff023aa2012-11-06 11:43:11 +010028#include "dev-replace.h"
Stefan Behrens21adbd52011-11-09 13:44:05 +010029#include "check-integrity.h"
Josef Bacik606686e2012-06-04 14:03:51 -040030#include "rcu-string.h"
David Woodhouse53b381b2013-01-29 18:40:14 -050031#include "raid56.h"
Arne Jansena2de7332011-03-08 14:14:00 +010032
33/*
34 * This is only the first step towards a full-featured scrub. It reads all
35 * extents and super blocks and verifies the checksums. In case a bad checksum
36 * is found or the extent cannot be read, good data will be written back if
37 * any can be found.
38 *
39 * Future enhancements:
Arne Jansena2de7332011-03-08 14:14:00 +010040 * - In case an unrepairable extent is encountered, track which files are
41 * affected and report them
Arne Jansena2de7332011-03-08 14:14:00 +010042 * - track and record media errors, throw out bad devices
Arne Jansena2de7332011-03-08 14:14:00 +010043 * - add a mode to also read unallocated space
Arne Jansena2de7332011-03-08 14:14:00 +010044 */
45
Stefan Behrensb5d67f62012-03-27 14:21:27 -040046struct scrub_block;
Stefan Behrensd9d181c2012-11-02 09:58:09 +010047struct scrub_ctx;
Arne Jansena2de7332011-03-08 14:14:00 +010048
Stefan Behrensff023aa2012-11-06 11:43:11 +010049/*
50 * the following three values only influence performance.
51 * The last one configures the number of parallel and outstanding I/O
52 * operations. The first two values configure an upper limit for the number
53 * of (dynamically allocated) pages that are added to a bio.
54 */
55#define SCRUB_PAGES_PER_RD_BIO 32 /* 128k per bio */
56#define SCRUB_PAGES_PER_WR_BIO 32 /* 128k per bio */
57#define SCRUB_BIOS_PER_SCTX 64 /* 8MB per device in flight */
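/*
 * Worked example of the sizing above, assuming the common 4 KiB PAGE_SIZE:
 * 32 pages * 4 KiB = 128 KiB per read/write bio, and
 * 64 bios * 128 KiB = 8 MiB of I/O in flight per scrubbed device.
 */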
Stefan Behrens7a9e9982012-11-02 14:58:04 +010058
59/*
60 * the following value times PAGE_SIZE needs to be large enough to match the
61 * largest node/leaf/sector size that shall be supported.
62 * Values larger than BTRFS_STRIPE_LEN are not supported.
63 */
Stefan Behrensb5d67f62012-03-27 14:21:27 -040064#define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */
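/*
 * Worked example, again assuming 4 KiB pages: 16 pages * 4 KiB = 64 KiB,
 * which covers the largest supported nodesize (64 KiB) and does not exceed
 * BTRFS_STRIPE_LEN (64 KiB).
 */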
Arne Jansena2de7332011-03-08 14:14:00 +010065
Miao Xieaf8e2d12014-10-23 14:42:50 +080066struct scrub_recover {
Elena Reshetova6f615012017-03-03 10:55:21 +020067 refcount_t refs;
Miao Xieaf8e2d12014-10-23 14:42:50 +080068 struct btrfs_bio *bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +080069 u64 map_length;
70};
71
Arne Jansena2de7332011-03-08 14:14:00 +010072struct scrub_page {
Stefan Behrensb5d67f62012-03-27 14:21:27 -040073 struct scrub_block *sblock;
74 struct page *page;
Stefan Behrens442a4f62012-05-25 16:06:08 +020075 struct btrfs_device *dev;
Miao Xie5a6ac9e2014-11-06 17:20:58 +080076 struct list_head list;
Arne Jansena2de7332011-03-08 14:14:00 +010077 u64 flags; /* extent flags */
78 u64 generation;
Stefan Behrensb5d67f62012-03-27 14:21:27 -040079 u64 logical;
80 u64 physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +010081 u64 physical_for_dev_replace;
Zhao Lei57019342015-01-20 15:11:45 +080082 atomic_t refs;
Stefan Behrensb5d67f62012-03-27 14:21:27 -040083 struct {
84 unsigned int mirror_num:8;
85 unsigned int have_csum:1;
86 unsigned int io_error:1;
87 };
Arne Jansena2de7332011-03-08 14:14:00 +010088 u8 csum[BTRFS_CSUM_SIZE];
Miao Xieaf8e2d12014-10-23 14:42:50 +080089
90 struct scrub_recover *recover;
Arne Jansena2de7332011-03-08 14:14:00 +010091};
92
93struct scrub_bio {
94 int index;
Stefan Behrensd9d181c2012-11-02 09:58:09 +010095 struct scrub_ctx *sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +010096 struct btrfs_device *dev;
Arne Jansena2de7332011-03-08 14:14:00 +010097 struct bio *bio;
98 int err;
99 u64 logical;
100 u64 physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100101#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
102 struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];
103#else
104 struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];
105#endif
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400106 int page_count;
Arne Jansena2de7332011-03-08 14:14:00 +0100107 int next_free;
108 struct btrfs_work work;
109};
110
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400111struct scrub_block {
Stefan Behrens7a9e9982012-11-02 14:58:04 +0100112 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400113 int page_count;
114 atomic_t outstanding_pages;
Elena Reshetova186debd2017-03-03 10:55:23 +0200115 refcount_t refs; /* free mem on transition to zero */
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100116 struct scrub_ctx *sctx;
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800117 struct scrub_parity *sparity;
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400118 struct {
119 unsigned int header_error:1;
120 unsigned int checksum_error:1;
121 unsigned int no_io_error_seen:1;
Stefan Behrens442a4f62012-05-25 16:06:08 +0200122 unsigned int generation_error:1; /* also sets header_error */
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800123
124 /* The following is for the data used to check parity */
125 /* It is for the data with checksum */
126 unsigned int data_corrected:1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400127 };
Omar Sandoval73ff61d2015-06-19 11:52:51 -0700128 struct btrfs_work work;
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400129};
130
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800131/* Used for the chunks with parity stripes such as RAID5/6 */
132struct scrub_parity {
133 struct scrub_ctx *sctx;
134
135 struct btrfs_device *scrub_dev;
136
137 u64 logic_start;
138
139 u64 logic_end;
140
141 int nsectors;
142
Liu Bo972d7212017-04-03 13:45:33 -0700143 u64 stripe_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800144
Elena Reshetova78a76452017-03-03 10:55:24 +0200145 refcount_t refs;
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800146
147 struct list_head spages;
148
149 /* Work of parity check and repair */
150 struct btrfs_work work;
151
152 /* Mark the parity blocks which have data */
153 unsigned long *dbitmap;
154
155 /*
156 * Mark the parity blocks which have data, but where errors happened
157 * when reading or checking that data
158 */
159 unsigned long *ebitmap;
160
161 unsigned long bitmap[0];
162};
163
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100164struct scrub_ctx {
Stefan Behrensff023aa2012-11-06 11:43:11 +0100165 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400166 struct btrfs_fs_info *fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +0100167 int first_free;
168 int curr;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100169 atomic_t bios_in_flight;
170 atomic_t workers_pending;
Arne Jansena2de7332011-03-08 14:14:00 +0100171 spinlock_t list_lock;
172 wait_queue_head_t list_wait;
173 u16 csum_size;
174 struct list_head csum_list;
175 atomic_t cancel_req;
Arne Jansen86287642011-03-23 16:34:19 +0100176 int readonly;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100177 int pages_per_rd_bio;
Stefan Behrens63a212a2012-11-05 18:29:28 +0100178
179 int is_dev_replace;
David Sterba3fb99302017-05-16 19:10:32 +0200180
181 struct scrub_bio *wr_curr_bio;
182 struct mutex wr_lock;
183 int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
184 atomic_t flush_all_writes;
185 struct btrfs_device *wr_tgtdev;
Stefan Behrens63a212a2012-11-05 18:29:28 +0100186
Arne Jansena2de7332011-03-08 14:14:00 +0100187 /*
188 * statistics
189 */
190 struct btrfs_scrub_progress stat;
191 spinlock_t stat_lock;
Filipe Mananaf55985f2015-02-09 21:14:24 +0000192
193 /*
194 * Use a ref counter to avoid use-after-free issues. Scrub workers
195 * decrement bios_in_flight and workers_pending and then do a wakeup
196 * on the list_wait wait queue. We must ensure the main scrub task
197 * doesn't free the scrub context before or while the workers are
198 * doing the wakeup() call.
199 */
Elena Reshetova99f4cdb2017-03-03 10:55:25 +0200200 refcount_t refs;
Arne Jansena2de7332011-03-08 14:14:00 +0100201};
202
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200203struct scrub_fixup_nodatasum {
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100204 struct scrub_ctx *sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100205 struct btrfs_device *dev;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200206 u64 logical;
207 struct btrfs_root *root;
208 struct btrfs_work work;
209 int mirror_num;
210};
211
Josef Bacik652f25a2013-09-12 16:58:28 -0400212struct scrub_nocow_inode {
213 u64 inum;
214 u64 offset;
215 u64 root;
216 struct list_head list;
217};
218
Stefan Behrensff023aa2012-11-06 11:43:11 +0100219struct scrub_copy_nocow_ctx {
220 struct scrub_ctx *sctx;
221 u64 logical;
222 u64 len;
223 int mirror_num;
224 u64 physical_for_dev_replace;
Josef Bacik652f25a2013-09-12 16:58:28 -0400225 struct list_head inodes;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100226 struct btrfs_work work;
227};
228
Jan Schmidt558540c2011-06-13 19:59:12 +0200229struct scrub_warning {
230 struct btrfs_path *path;
231 u64 extent_item_size;
Jan Schmidt558540c2011-06-13 19:59:12 +0200232 const char *errstr;
233 sector_t sector;
234 u64 logical;
235 struct btrfs_device *dev;
Jan Schmidt558540c2011-06-13 19:59:12 +0200236};
237
Qu Wenruo0966a7b2017-04-14 08:35:54 +0800238struct full_stripe_lock {
239 struct rb_node node;
240 u64 logical;
241 u64 refs;
242 struct mutex mutex;
243};
244
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100245static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
246static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
247static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
248static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400249static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
Zhao Leibe50a8d2015-01-20 15:11:42 +0800250static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
Stefan Behrensff023aa2012-11-06 11:43:11 +0100251 struct scrub_block *sblocks_for_recheck);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +0100252static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
Zhao Leiaffe4a52015-08-24 21:32:06 +0800253 struct scrub_block *sblock,
254 int retry_failed_mirror);
Zhao Leiba7cf982015-08-24 21:18:02 +0800255static void scrub_recheck_block_checksum(struct scrub_block *sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400256static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
Zhao Lei114ab502015-01-20 15:11:36 +0800257 struct scrub_block *sblock_good);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400258static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
259 struct scrub_block *sblock_good,
260 int page_num, int force_write);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100261static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
262static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
263 int page_num);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400264static int scrub_checksum_data(struct scrub_block *sblock);
265static int scrub_checksum_tree_block(struct scrub_block *sblock);
266static int scrub_checksum_super(struct scrub_block *sblock);
267static void scrub_block_get(struct scrub_block *sblock);
268static void scrub_block_put(struct scrub_block *sblock);
Stefan Behrens7a9e9982012-11-02 14:58:04 +0100269static void scrub_page_get(struct scrub_page *spage);
270static void scrub_page_put(struct scrub_page *spage);
Miao Xie5a6ac9e2014-11-06 17:20:58 +0800271static void scrub_parity_get(struct scrub_parity *sparity);
272static void scrub_parity_put(struct scrub_parity *sparity);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100273static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
274 struct scrub_page *spage);
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100275static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100276 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +0100277 u64 gen, int mirror_num, u8 *csum, int force,
278 u64 physical_for_dev_replace);
Christoph Hellwig4246a0b2015-07-20 15:29:37 +0200279static void scrub_bio_end_io(struct bio *bio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400280static void scrub_bio_end_io_worker(struct btrfs_work *work);
281static void scrub_block_complete(struct scrub_block *sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100282static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
283 u64 extent_logical, u64 extent_len,
284 u64 *extent_physical,
285 struct btrfs_device **extent_dev,
286 int *extent_mirror_num);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100287static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
288 struct scrub_page *spage);
289static void scrub_wr_submit(struct scrub_ctx *sctx);
Christoph Hellwig4246a0b2015-07-20 15:29:37 +0200290static void scrub_wr_bio_end_io(struct bio *bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100291static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
292static int write_page_nocow(struct scrub_ctx *sctx,
293 u64 physical_for_dev_replace, struct page *page);
294static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
Josef Bacik652f25a2013-09-12 16:58:28 -0400295 struct scrub_copy_nocow_ctx *ctx);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100296static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
297 int mirror_num, u64 physical_for_dev_replace);
298static void copy_nocow_pages_worker(struct btrfs_work *work);
Wang Shilongcb7ab022013-12-04 21:16:53 +0800299static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
Wang Shilong3cb09292013-12-04 21:15:19 +0800300static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
Filipe Mananaf55985f2015-02-09 21:14:24 +0000301static void scrub_put_ctx(struct scrub_ctx *sctx);
Stefan Behrens1623ede2012-03-27 14:21:26 -0400302
303
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100304static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
305{
Elena Reshetova99f4cdb2017-03-03 10:55:25 +0200306 refcount_inc(&sctx->refs);
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100307 atomic_inc(&sctx->bios_in_flight);
308}
309
310static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
311{
312 atomic_dec(&sctx->bios_in_flight);
313 wake_up(&sctx->list_wait);
Filipe Mananaf55985f2015-02-09 21:14:24 +0000314 scrub_put_ctx(sctx);
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100315}
316
Wang Shilongcb7ab022013-12-04 21:16:53 +0800317static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
Wang Shilong3cb09292013-12-04 21:15:19 +0800318{
319 while (atomic_read(&fs_info->scrub_pause_req)) {
320 mutex_unlock(&fs_info->scrub_lock);
321 wait_event(fs_info->scrub_pause_wait,
322 atomic_read(&fs_info->scrub_pause_req) == 0);
323 mutex_lock(&fs_info->scrub_lock);
324 }
325}
326
Zhaolei0e22be82015-08-05 16:43:28 +0800327static void scrub_pause_on(struct btrfs_fs_info *fs_info)
Wang Shilongcb7ab022013-12-04 21:16:53 +0800328{
329 atomic_inc(&fs_info->scrubs_paused);
330 wake_up(&fs_info->scrub_pause_wait);
Zhaolei0e22be82015-08-05 16:43:28 +0800331}
Wang Shilongcb7ab022013-12-04 21:16:53 +0800332
Zhaolei0e22be82015-08-05 16:43:28 +0800333static void scrub_pause_off(struct btrfs_fs_info *fs_info)
334{
Wang Shilongcb7ab022013-12-04 21:16:53 +0800335 mutex_lock(&fs_info->scrub_lock);
336 __scrub_blocked_if_needed(fs_info);
337 atomic_dec(&fs_info->scrubs_paused);
338 mutex_unlock(&fs_info->scrub_lock);
339
340 wake_up(&fs_info->scrub_pause_wait);
341}
342
Zhaolei0e22be82015-08-05 16:43:28 +0800343static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
344{
345 scrub_pause_on(fs_info);
346 scrub_pause_off(fs_info);
347}
348
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100349/*
Qu Wenruo0966a7b2017-04-14 08:35:54 +0800350 * Insert a new full stripe lock into the full stripe locks tree
351 *
352 * Return a pointer to the existing or newly inserted full_stripe_lock
353 * structure if everything works well.
354 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
355 *
356 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
357 * function
358 */
359static struct full_stripe_lock *insert_full_stripe_lock(
360 struct btrfs_full_stripe_locks_tree *locks_root,
361 u64 fstripe_logical)
362{
363 struct rb_node **p;
364 struct rb_node *parent = NULL;
365 struct full_stripe_lock *entry;
366 struct full_stripe_lock *ret;
367
368 WARN_ON(!mutex_is_locked(&locks_root->lock));
369
370 p = &locks_root->root.rb_node;
371 while (*p) {
372 parent = *p;
373 entry = rb_entry(parent, struct full_stripe_lock, node);
374 if (fstripe_logical < entry->logical) {
375 p = &(*p)->rb_left;
376 } else if (fstripe_logical > entry->logical) {
377 p = &(*p)->rb_right;
378 } else {
379 entry->refs++;
380 return entry;
381 }
382 }
383
384 /* Insert new lock */
385 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
386 if (!ret)
387 return ERR_PTR(-ENOMEM);
388 ret->logical = fstripe_logical;
389 ret->refs = 1;
390 mutex_init(&ret->mutex);
391
392 rb_link_node(&ret->node, parent, p);
393 rb_insert_color(&ret->node, &locks_root->root);
394 return ret;
395}
396
397/*
398 * Search for a full stripe lock of a block group
399 *
400 * Return pointer to existing full stripe lock if found
401 * Return NULL if not found
402 */
403static struct full_stripe_lock *search_full_stripe_lock(
404 struct btrfs_full_stripe_locks_tree *locks_root,
405 u64 fstripe_logical)
406{
407 struct rb_node *node;
408 struct full_stripe_lock *entry;
409
410 WARN_ON(!mutex_is_locked(&locks_root->lock));
411
412 node = locks_root->root.rb_node;
413 while (node) {
414 entry = rb_entry(node, struct full_stripe_lock, node);
415 if (fstripe_logical < entry->logical)
416 node = node->rb_left;
417 else if (fstripe_logical > entry->logical)
418 node = node->rb_right;
419 else
420 return entry;
421 }
422 return NULL;
423}
424
425/*
426 * Helper to get full stripe logical from a normal bytenr.
427 *
428 * Caller must ensure @cache is a RAID56 block group.
429 */
430static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
431 u64 bytenr)
432{
433 u64 ret;
434
435 /*
436 * Due to chunk item size limit, full stripe length should not be
437 * larger than U32_MAX. Just a sanity check here.
438 */
439 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
440
441 /*
442 * round_down() can only handle powers of 2, while RAID56 full
443 * stripe length can be 64KiB * n, so we need to manually round down.
444 */
445 ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
446 cache->full_stripe_len + cache->key.objectid;
447 return ret;
448}
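/*
 * Illustrative example with hypothetical numbers: for a RAID5 chunk with
 * three data stripes, full_stripe_len is 3 * 64 KiB = 192 KiB, so a bytenr
 * 200 KiB past cache->key.objectid rounds down to the full stripe starting
 * at cache->key.objectid + 192 KiB.
 */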
449
450/*
451 * Lock a full stripe to keep recovery and regular reads from racing
452 *
453 * It's only used for profiles with parity (RAID5/6); for other profiles it
454 * does nothing.
455 *
456 * Return 0 if we locked the full stripe covering @bytenr, with a mutex held.
457 * The caller must then call unlock_full_stripe() from the same context.
458 *
459 * Return <0 on error.
460 */
461static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
462 bool *locked_ret)
463{
464 struct btrfs_block_group_cache *bg_cache;
465 struct btrfs_full_stripe_locks_tree *locks_root;
466 struct full_stripe_lock *existing;
467 u64 fstripe_start;
468 int ret = 0;
469
470 *locked_ret = false;
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
472 if (!bg_cache) {
473 ASSERT(0);
474 return -ENOENT;
475 }
476
477 /* Profiles not based on parity don't need full stripe lock */
478 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479 goto out;
480 locks_root = &bg_cache->full_stripe_locks_root;
481
482 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
483
484 /* Now insert the full stripe lock */
485 mutex_lock(&locks_root->lock);
486 existing = insert_full_stripe_lock(locks_root, fstripe_start);
487 mutex_unlock(&locks_root->lock);
488 if (IS_ERR(existing)) {
489 ret = PTR_ERR(existing);
490 goto out;
491 }
492 mutex_lock(&existing->mutex);
493 *locked_ret = true;
494out:
495 btrfs_put_block_group(bg_cache);
496 return ret;
497}
498
499/*
500 * Unlock a full stripe.
501 *
502 * NOTE: the caller must call this from the same context as the
503 * corresponding lock_full_stripe().
504 *
505 * Return 0 if we unlocked the full stripe without problems.
506 * Return <0 on error
507 */
508static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
509 bool locked)
510{
511 struct btrfs_block_group_cache *bg_cache;
512 struct btrfs_full_stripe_locks_tree *locks_root;
513 struct full_stripe_lock *fstripe_lock;
514 u64 fstripe_start;
515 bool freeit = false;
516 int ret = 0;
517
518 /* If we didn't acquire full stripe lock, no need to continue */
519 if (!locked)
520 return 0;
521
522 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
523 if (!bg_cache) {
524 ASSERT(0);
525 return -ENOENT;
526 }
527 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
528 goto out;
529
530 locks_root = &bg_cache->full_stripe_locks_root;
531 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
532
533 mutex_lock(&locks_root->lock);
534 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
535 /* Unpaired unlock_full_stripe() detected */
536 if (!fstripe_lock) {
537 WARN_ON(1);
538 ret = -ENOENT;
539 mutex_unlock(&locks_root->lock);
540 goto out;
541 }
542
543 if (fstripe_lock->refs == 0) {
544 WARN_ON(1);
545 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
546 fstripe_lock->logical);
547 } else {
548 fstripe_lock->refs--;
549 }
550
551 if (fstripe_lock->refs == 0) {
552 rb_erase(&fstripe_lock->node, &locks_root->root);
553 freeit = true;
554 }
555 mutex_unlock(&locks_root->lock);
556
557 mutex_unlock(&fstripe_lock->mutex);
558 if (freeit)
559 kfree(fstripe_lock);
560out:
561 btrfs_put_block_group(bg_cache);
562 return ret;
563}
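/*
 * Minimal usage sketch of the pair above (illustrative only; fs_info and
 * logical stand in for whatever the caller has at hand, roughly the way
 * scrub_handle_errored_block() uses it): lock and unlock must happen in
 * the same context, and @locked tells the unlock side whether a parity
 * profile actually took the full stripe lock.
 */
#if 0
	bool locked;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;
	/* ... recheck/repair the blocks covered by this full stripe ... */
	ret = unlock_full_stripe(fs_info, logical, locked);
#endif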
564
565/*
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100566 * used for workers that require transaction commits (i.e., for the
567 * NOCOW case)
568 */
569static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
570{
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400571 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100572
Elena Reshetova99f4cdb2017-03-03 10:55:25 +0200573 refcount_inc(&sctx->refs);
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100574 /*
575 * increment scrubs_running to prevent cancel requests from
576 * completing as long as a worker is running. we must also
577 * increment scrubs_paused to prevent deadlocking on pause
578 * requests used for transaction commits (as the worker uses a
579 * transaction context). it is safe to regard the worker
580 * as paused for all practical matters. effectively, we only
581 * prevent cancellation requests from completing.
582 */
583 mutex_lock(&fs_info->scrub_lock);
584 atomic_inc(&fs_info->scrubs_running);
585 atomic_inc(&fs_info->scrubs_paused);
586 mutex_unlock(&fs_info->scrub_lock);
Wang Shilong32a44782014-02-19 19:24:19 +0800587
588 /*
589 * checking the @scrubs_running == @scrubs_paused condition
590 * inside wait_event() is not an atomic operation,
591 * which means we may inc/dec @scrubs_running/paused
592 * at any time. Wake up @scrub_pause_wait as much as
593 * we can so that a committing transaction is blocked less.
594 */
595 wake_up(&fs_info->scrub_pause_wait);
596
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100597 atomic_inc(&sctx->workers_pending);
598}
599
600/* used for workers that require transaction commits */
601static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
602{
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400603 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100604
605 /*
606 * see scrub_pending_trans_workers_inc() for why we're pretending
607 * to be paused in the scrub counters
608 */
609 mutex_lock(&fs_info->scrub_lock);
610 atomic_dec(&fs_info->scrubs_running);
611 atomic_dec(&fs_info->scrubs_paused);
612 mutex_unlock(&fs_info->scrub_lock);
613 atomic_dec(&sctx->workers_pending);
614 wake_up(&fs_info->scrub_pause_wait);
615 wake_up(&sctx->list_wait);
Filipe Mananaf55985f2015-02-09 21:14:24 +0000616 scrub_put_ctx(sctx);
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100617}
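/*
 * Illustrative pairing sketch (roughly how the NOCOW fixup path uses the
 * two helpers above; fixup and sctx are placeholders): the worker is
 * accounted for before it is queued, and it drops the count itself when
 * it finishes.
 */
#if 0
	scrub_pending_trans_workers_inc(sctx);
	btrfs_queue_work(fs_info->scrub_workers, &fixup->work);
	/* ... and at the end of the queued worker function: */
	scrub_pending_trans_workers_dec(sctx);
#endif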
618
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100619static void scrub_free_csums(struct scrub_ctx *sctx)
Arne Jansena2de7332011-03-08 14:14:00 +0100620{
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100621 while (!list_empty(&sctx->csum_list)) {
Arne Jansena2de7332011-03-08 14:14:00 +0100622 struct btrfs_ordered_sum *sum;
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100623 sum = list_first_entry(&sctx->csum_list,
Arne Jansena2de7332011-03-08 14:14:00 +0100624 struct btrfs_ordered_sum, list);
625 list_del(&sum->list);
626 kfree(sum);
627 }
628}
629
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100630static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
Arne Jansena2de7332011-03-08 14:14:00 +0100631{
632 int i;
Arne Jansena2de7332011-03-08 14:14:00 +0100633
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100634 if (!sctx)
Arne Jansena2de7332011-03-08 14:14:00 +0100635 return;
636
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400637 /* this can happen when scrub is cancelled */
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100638 if (sctx->curr != -1) {
639 struct scrub_bio *sbio = sctx->bios[sctx->curr];
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400640
641 for (i = 0; i < sbio->page_count; i++) {
Stefan Behrensff023aa2012-11-06 11:43:11 +0100642 WARN_ON(!sbio->pagev[i]->page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400643 scrub_block_put(sbio->pagev[i]->sblock);
644 }
645 bio_put(sbio->bio);
646 }
647
Stefan Behrensff023aa2012-11-06 11:43:11 +0100648 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100649 struct scrub_bio *sbio = sctx->bios[i];
Arne Jansena2de7332011-03-08 14:14:00 +0100650
651 if (!sbio)
652 break;
Arne Jansena2de7332011-03-08 14:14:00 +0100653 kfree(sbio);
654 }
655
David Sterba3fb99302017-05-16 19:10:32 +0200656 kfree(sctx->wr_curr_bio);
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100657 scrub_free_csums(sctx);
658 kfree(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +0100659}
660
Filipe Mananaf55985f2015-02-09 21:14:24 +0000661static void scrub_put_ctx(struct scrub_ctx *sctx)
662{
Elena Reshetova99f4cdb2017-03-03 10:55:25 +0200663 if (refcount_dec_and_test(&sctx->refs))
Filipe Mananaf55985f2015-02-09 21:14:24 +0000664 scrub_free_ctx(sctx);
665}
666
Arne Jansena2de7332011-03-08 14:14:00 +0100667static noinline_for_stack
Stefan Behrens63a212a2012-11-05 18:29:28 +0100668struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +0100669{
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100670 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +0100671 int i;
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400672 struct btrfs_fs_info *fs_info = dev->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +0100673
David Sterba58c4e172016-02-11 10:49:42 +0100674 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100675 if (!sctx)
Arne Jansena2de7332011-03-08 14:14:00 +0100676 goto nomem;
Elena Reshetova99f4cdb2017-03-03 10:55:25 +0200677 refcount_set(&sctx->refs, 1);
Stefan Behrens63a212a2012-11-05 18:29:28 +0100678 sctx->is_dev_replace = is_dev_replace;
Kent Overstreetb54ffb72015-05-19 14:31:01 +0200679 sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100680 sctx->curr = -1;
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400681 sctx->fs_info = dev->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100682 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
Arne Jansena2de7332011-03-08 14:14:00 +0100683 struct scrub_bio *sbio;
684
David Sterba58c4e172016-02-11 10:49:42 +0100685 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
Arne Jansena2de7332011-03-08 14:14:00 +0100686 if (!sbio)
687 goto nomem;
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100688 sctx->bios[i] = sbio;
Arne Jansena2de7332011-03-08 14:14:00 +0100689
Arne Jansena2de7332011-03-08 14:14:00 +0100690 sbio->index = i;
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100691 sbio->sctx = sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400692 sbio->page_count = 0;
Liu Bo9e0af232014-08-15 23:36:53 +0800693 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
694 scrub_bio_end_io_worker, NULL, NULL);
Arne Jansena2de7332011-03-08 14:14:00 +0100695
Stefan Behrensff023aa2012-11-06 11:43:11 +0100696 if (i != SCRUB_BIOS_PER_SCTX - 1)
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100697 sctx->bios[i]->next_free = i + 1;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200698 else
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100699 sctx->bios[i]->next_free = -1;
Arne Jansena2de7332011-03-08 14:14:00 +0100700 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100701 sctx->first_free = 0;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +0100702 atomic_set(&sctx->bios_in_flight, 0);
703 atomic_set(&sctx->workers_pending, 0);
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100704 atomic_set(&sctx->cancel_req, 0);
705 sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
706 INIT_LIST_HEAD(&sctx->csum_list);
Arne Jansena2de7332011-03-08 14:14:00 +0100707
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100708 spin_lock_init(&sctx->list_lock);
709 spin_lock_init(&sctx->stat_lock);
710 init_waitqueue_head(&sctx->list_wait);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100711
David Sterba3fb99302017-05-16 19:10:32 +0200712 WARN_ON(sctx->wr_curr_bio != NULL);
713 mutex_init(&sctx->wr_lock);
714 sctx->wr_curr_bio = NULL;
David Sterba8fcdac32017-05-16 19:10:23 +0200715 if (is_dev_replace) {
716 WARN_ON(!dev->bdev);
David Sterba3fb99302017-05-16 19:10:32 +0200717 sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
718 sctx->wr_tgtdev = dev;
719 atomic_set(&sctx->flush_all_writes, 0);
Stefan Behrensff023aa2012-11-06 11:43:11 +0100720 }
David Sterba8fcdac32017-05-16 19:10:23 +0200721
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100722 return sctx;
Arne Jansena2de7332011-03-08 14:14:00 +0100723
724nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +0100725 scrub_free_ctx(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +0100726 return ERR_PTR(-ENOMEM);
727}
728
Stefan Behrensff023aa2012-11-06 11:43:11 +0100729static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
730 void *warn_ctx)
Jan Schmidt558540c2011-06-13 19:59:12 +0200731{
732 u64 isize;
733 u32 nlink;
734 int ret;
735 int i;
736 struct extent_buffer *eb;
737 struct btrfs_inode_item *inode_item;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100738 struct scrub_warning *swarn = warn_ctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400739 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
Jan Schmidt558540c2011-06-13 19:59:12 +0200740 struct inode_fs_paths *ipath = NULL;
741 struct btrfs_root *local_root;
742 struct btrfs_key root_key;
David Sterba1d4c08e2015-01-02 19:36:14 +0100743 struct btrfs_key key;
Jan Schmidt558540c2011-06-13 19:59:12 +0200744
745 root_key.objectid = root;
746 root_key.type = BTRFS_ROOT_ITEM_KEY;
747 root_key.offset = (u64)-1;
748 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
749 if (IS_ERR(local_root)) {
750 ret = PTR_ERR(local_root);
751 goto err;
752 }
753
David Sterba14692cc2015-01-02 18:55:46 +0100754 /*
755 * this makes the path point to (inum INODE_ITEM ioff)
756 */
David Sterba1d4c08e2015-01-02 19:36:14 +0100757 key.objectid = inum;
758 key.type = BTRFS_INODE_ITEM_KEY;
759 key.offset = 0;
760
761 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
Jan Schmidt558540c2011-06-13 19:59:12 +0200762 if (ret) {
763 btrfs_release_path(swarn->path);
764 goto err;
765 }
766
767 eb = swarn->path->nodes[0];
768 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
769 struct btrfs_inode_item);
770 isize = btrfs_inode_size(eb, inode_item);
771 nlink = btrfs_inode_nlink(eb, inode_item);
772 btrfs_release_path(swarn->path);
773
774 ipath = init_ipath(4096, local_root, swarn->path);
Dan Carpenter26bdef52011-11-16 11:28:01 +0300775 if (IS_ERR(ipath)) {
776 ret = PTR_ERR(ipath);
777 ipath = NULL;
778 goto err;
779 }
Jan Schmidt558540c2011-06-13 19:59:12 +0200780 ret = paths_from_inode(inum, ipath);
781
782 if (ret < 0)
783 goto err;
784
785 /*
786 * we deliberately ignore the fact that ipath might have been too
787 * small to hold all of the paths here
788 */
789 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
Jeff Mahoney5d163e02016-09-20 10:05:00 -0400790 btrfs_warn_in_rcu(fs_info,
791 "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
792 swarn->errstr, swarn->logical,
793 rcu_str_deref(swarn->dev->name),
794 (unsigned long long)swarn->sector,
795 root, inum, offset,
796 min(isize - offset, (u64)PAGE_SIZE), nlink,
797 (char *)(unsigned long)ipath->fspath->val[i]);
Jan Schmidt558540c2011-06-13 19:59:12 +0200798
799 free_ipath(ipath);
800 return 0;
801
802err:
Jeff Mahoney5d163e02016-09-20 10:05:00 -0400803 btrfs_warn_in_rcu(fs_info,
804 "%s at logical %llu on dev %s, sector %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
805 swarn->errstr, swarn->logical,
806 rcu_str_deref(swarn->dev->name),
807 (unsigned long long)swarn->sector,
808 root, inum, offset, ret);
Jan Schmidt558540c2011-06-13 19:59:12 +0200809
810 free_ipath(ipath);
811 return 0;
812}
813
Stefan Behrensb5d67f62012-03-27 14:21:27 -0400814static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
Jan Schmidt558540c2011-06-13 19:59:12 +0200815{
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100816 struct btrfs_device *dev;
817 struct btrfs_fs_info *fs_info;
Jan Schmidt558540c2011-06-13 19:59:12 +0200818 struct btrfs_path *path;
819 struct btrfs_key found_key;
820 struct extent_buffer *eb;
821 struct btrfs_extent_item *ei;
822 struct scrub_warning swarn;
Jan Schmidt558540c2011-06-13 19:59:12 +0200823 unsigned long ptr = 0;
Jan Schmidt4692cf52011-12-02 14:56:41 +0100824 u64 extent_item_pos;
Liu Bo69917e42012-09-07 20:01:28 -0600825 u64 flags = 0;
826 u64 ref_root;
827 u32 item_size;
Dan Carpenter07c9a8e2016-03-11 11:08:56 +0300828 u8 ref_level = 0;
Liu Bo69917e42012-09-07 20:01:28 -0600829 int ret;
Jan Schmidt558540c2011-06-13 19:59:12 +0200830
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100831 WARN_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +0100832 dev = sblock->pagev[0]->dev;
Jeff Mahoneyfb456252016-06-22 18:54:56 -0400833 fs_info = sblock->sctx->fs_info;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100834
Jan Schmidt558540c2011-06-13 19:59:12 +0200835 path = btrfs_alloc_path();
David Sterba8b9456d2014-07-30 01:25:30 +0200836 if (!path)
837 return;
Jan Schmidt558540c2011-06-13 19:59:12 +0200838
Stefan Behrens7a9e9982012-11-02 14:58:04 +0100839 swarn.sector = (sblock->pagev[0]->physical) >> 9;
840 swarn.logical = sblock->pagev[0]->logical;
Jan Schmidt558540c2011-06-13 19:59:12 +0200841 swarn.errstr = errstr;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100842 swarn.dev = NULL;
Jan Schmidt558540c2011-06-13 19:59:12 +0200843
Liu Bo69917e42012-09-07 20:01:28 -0600844 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
845 &flags);
Jan Schmidt558540c2011-06-13 19:59:12 +0200846 if (ret < 0)
847 goto out;
848
Jan Schmidt4692cf52011-12-02 14:56:41 +0100849 extent_item_pos = swarn.logical - found_key.objectid;
Jan Schmidt558540c2011-06-13 19:59:12 +0200850 swarn.extent_item_size = found_key.offset;
851
852 eb = path->nodes[0];
853 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
854 item_size = btrfs_item_size_nr(eb, path->slots[0]);
855
Liu Bo69917e42012-09-07 20:01:28 -0600856 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
Jan Schmidt558540c2011-06-13 19:59:12 +0200857 do {
Liu Bo6eda71d2014-06-09 10:54:07 +0800858 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
859 item_size, &ref_root,
860 &ref_level);
David Sterbaecaeb142015-10-08 09:01:03 +0200861 btrfs_warn_in_rcu(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -0400862 "%s at logical %llu on dev %s, sector %llu: metadata %s (level %d) in tree %llu",
863 errstr, swarn.logical,
Josef Bacik606686e2012-06-04 14:03:51 -0400864 rcu_str_deref(dev->name),
Jan Schmidt558540c2011-06-13 19:59:12 +0200865 (unsigned long long)swarn.sector,
866 ref_level ? "node" : "leaf",
867 ret < 0 ? -1 : ref_level,
868 ret < 0 ? -1 : ref_root);
869 } while (ret != 1);
Josef Bacikd8fe29e2013-03-29 08:09:34 -0600870 btrfs_release_path(path);
Jan Schmidt558540c2011-06-13 19:59:12 +0200871 } else {
Josef Bacikd8fe29e2013-03-29 08:09:34 -0600872 btrfs_release_path(path);
Jan Schmidt558540c2011-06-13 19:59:12 +0200873 swarn.path = path;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +0100874 swarn.dev = dev;
Jan Schmidt7a3ae2f2012-03-23 17:32:28 +0100875 iterate_extent_inodes(fs_info, found_key.objectid,
876 extent_item_pos, 1,
Jan Schmidt558540c2011-06-13 19:59:12 +0200877 scrub_print_warning_inode, &swarn);
878 }
879
880out:
881 btrfs_free_path(path);
Jan Schmidt558540c2011-06-13 19:59:12 +0200882}
883
Stefan Behrensff023aa2012-11-06 11:43:11 +0100884static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200885{
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200886 struct page *page = NULL;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200887 unsigned long index;
Stefan Behrensff023aa2012-11-06 11:43:11 +0100888 struct scrub_fixup_nodatasum *fixup = fixup_ctx;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200889 int ret;
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200890 int corrected = 0;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200891 struct btrfs_key key;
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200892 struct inode *inode = NULL;
Liu Bo6f1c3602013-01-29 03:22:10 +0000893 struct btrfs_fs_info *fs_info;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200894 u64 end = offset + PAGE_SIZE - 1;
895 struct btrfs_root *local_root;
Liu Bo6f1c3602013-01-29 03:22:10 +0000896 int srcu_index;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200897
898 key.objectid = root;
899 key.type = BTRFS_ROOT_ITEM_KEY;
900 key.offset = (u64)-1;
Liu Bo6f1c3602013-01-29 03:22:10 +0000901
902 fs_info = fixup->root->fs_info;
903 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
904
905 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
906 if (IS_ERR(local_root)) {
907 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200908 return PTR_ERR(local_root);
Liu Bo6f1c3602013-01-29 03:22:10 +0000909 }
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200910
911 key.type = BTRFS_INODE_ITEM_KEY;
912 key.objectid = inum;
913 key.offset = 0;
Liu Bo6f1c3602013-01-29 03:22:10 +0000914 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
915 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200916 if (IS_ERR(inode))
917 return PTR_ERR(inode);
918
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +0300919 index = offset >> PAGE_SHIFT;
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200920
921 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200922 if (!page) {
923 ret = -ENOMEM;
924 goto out;
925 }
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200926
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200927 if (PageUptodate(page)) {
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200928 if (PageDirty(page)) {
929 /*
929 * we need to write the data to the defective sector. the
931 * data that was in that sector is not in memory,
932 * because the page was modified. we must not write the
933 * modified page to that sector.
934 *
935 * TODO: what could be done here: wait for the delalloc
936 * runner to write out that page (might involve
937 * COW) and see whether the sector is still
938 * referenced afterwards.
939 *
939 * For the time being, we'll treat this error as
940 * uncorrectable, although there is a chance that a
941 * later scrub will find the bad sector again and that
942 * there will be no dirty page in memory by then.
944 */
945 ret = -EIO;
946 goto out;
947 }
Josef Bacik6ec656b2017-05-05 11:57:14 -0400948 ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200949 fixup->logical, page,
Miao Xieffdd2012014-09-12 18:44:00 +0800950 offset - page_offset(page),
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200951 fixup->mirror_num);
952 unlock_page(page);
953 corrected = !ret;
954 } else {
955 /*
956 * we need to get good data first. the general readpage path
957 * will call repair_io_failure for us, we just have to make
958 * sure we read the bad mirror.
959 */
960 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
David Sterbaceeb0ae2016-04-26 23:54:39 +0200961 EXTENT_DAMAGED);
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200962 if (ret) {
963 /* set_extent_bits should give proper error */
964 WARN_ON(ret > 0);
965 if (ret > 0)
966 ret = -EFAULT;
967 goto out;
968 }
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200969
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200970 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
971 btrfs_get_extent,
972 fixup->mirror_num);
973 wait_on_page_locked(page);
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200974
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200975 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
976 end, EXTENT_DAMAGED, 0, NULL);
977 if (!corrected)
978 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
David Sterba91166212016-04-26 23:54:39 +0200979 EXTENT_DAMAGED);
Jan Schmidt5da6fcb2011-08-04 18:11:04 +0200980 }
981
982out:
983 if (page)
984 put_page(page);
Tobias Klauser7fb18a02014-04-25 14:58:05 +0200985
986 iput(inode);
Jan Schmidt0ef8e452011-06-13 20:04:15 +0200987
988 if (ret < 0)
989 return ret;
990
991 if (ret == 0 && corrected) {
992 /*
993 * we only need to call readpage for one of the inodes belonging
994 * to this extent. so make iterate_extent_inodes stop
995 */
996 return 1;
997 }
998
999 return -EIO;
1000}
1001
1002static void scrub_fixup_nodatasum(struct btrfs_work *work)
1003{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001004 struct btrfs_fs_info *fs_info;
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001005 int ret;
1006 struct scrub_fixup_nodatasum *fixup;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001007 struct scrub_ctx *sctx;
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001008 struct btrfs_trans_handle *trans = NULL;
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001009 struct btrfs_path *path;
1010 int uncorrectable = 0;
1011
1012 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001013 sctx = fixup->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001014 fs_info = fixup->root->fs_info;
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001015
1016 path = btrfs_alloc_path();
1017 if (!path) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001018 spin_lock(&sctx->stat_lock);
1019 ++sctx->stat.malloc_errors;
1020 spin_unlock(&sctx->stat_lock);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001021 uncorrectable = 1;
1022 goto out;
1023 }
1024
1025 trans = btrfs_join_transaction(fixup->root);
1026 if (IS_ERR(trans)) {
1027 uncorrectable = 1;
1028 goto out;
1029 }
1030
1031 /*
1032 * the idea is to trigger a regular read through the standard path. we
1033 * read a page from the (failed) logical address by specifying the
1034 * corresponding copynum of the failed sector. thus, that readpage is
1035 * expected to fail.
1036 * that is the point where on-the-fly error correction will kick in
1037 * (once it's finished) and rewrite the failed sector if a good copy
1038 * can be found.
1039 */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001040 ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1041 scrub_fixup_readpage, fixup);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001042 if (ret < 0) {
1043 uncorrectable = 1;
1044 goto out;
1045 }
1046 WARN_ON(ret != 1);
1047
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001048 spin_lock(&sctx->stat_lock);
1049 ++sctx->stat.corrected_errors;
1050 spin_unlock(&sctx->stat_lock);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001051
1052out:
1053 if (trans && !IS_ERR(trans))
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04001054 btrfs_end_transaction(trans);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001055 if (uncorrectable) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001056 spin_lock(&sctx->stat_lock);
1057 ++sctx->stat.uncorrectable_errors;
1058 spin_unlock(&sctx->stat_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001059 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001060 &fs_info->dev_replace.num_uncorrectable_read_errors);
1061 btrfs_err_rl_in_rcu(fs_info,
David Sterbab14af3b2015-10-08 10:43:10 +02001062 "unable to fixup (nodatasum) error at logical %llu on dev %s",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001063 fixup->logical, rcu_str_deref(fixup->dev->name));
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001064 }
1065
1066 btrfs_free_path(path);
1067 kfree(fixup);
1068
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01001069 scrub_pending_trans_workers_dec(sctx);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02001070}
1071
Miao Xieaf8e2d12014-10-23 14:42:50 +08001072static inline void scrub_get_recover(struct scrub_recover *recover)
1073{
Elena Reshetova6f615012017-03-03 10:55:21 +02001074 refcount_inc(&recover->refs);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001075}
1076
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001077static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1078 struct scrub_recover *recover)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001079{
Elena Reshetova6f615012017-03-03 10:55:21 +02001080 if (refcount_dec_and_test(&recover->refs)) {
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001081 btrfs_bio_counter_dec(fs_info);
Zhao Lei6e9606d2015-01-20 15:11:34 +08001082 btrfs_put_bbio(recover->bbio);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001083 kfree(recover);
1084 }
1085}
1086
Arne Jansena2de7332011-03-08 14:14:00 +01001087/*
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001088 * scrub_handle_errored_block gets called when either verification of the
1089 * pages failed or the bio failed to read, e.g. with EIO. In the latter
1090 * case, this function handles all pages in the bio, even though only one
1091 * may be bad.
1092 * The goal of this function is to repair the errored block by using the
1093 * contents of one of the mirrors.
Arne Jansena2de7332011-03-08 14:14:00 +01001094 */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001095static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
Arne Jansena2de7332011-03-08 14:14:00 +01001096{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001097 struct scrub_ctx *sctx = sblock_to_check->sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001098 struct btrfs_device *dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001099 struct btrfs_fs_info *fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01001100 u64 length;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001101 u64 logical;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001102 unsigned int failed_mirror_index;
1103 unsigned int is_metadata;
1104 unsigned int have_csum;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001105 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1106 struct scrub_block *sblock_bad;
Arne Jansena2de7332011-03-08 14:14:00 +01001107 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001108 int mirror_index;
1109 int page_num;
1110 int success;
Qu Wenruo28d70e22017-04-14 08:35:55 +08001111 bool full_stripe_locked;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001112 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1113 DEFAULT_RATELIMIT_BURST);
Arne Jansena2de7332011-03-08 14:14:00 +01001114
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001115 BUG_ON(sblock_to_check->page_count < 1);
Jeff Mahoneyfb456252016-06-22 18:54:56 -04001116 fs_info = sctx->fs_info;
Stefan Behrens4ded4f62012-11-14 18:57:29 +00001117 if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1118 /*
1119 * if we find an error in a super block, we just report it.
1120 * They will get written with the next transaction commit
1121 * anyway
1122 */
1123 spin_lock(&sctx->stat_lock);
1124 ++sctx->stat.super_errors;
1125 spin_unlock(&sctx->stat_lock);
1126 return 0;
1127 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001128 length = sblock_to_check->page_count * PAGE_SIZE;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001129 logical = sblock_to_check->pagev[0]->logical;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001130 BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1131 failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1132 is_metadata = !(sblock_to_check->pagev[0]->flags &
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001133 BTRFS_EXTENT_FLAG_DATA);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001134 have_csum = sblock_to_check->pagev[0]->have_csum;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001135 dev = sblock_to_check->pagev[0]->dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001136
Qu Wenruo28d70e22017-04-14 08:35:55 +08001137 /*
1138 * For RAID5/6, a race can happen between scrub threads of different
1139 * devices. On data corruption, the parity and data threads will both
1140 * try to recover the data.
1141 * The race can lead to double-counted csum errors, or even to an
1142 * unrecoverable error.
1143 */
1144 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1145 if (ret < 0) {
1146 spin_lock(&sctx->stat_lock);
1147 if (ret == -ENOMEM)
1148 sctx->stat.malloc_errors++;
1149 sctx->stat.read_errors++;
1150 sctx->stat.uncorrectable_errors++;
1151 spin_unlock(&sctx->stat_lock);
1152 return ret;
1153 }
1154
Stefan Behrensff023aa2012-11-06 11:43:11 +01001155 if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1156 sblocks_for_recheck = NULL;
1157 goto nodatasum_case;
1158 }
1159
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001160 /*
1161 * read all mirrors one after the other. This includes
1162 * re-reading the extent or metadata block that failed (that is
1163 * the reason this fixup code was called) one more time,
1164 * page by page this time in order to know which pages
1165 * caused I/O errors and which ones are good (for all mirrors).
1166 * The goal is to handle the situation when more than one
1167 * mirror contains I/O errors, but the errors do not
1168 * overlap, i.e. the data can be repaired by selecting the
1169 * pages from those mirrors without I/O error on the
1170 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1171 * would be that mirror #1 has an I/O error on the first page,
1172 * the second page is good, and mirror #2 has an I/O error on
1173 * the second page, but the first page is good.
1174 * Then the first page of the first mirror can be repaired by
1175 * taking the first page of the second mirror, and the
1176 * second page of the second mirror can be repaired by
1177 * copying the contents of the 2nd page of the 1st mirror.
1178 * One more note: if the pages of one mirror contain I/O
1179 * errors, the checksum cannot be verified. In order to get
1180 * the best data for repairing, the first attempt is to find
1181 * a mirror without I/O errors and with a validated checksum.
1182 * Only if this is not possible, the pages are picked from
1183 * mirrors with I/O errors without considering the checksum.
1184 * If the latter is the case, at the end, the checksum of the
1185 * repaired area is verified in order to correctly maintain
1186 * the statistics.
1187 */
1188
David Sterba31e818f2015-02-20 18:00:26 +01001189 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1190 sizeof(*sblocks_for_recheck), GFP_NOFS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001191 if (!sblocks_for_recheck) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001192 spin_lock(&sctx->stat_lock);
1193 sctx->stat.malloc_errors++;
1194 sctx->stat.read_errors++;
1195 sctx->stat.uncorrectable_errors++;
1196 spin_unlock(&sctx->stat_lock);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001197 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001198 goto out;
1199 }
1200
1201 /* setup the context, map the logical blocks and alloc the pages */
Zhao Leibe50a8d2015-01-20 15:11:42 +08001202 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001203 if (ret) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001204 spin_lock(&sctx->stat_lock);
1205 sctx->stat.read_errors++;
1206 sctx->stat.uncorrectable_errors++;
1207 spin_unlock(&sctx->stat_lock);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001208 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001209 goto out;
1210 }
1211 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1212 sblock_bad = sblocks_for_recheck + failed_mirror_index;
1213
1214 /* build and submit the bios for the failed mirror, check checksums */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001215 scrub_recheck_block(fs_info, sblock_bad, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001216
1217 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1218 sblock_bad->no_io_error_seen) {
1219 /*
1220 * the error disappeared after reading page by page, or
1221 * the area was part of a huge bio and other parts of the
1222 * bio caused I/O errors, or the block layer merged several
1223 * read requests into one and the error is caused by a
1224 * different bio (usually one of the latter two cases is
1225 * the cause)
1226 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001227 spin_lock(&sctx->stat_lock);
1228 sctx->stat.unverified_errors++;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001229 sblock_to_check->data_corrected = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001230 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001231
Stefan Behrensff023aa2012-11-06 11:43:11 +01001232 if (sctx->is_dev_replace)
1233 scrub_write_block_to_dev_replace(sblock_bad);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001234 goto out;
1235 }
1236
1237 if (!sblock_bad->no_io_error_seen) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001238 spin_lock(&sctx->stat_lock);
1239 sctx->stat.read_errors++;
1240 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001241 if (__ratelimit(&_rs))
1242 scrub_print_warning("i/o error", sblock_to_check);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001243 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001244 } else if (sblock_bad->checksum_error) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001245 spin_lock(&sctx->stat_lock);
1246 sctx->stat.csum_errors++;
1247 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001248 if (__ratelimit(&_rs))
1249 scrub_print_warning("checksum error", sblock_to_check);
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001250 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001251 BTRFS_DEV_STAT_CORRUPTION_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001252 } else if (sblock_bad->header_error) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001253 spin_lock(&sctx->stat_lock);
1254 sctx->stat.verify_errors++;
1255 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001256 if (__ratelimit(&_rs))
1257 scrub_print_warning("checksum/header error",
1258 sblock_to_check);
Stefan Behrens442a4f62012-05-25 16:06:08 +02001259 if (sblock_bad->generation_error)
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001260 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001261 BTRFS_DEV_STAT_GENERATION_ERRS);
1262 else
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001263 btrfs_dev_stat_inc_and_print(dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02001264 BTRFS_DEV_STAT_CORRUPTION_ERRS);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001265 }
1266
Ilya Dryomov33ef30a2013-11-03 19:06:38 +02001267 if (sctx->readonly) {
1268 ASSERT(!sctx->is_dev_replace);
1269 goto out;
1270 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001271
1272 if (!is_metadata && !have_csum) {
1273 struct scrub_fixup_nodatasum *fixup_nodatasum;
1274
Stefan Behrensff023aa2012-11-06 11:43:11 +01001275 WARN_ON(sctx->is_dev_replace);
1276
Zhao Leib25c94c2015-01-20 15:11:35 +08001277nodatasum_case:
1278
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001279 /*
1280	 * !is_metadata and !have_csum: this means that the data
Nicholas D Steeves01327612016-05-19 21:18:45 -04001281	 * might not be COWed, i.e. it might be modified
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001282	 * concurrently. The general strategy of working on the
1283	 * commit root does not help in the case when COW is not
1284	 * used.
1285 */
1286 fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1287 if (!fixup_nodatasum)
1288 goto did_not_correct_error;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001289 fixup_nodatasum->sctx = sctx;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01001290 fixup_nodatasum->dev = dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001291 fixup_nodatasum->logical = logical;
1292 fixup_nodatasum->root = fs_info->extent_root;
1293 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01001294 scrub_pending_trans_workers_inc(sctx);
Liu Bo9e0af232014-08-15 23:36:53 +08001295 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1296 scrub_fixup_nodatasum, NULL, NULL);
Qu Wenruo0339ef22014-02-28 10:46:17 +08001297 btrfs_queue_work(fs_info->scrub_workers,
1298 &fixup_nodatasum->work);
Arne Jansena2de7332011-03-08 14:14:00 +01001299 goto out;
1300 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001301
1302 /*
1303 * now build and submit the bios for the other mirrors, check
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001304 * checksums.
1305 * First try to pick the mirror which is completely without I/O
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001306 * errors and also does not have a checksum error.
1307 * If one is found, and if a checksum is present, the full block
1308 * that is known to contain an error is rewritten. Afterwards
1309 * the block is known to be corrected.
1310 * If a mirror is found which is completely correct, and no
1311 * checksum is present, only those pages are rewritten that had
1312	 * an I/O error in the block to be repaired, since it cannot be
1313	 * determined which copy of the other pages is better (and it
1314 * could happen otherwise that a correct page would be
1315 * overwritten by a bad one).
1316 */
1317 for (mirror_index = 0;
1318 mirror_index < BTRFS_MAX_MIRRORS &&
1319 sblocks_for_recheck[mirror_index].page_count > 0;
1320 mirror_index++) {
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001321 struct scrub_block *sblock_other;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001322
Stefan Behrenscb2ced72012-11-02 16:14:21 +01001323 if (mirror_index == failed_mirror_index)
1324 continue;
1325 sblock_other = sblocks_for_recheck + mirror_index;
1326
1327 /* build and submit the bios, check checksums */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001328 scrub_recheck_block(fs_info, sblock_other, 0);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001329
1330 if (!sblock_other->header_error &&
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001331 !sblock_other->checksum_error &&
1332 sblock_other->no_io_error_seen) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01001333 if (sctx->is_dev_replace) {
1334 scrub_write_block_to_dev_replace(sblock_other);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001335 goto corrected_error;
Zhao Lei114ab502015-01-20 15:11:36 +08001336 } else {
1337 ret = scrub_repair_block_from_good_copy(
1338 sblock_bad, sblock_other);
1339 if (!ret)
1340 goto corrected_error;
1341 }
Arne Jansena2de7332011-03-08 14:14:00 +01001342 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001343 }
1344
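	/*
	 * No mirror passed all checks above.  If the bad block itself
	 * had no I/O errors (i.e. it failed on a checksum or header
	 * mismatch) and this is not dev-replace, the page-by-page
	 * strategy below cannot help either, so give up right away.
	 */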
Zhao Leib968fed2015-01-20 15:11:41 +08001345 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1346 goto did_not_correct_error;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001347
1348 /*
Stefan Behrensff023aa2012-11-06 11:43:11 +01001349 * In case of I/O errors in the area that is supposed to be
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001350 * repaired, continue by picking good copies of those pages.
1351 * Select the good pages from mirrors to rewrite bad pages from
1352 * the area to fix. Afterwards verify the checksum of the block
1353 * that is supposed to be repaired. This verification step is
1354	 * only done for the purpose of statistics counting and for the
1355	 * final scrub report on whether errors remain.
1356 * A perfect algorithm could make use of the checksum and try
1357 * all possible combinations of pages from the different mirrors
1358 * until the checksum verification succeeds. For example, when
1359 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1360 * of mirror #2 is readable but the final checksum test fails,
1361	 * then the 2nd page of mirror #3 could be tried to see whether
Nicholas D Steeves01327612016-05-19 21:18:45 -04001362	 * the final checksum now succeeds. But this would be a rare
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001363	 * exception and is therefore not implemented. At least it is
1364	 * ensured that the good copy is never overwritten.
1365 * A more useful improvement would be to pick the sectors
1366 * without I/O error based on sector sizes (512 bytes on legacy
1367	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1368	 * mirror could be repaired by taking 512 bytes of a different
1369 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1370 * area are unreadable.
1371 */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001372 success = 1;
Zhao Leib968fed2015-01-20 15:11:41 +08001373 for (page_num = 0; page_num < sblock_bad->page_count;
1374 page_num++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001375 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
Zhao Leib968fed2015-01-20 15:11:41 +08001376 struct scrub_block *sblock_other = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001377
Zhao Leib968fed2015-01-20 15:11:41 +08001378 /* skip no-io-error page in scrub */
1379 if (!page_bad->io_error && !sctx->is_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001380 continue;
1381
Zhao Leib968fed2015-01-20 15:11:41 +08001382 /* try to find no-io-error page in mirrors */
1383 if (page_bad->io_error) {
1384 for (mirror_index = 0;
1385 mirror_index < BTRFS_MAX_MIRRORS &&
1386 sblocks_for_recheck[mirror_index].page_count > 0;
1387 mirror_index++) {
1388 if (!sblocks_for_recheck[mirror_index].
1389 pagev[page_num]->io_error) {
1390 sblock_other = sblocks_for_recheck +
1391 mirror_index;
1392 break;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001393 }
Jan Schmidt13db62b2011-06-13 19:56:13 +02001394 }
Zhao Leib968fed2015-01-20 15:11:41 +08001395 if (!sblock_other)
1396 success = 0;
Jan Schmidt13db62b2011-06-13 19:56:13 +02001397 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001398
Zhao Leib968fed2015-01-20 15:11:41 +08001399 if (sctx->is_dev_replace) {
1400 /*
1401 * did not find a mirror to fetch the page
1402 * from. scrub_write_page_to_dev_replace()
1403	 * handles this case (page->io_error) by
1404 * filling the block with zeros before
1405 * submitting the write request
1406 */
1407 if (!sblock_other)
1408 sblock_other = sblock_bad;
1409
1410 if (scrub_write_page_to_dev_replace(sblock_other,
1411 page_num) != 0) {
1412 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001413 &fs_info->dev_replace.num_write_errors);
Zhao Leib968fed2015-01-20 15:11:41 +08001414 success = 0;
1415 }
1416 } else if (sblock_other) {
1417 ret = scrub_repair_page_from_good_copy(sblock_bad,
1418 sblock_other,
1419 page_num, 0);
1420 if (0 == ret)
1421 page_bad->io_error = 0;
1422 else
1423 success = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001424 }
1425 }
1426
Zhao Leib968fed2015-01-20 15:11:41 +08001427 if (success && !sctx->is_dev_replace) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001428 if (is_metadata || have_csum) {
1429 /*
1430 * need to verify the checksum now that all
1431 * sectors on disk are repaired (the write
1432 * request for data to be repaired is on its way).
1433 * Just be lazy and use scrub_recheck_block()
1434 * which re-reads the data before the checksum
1435 * is verified, but most likely the data comes out
1436 * of the page cache.
1437 */
Zhao Leiaffe4a52015-08-24 21:32:06 +08001438 scrub_recheck_block(fs_info, sblock_bad, 1);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001439 if (!sblock_bad->header_error &&
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001440 !sblock_bad->checksum_error &&
1441 sblock_bad->no_io_error_seen)
1442 goto corrected_error;
1443 else
1444 goto did_not_correct_error;
1445 } else {
1446corrected_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001447 spin_lock(&sctx->stat_lock);
1448 sctx->stat.corrected_errors++;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001449 sblock_to_check->data_corrected = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001450 spin_unlock(&sctx->stat_lock);
David Sterbab14af3b2015-10-08 10:43:10 +02001451 btrfs_err_rl_in_rcu(fs_info,
1452 "fixed up error at logical %llu on dev %s",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001453 logical, rcu_str_deref(dev->name));
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001454 }
1455 } else {
1456did_not_correct_error:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001457 spin_lock(&sctx->stat_lock);
1458 sctx->stat.uncorrectable_errors++;
1459 spin_unlock(&sctx->stat_lock);
David Sterbab14af3b2015-10-08 10:43:10 +02001460 btrfs_err_rl_in_rcu(fs_info,
1461 "unable to fixup (regular) error at logical %llu on dev %s",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02001462 logical, rcu_str_deref(dev->name));
Arne Jansena2de7332011-03-08 14:14:00 +01001463 }
1464
1465out:
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001466 if (sblocks_for_recheck) {
1467 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1468 mirror_index++) {
1469 struct scrub_block *sblock = sblocks_for_recheck +
1470 mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001471 struct scrub_recover *recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001472 int page_index;
1473
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001474 for (page_index = 0; page_index < sblock->page_count;
1475 page_index++) {
1476 sblock->pagev[page_index]->sblock = NULL;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001477 recover = sblock->pagev[page_index]->recover;
1478 if (recover) {
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001479 scrub_put_recover(fs_info, recover);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001480 sblock->pagev[page_index]->recover =
1481 NULL;
1482 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001483 scrub_page_put(sblock->pagev[page_index]);
1484 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001485 }
1486 kfree(sblocks_for_recheck);
1487 }
1488
Qu Wenruo28d70e22017-04-14 08:35:55 +08001489 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1490 if (ret < 0)
1491 return ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001492 return 0;
Arne Jansena2de7332011-03-08 14:14:00 +01001493}
1494
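/*
 * Number of ways a block can be fetched for the recheck.  For RAID5/6
 * these are the reconstruction possibilities (the data stripe itself
 * plus recovery via parity); for all other profiles every stripe of
 * the bbio is a full copy.
 */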
Zhao Lei8e5cfb52015-01-20 15:11:33 +08001495static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001496{
Zhao Lei10f11902015-01-20 15:11:43 +08001497 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1498 return 2;
1499 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1500 return 3;
1501 else
Miao Xieaf8e2d12014-10-23 14:42:50 +08001502 return (int)bbio->num_stripes;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001503}
1504
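/*
 * Translate a logical address into a stripe index and an offset within
 * the mapped range.  For RAID5/6 the raid_map[] returned by
 * btrfs_map_sblock() tells which data stripe covers the logical address
 * (parity stripes are skipped); for all other profiles the mirror
 * number directly selects the stripe and the offset is 0.
 */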
Zhao Lei10f11902015-01-20 15:11:43 +08001505static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1506 u64 *raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001507 u64 mapped_length,
1508 int nstripes, int mirror,
1509 int *stripe_index,
1510 u64 *stripe_offset)
1511{
1512 int i;
1513
Zhao Leiffe2d202015-01-20 15:11:44 +08001514 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001515 /* RAID5/6 */
1516 for (i = 0; i < nstripes; i++) {
1517 if (raid_map[i] == RAID6_Q_STRIPE ||
1518 raid_map[i] == RAID5_P_STRIPE)
1519 continue;
1520
1521 if (logical >= raid_map[i] &&
1522 logical < raid_map[i] + mapped_length)
1523 break;
1524 }
1525
1526 *stripe_index = i;
1527 *stripe_offset = logical - raid_map[i];
1528 } else {
1529 /* The other RAID type */
1530 *stripe_index = mirror;
1531 *stripe_offset = 0;
1532 }
1533}
1534
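/*
 * Build one scrub_block per mirror for the range covered by the failed
 * block: map each PAGE_SIZE chunk with btrfs_map_sblock(), keep the
 * recovery info (bbio) in a scrub_recover and allocate one page per
 * mirror so that scrub_handle_errored_block() can read and compare
 * every copy.
 */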
Zhao Leibe50a8d2015-01-20 15:11:42 +08001535static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001536 struct scrub_block *sblocks_for_recheck)
Arne Jansena2de7332011-03-08 14:14:00 +01001537{
Zhao Leibe50a8d2015-01-20 15:11:42 +08001538 struct scrub_ctx *sctx = original_sblock->sctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04001539 struct btrfs_fs_info *fs_info = sctx->fs_info;
Zhao Leibe50a8d2015-01-20 15:11:42 +08001540 u64 length = original_sblock->page_count * PAGE_SIZE;
1541 u64 logical = original_sblock->pagev[0]->logical;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001542 u64 generation = original_sblock->pagev[0]->generation;
1543 u64 flags = original_sblock->pagev[0]->flags;
1544 u64 have_csum = original_sblock->pagev[0]->have_csum;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001545 struct scrub_recover *recover;
1546 struct btrfs_bio *bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001547 u64 sublen;
1548 u64 mapped_length;
1549 u64 stripe_offset;
1550 int stripe_index;
Zhao Leibe50a8d2015-01-20 15:11:42 +08001551 int page_index = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001552 int mirror_index;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001553 int nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001554 int ret;
1555
1556 /*
Zhao Lei57019342015-01-20 15:11:45 +08001557 * note: the two members refs and outstanding_pages
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001558 * are not used (and not set) in the blocks that are used for
1559 * the recheck procedure
1560 */
1561
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001562 while (length > 0) {
Miao Xieaf8e2d12014-10-23 14:42:50 +08001563 sublen = min_t(u64, length, PAGE_SIZE);
1564 mapped_length = sublen;
1565 bbio = NULL;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001566
1567 /*
1568 * with a length of PAGE_SIZE, each returned stripe
1569 * represents one mirror
1570 */
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001571 btrfs_bio_counter_inc_blocked(fs_info);
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02001572 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
David Sterba825ad4c2017-03-28 14:45:22 +02001573 logical, &mapped_length, &bbio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001574 if (ret || !bbio || mapped_length < sublen) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001575 btrfs_put_bbio(bbio);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001576 btrfs_bio_counter_dec(fs_info);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001577 return -EIO;
1578 }
1579
Miao Xieaf8e2d12014-10-23 14:42:50 +08001580 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1581 if (!recover) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08001582 btrfs_put_bbio(bbio);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001583 btrfs_bio_counter_dec(fs_info);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001584 return -ENOMEM;
1585 }
1586
Elena Reshetova6f615012017-03-03 10:55:21 +02001587 refcount_set(&recover->refs, 1);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001588 recover->bbio = bbio;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001589 recover->map_length = mapped_length;
1590
Ashish Samant24731142016-04-29 18:33:59 -07001591 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001592
Zhao Leibe50a8d2015-01-20 15:11:42 +08001593 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
Zhao Lei10f11902015-01-20 15:11:43 +08001594
Miao Xieaf8e2d12014-10-23 14:42:50 +08001595 for (mirror_index = 0; mirror_index < nmirrors;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001596 mirror_index++) {
1597 struct scrub_block *sblock;
1598 struct scrub_page *page;
1599
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001600 sblock = sblocks_for_recheck + mirror_index;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001601 sblock->sctx = sctx;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001602
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001603 page = kzalloc(sizeof(*page), GFP_NOFS);
1604 if (!page) {
1605leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01001606 spin_lock(&sctx->stat_lock);
1607 sctx->stat.malloc_errors++;
1608 spin_unlock(&sctx->stat_lock);
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001609 scrub_put_recover(fs_info, recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001610 return -ENOMEM;
1611 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001612 scrub_page_get(page);
1613 sblock->pagev[page_index] = page;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001614 page->sblock = sblock;
1615 page->flags = flags;
1616 page->generation = generation;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001617 page->logical = logical;
Zhao Lei4734b7e2015-08-19 22:39:18 +08001618 page->have_csum = have_csum;
1619 if (have_csum)
1620 memcpy(page->csum,
1621 original_sblock->pagev[0]->csum,
1622 sctx->csum_size);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001623
Zhao Lei10f11902015-01-20 15:11:43 +08001624 scrub_stripe_index_and_offset(logical,
1625 bbio->map_type,
1626 bbio->raid_map,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001627 mapped_length,
Zhao Leie34c3302015-01-20 15:11:31 +08001628 bbio->num_stripes -
1629 bbio->num_tgtdevs,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001630 mirror_index,
1631 &stripe_index,
1632 &stripe_offset);
1633 page->physical = bbio->stripes[stripe_index].physical +
1634 stripe_offset;
1635 page->dev = bbio->stripes[stripe_index].dev;
1636
Stefan Behrensff023aa2012-11-06 11:43:11 +01001637 BUG_ON(page_index >= original_sblock->page_count);
1638 page->physical_for_dev_replace =
1639 original_sblock->pagev[page_index]->
1640 physical_for_dev_replace;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001641 /* for missing devices, dev->bdev is NULL */
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001642 page->mirror_num = mirror_index + 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001643 sblock->page_count++;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001644 page->page = alloc_page(GFP_NOFS);
1645 if (!page->page)
1646 goto leave_nomem;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001647
1648 scrub_get_recover(recover);
1649 page->recover = recover;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001650 }
Qu Wenruoe501bfe2017-03-29 09:33:22 +08001651 scrub_put_recover(fs_info, recover);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001652 length -= sublen;
1653 logical += sublen;
1654 page_index++;
1655 }
1656
1657 return 0;
1658}
1659
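/*
 * Completion cookie for the synchronous raid56 reads issued by
 * scrub_submit_raid56_bio_wait().
 */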
Miao Xieaf8e2d12014-10-23 14:42:50 +08001660struct scrub_bio_ret {
1661 struct completion event;
1662 int error;
1663};
1664
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001665static void scrub_bio_wait_endio(struct bio *bio)
Miao Xieaf8e2d12014-10-23 14:42:50 +08001666{
1667 struct scrub_bio_ret *ret = bio->bi_private;
1668
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001669 ret->error = bio->bi_error;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001670 complete(&ret->event);
1671}
1672
1673static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1674{
Zhao Lei10f11902015-01-20 15:11:43 +08001675 return page->recover &&
Zhao Leiffe2d202015-01-20 15:11:44 +08001676 (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001677}
1678
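/*
 * Read one page that lives on a RAID5/6 stripe by going through the
 * raid56 recovery code and wait for completion; the alternative
 * "mirrors" of such a block have to be reconstructed from data plus
 * parity rather than read from a single device.
 */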
1679static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1680 struct bio *bio,
1681 struct scrub_page *page)
1682{
1683 struct scrub_bio_ret done;
1684 int ret;
1685
1686 init_completion(&done.event);
1687 done.error = 0;
1688 bio->bi_iter.bi_sector = page->logical >> 9;
1689 bio->bi_private = &done;
1690 bio->bi_end_io = scrub_bio_wait_endio;
1691
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04001692 ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
Miao Xieaf8e2d12014-10-23 14:42:50 +08001693 page->recover->map_length,
Miao Xie42452152014-11-25 16:39:28 +08001694 page->mirror_num, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001695 if (ret)
1696 return ret;
1697
1698 wait_for_completion(&done.event);
1699 if (done.error)
1700 return -EIO;
1701
1702 return 0;
1703}
1704
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001705/*
1706 * this function checks the on-disk data for checksum errors, header
1707 * errors and read I/O errors. Pages that hit an I/O error are marked
1708 * as bad. The goal is to enable scrub to take the non-errored pages
1709 * from all the mirrors so that the errored pages of the just handled
1710 * mirror can be repaired.
1711 */
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001712static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
Zhao Leiaffe4a52015-08-24 21:32:06 +08001713 struct scrub_block *sblock,
1714 int retry_failed_mirror)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001715{
1716 int page_num;
1717
1718 sblock->no_io_error_seen = 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001719
1720 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1721 struct bio *bio;
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001722 struct scrub_page *page = sblock->pagev[page_num];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001723
Stefan Behrens442a4f62012-05-25 16:06:08 +02001724 if (page->dev->bdev == NULL) {
Stefan Behrensea9947b2012-05-04 15:16:07 -04001725 page->io_error = 1;
1726 sblock->no_io_error_seen = 0;
1727 continue;
1728 }
1729
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001730 WARN_ON(!page->page);
Chris Mason9be33952013-05-17 18:30:14 -04001731 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001732 if (!bio) {
1733 page->io_error = 1;
1734 sblock->no_io_error_seen = 0;
1735 continue;
1736 }
Stefan Behrens442a4f62012-05-25 16:06:08 +02001737 bio->bi_bdev = page->dev->bdev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001738
Stefan Behrens34f5c8e2012-11-02 16:16:26 +01001739 bio_add_page(bio, page->page, PAGE_SIZE, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001740 if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
Liu Bo1bcd7aa2017-03-29 10:55:16 -07001741 if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
1742 page->io_error = 1;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001743 sblock->no_io_error_seen = 0;
Liu Bo1bcd7aa2017-03-29 10:55:16 -07001744 }
Miao Xieaf8e2d12014-10-23 14:42:50 +08001745 } else {
1746 bio->bi_iter.bi_sector = page->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05001747 bio_set_op_attrs(bio, REQ_OP_READ, 0);
Miao Xieaf8e2d12014-10-23 14:42:50 +08001748
Liu Bo1bcd7aa2017-03-29 10:55:16 -07001749 if (btrfsic_submit_bio_wait(bio)) {
1750 page->io_error = 1;
Miao Xieaf8e2d12014-10-23 14:42:50 +08001751 sblock->no_io_error_seen = 0;
Liu Bo1bcd7aa2017-03-29 10:55:16 -07001752 }
Miao Xieaf8e2d12014-10-23 14:42:50 +08001753 }
Kent Overstreet33879d42013-11-23 22:33:32 -08001754
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001755 bio_put(bio);
1756 }
1757
1758 if (sblock->no_io_error_seen)
Zhao Leiba7cf982015-08-24 21:18:02 +08001759 scrub_recheck_block_checksum(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001760}
1761
Miao Xie17a9be22014-07-24 11:37:08 +08001762static inline int scrub_check_fsid(u8 fsid[],
1763 struct scrub_page *spage)
1764{
1765 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1766 int ret;
1767
1768 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1769 return !ret;
1770}
1771
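/*
 * Re-verify the checksum of a block whose pages were all read without
 * I/O error; data and tree blocks are handled by their respective
 * checksum helpers and the result ends up in the sblock error flags.
 */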
Zhao Leiba7cf982015-08-24 21:18:02 +08001772static void scrub_recheck_block_checksum(struct scrub_block *sblock)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001773{
Zhao Leiba7cf982015-08-24 21:18:02 +08001774 sblock->header_error = 0;
1775 sblock->checksum_error = 0;
1776 sblock->generation_error = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001777
Zhao Leiba7cf982015-08-24 21:18:02 +08001778 if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1779 scrub_checksum_data(sblock);
1780 else
1781 scrub_checksum_tree_block(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001782}
1783
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001784static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
Zhao Lei114ab502015-01-20 15:11:36 +08001785 struct scrub_block *sblock_good)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001786{
1787 int page_num;
1788 int ret = 0;
1789
1790 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1791 int ret_sub;
1792
1793 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1794 sblock_good,
Zhao Lei114ab502015-01-20 15:11:36 +08001795 page_num, 1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001796 if (ret_sub)
1797 ret = ret_sub;
1798 }
1799
1800 return ret;
1801}
1802
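/*
 * Rewrite one page of the bad mirror in place with the contents of the
 * corresponding page from the good mirror.  With force_write set the
 * page is always rewritten, otherwise only if the block or the page
 * itself showed an error.
 */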
1803static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1804 struct scrub_block *sblock_good,
1805 int page_num, int force_write)
1806{
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001807 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1808 struct scrub_page *page_good = sblock_good->pagev[page_num];
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001809 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001810
Stefan Behrens7a9e9982012-11-02 14:58:04 +01001811 BUG_ON(page_bad->page == NULL);
1812 BUG_ON(page_good->page == NULL);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001813 if (force_write || sblock_bad->header_error ||
1814 sblock_bad->checksum_error || page_bad->io_error) {
1815 struct bio *bio;
1816 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001817
Stefan Behrensff023aa2012-11-06 11:43:11 +01001818 if (!page_bad->dev->bdev) {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001819 btrfs_warn_rl(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04001820 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
Stefan Behrensff023aa2012-11-06 11:43:11 +01001821 return -EIO;
1822 }
1823
Chris Mason9be33952013-05-17 18:30:14 -04001824 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
Tsutomu Itohe627ee72012-04-12 16:03:56 -04001825 if (!bio)
1826 return -EIO;
Stefan Behrens442a4f62012-05-25 16:06:08 +02001827 bio->bi_bdev = page_bad->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001828 bio->bi_iter.bi_sector = page_bad->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05001829 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001830
1831 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1832 if (PAGE_SIZE != ret) {
1833 bio_put(bio);
1834 return -EIO;
1835 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001836
Mike Christie4e49ea42016-06-05 14:31:41 -05001837 if (btrfsic_submit_bio_wait(bio)) {
Stefan Behrens442a4f62012-05-25 16:06:08 +02001838 btrfs_dev_stat_inc_and_print(page_bad->dev,
1839 BTRFS_DEV_STAT_WRITE_ERRS);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001840 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001841 &fs_info->dev_replace.num_write_errors);
Stefan Behrens442a4f62012-05-25 16:06:08 +02001842 bio_put(bio);
1843 return -EIO;
1844 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04001845 bio_put(bio);
1846 }
1847
1848 return 0;
1849}
1850
Stefan Behrensff023aa2012-11-06 11:43:11 +01001851static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1852{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001853 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001854 int page_num;
1855
Miao Xie5a6ac9e2014-11-06 17:20:58 +08001856 /*
1857	 * This block is used for checking the parity on the source device,
1858 * so the data needn't be written into the destination device.
1859 */
1860 if (sblock->sparity)
1861 return;
1862
Stefan Behrensff023aa2012-11-06 11:43:11 +01001863 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1864 int ret;
1865
1866 ret = scrub_write_page_to_dev_replace(sblock, page_num);
1867 if (ret)
1868 btrfs_dev_replace_stats_inc(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04001869 &fs_info->dev_replace.num_write_errors);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001870 }
1871}
1872
1873static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1874 int page_num)
1875{
1876 struct scrub_page *spage = sblock->pagev[page_num];
1877
1878 BUG_ON(spage->page == NULL);
1879 if (spage->io_error) {
1880 void *mapped_buffer = kmap_atomic(spage->page);
1881
David Sterba619a9742017-03-29 20:48:44 +02001882 clear_page(mapped_buffer);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001883 flush_dcache_page(spage->page);
1884 kunmap_atomic(mapped_buffer);
1885 }
1886 return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1887}
1888
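/*
 * Queue a page for writing to the dev-replace target.  Pages are
 * appended to the current write bio as long as they are physically and
 * logically contiguous; otherwise (or when the bio is full) the bio is
 * submitted and a new one is started.
 */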
1889static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1890 struct scrub_page *spage)
1891{
Stefan Behrensff023aa2012-11-06 11:43:11 +01001892 struct scrub_bio *sbio;
1893 int ret;
1894
David Sterba3fb99302017-05-16 19:10:32 +02001895 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001896again:
David Sterba3fb99302017-05-16 19:10:32 +02001897 if (!sctx->wr_curr_bio) {
1898 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
David Sterba58c4e172016-02-11 10:49:42 +01001899 GFP_KERNEL);
David Sterba3fb99302017-05-16 19:10:32 +02001900 if (!sctx->wr_curr_bio) {
1901 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001902 return -ENOMEM;
1903 }
David Sterba3fb99302017-05-16 19:10:32 +02001904 sctx->wr_curr_bio->sctx = sctx;
1905 sctx->wr_curr_bio->page_count = 0;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001906 }
David Sterba3fb99302017-05-16 19:10:32 +02001907 sbio = sctx->wr_curr_bio;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001908 if (sbio->page_count == 0) {
1909 struct bio *bio;
1910
1911 sbio->physical = spage->physical_for_dev_replace;
1912 sbio->logical = spage->logical;
David Sterba3fb99302017-05-16 19:10:32 +02001913 sbio->dev = sctx->wr_tgtdev;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001914 bio = sbio->bio;
1915 if (!bio) {
David Sterba58c4e172016-02-11 10:49:42 +01001916 bio = btrfs_io_bio_alloc(GFP_KERNEL,
David Sterba3fb99302017-05-16 19:10:32 +02001917 sctx->pages_per_wr_bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001918 if (!bio) {
David Sterba3fb99302017-05-16 19:10:32 +02001919 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001920 return -ENOMEM;
1921 }
1922 sbio->bio = bio;
1923 }
1924
1925 bio->bi_private = sbio;
1926 bio->bi_end_io = scrub_wr_bio_end_io;
1927 bio->bi_bdev = sbio->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07001928 bio->bi_iter.bi_sector = sbio->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05001929 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001930 sbio->err = 0;
1931 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1932 spage->physical_for_dev_replace ||
1933 sbio->logical + sbio->page_count * PAGE_SIZE !=
1934 spage->logical) {
1935 scrub_wr_submit(sctx);
1936 goto again;
1937 }
1938
1939 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1940 if (ret != PAGE_SIZE) {
1941 if (sbio->page_count < 1) {
1942 bio_put(sbio->bio);
1943 sbio->bio = NULL;
David Sterba3fb99302017-05-16 19:10:32 +02001944 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001945 return -EIO;
1946 }
1947 scrub_wr_submit(sctx);
1948 goto again;
1949 }
1950
1951 sbio->pagev[sbio->page_count] = spage;
1952 scrub_page_get(spage);
1953 sbio->page_count++;
David Sterba3fb99302017-05-16 19:10:32 +02001954 if (sbio->page_count == sctx->pages_per_wr_bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01001955 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02001956 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001957
1958 return 0;
1959}
1960
1961static void scrub_wr_submit(struct scrub_ctx *sctx)
1962{
Stefan Behrensff023aa2012-11-06 11:43:11 +01001963 struct scrub_bio *sbio;
1964
David Sterba3fb99302017-05-16 19:10:32 +02001965 if (!sctx->wr_curr_bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01001966 return;
1967
David Sterba3fb99302017-05-16 19:10:32 +02001968 sbio = sctx->wr_curr_bio;
1969 sctx->wr_curr_bio = NULL;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001970 WARN_ON(!sbio->bio->bi_bdev);
1971 scrub_pending_bio_inc(sctx);
1972 /* process all writes in a single worker thread. Then the block layer
1973	 * orders the requests before sending them to the driver, which
1974 * doubled the write performance on spinning disks when measured
1975 * with Linux 3.5 */
Mike Christie4e49ea42016-06-05 14:31:41 -05001976 btrfsic_submit_bio(sbio->bio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001977}
1978
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001979static void scrub_wr_bio_end_io(struct bio *bio)
Stefan Behrensff023aa2012-11-06 11:43:11 +01001980{
1981 struct scrub_bio *sbio = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04001982 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001983
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02001984 sbio->err = bio->bi_error;
Stefan Behrensff023aa2012-11-06 11:43:11 +01001985 sbio->bio = bio;
1986
Liu Bo9e0af232014-08-15 23:36:53 +08001987 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1988 scrub_wr_bio_end_io_worker, NULL, NULL);
Qu Wenruo0339ef22014-02-28 10:46:17 +08001989 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01001990}
1991
1992static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1993{
1994 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1995 struct scrub_ctx *sctx = sbio->sctx;
1996 int i;
1997
1998 WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1999 if (sbio->err) {
2000 struct btrfs_dev_replace *dev_replace =
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002001 &sbio->sctx->fs_info->dev_replace;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002002
2003 for (i = 0; i < sbio->page_count; i++) {
2004 struct scrub_page *spage = sbio->pagev[i];
2005
2006 spage->io_error = 1;
2007 btrfs_dev_replace_stats_inc(&dev_replace->
2008 num_write_errors);
2009 }
2010 }
2011
2012 for (i = 0; i < sbio->page_count; i++)
2013 scrub_page_put(sbio->pagev[i]);
2014
2015 bio_put(sbio->bio);
2016 kfree(sbio);
2017 scrub_pending_bio_dec(sctx);
2018}
2019
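/*
 * Dispatch the verification based on the extent flags (data, tree block
 * or super block).  A failing block is handed over to
 * scrub_handle_errored_block() for repair.
 */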
2020static int scrub_checksum(struct scrub_block *sblock)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002021{
2022 u64 flags;
2023 int ret;
2024
Zhao Leiba7cf982015-08-24 21:18:02 +08002025 /*
2026	 * No need to initialize these stats currently,
2027	 * because this function only uses the return value
2028	 * instead of these stat values.
2029 *
2030 * Todo:
2031 * always use stats
2032 */
2033 sblock->header_error = 0;
2034 sblock->generation_error = 0;
2035 sblock->checksum_error = 0;
2036
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002037 WARN_ON(sblock->page_count < 1);
2038 flags = sblock->pagev[0]->flags;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002039 ret = 0;
2040 if (flags & BTRFS_EXTENT_FLAG_DATA)
2041 ret = scrub_checksum_data(sblock);
2042 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2043 ret = scrub_checksum_tree_block(sblock);
2044 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2045 (void)scrub_checksum_super(sblock);
2046 else
2047 WARN_ON(1);
2048 if (ret)
2049 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002050
2051 return ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002052}
2053
2054static int scrub_checksum_data(struct scrub_block *sblock)
2055{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002056 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01002057 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002058 u8 *on_disk_csum;
2059 struct page *page;
2060 void *buffer;
Arne Jansena2de7332011-03-08 14:14:00 +01002061 u32 crc = ~(u32)0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002062 u64 len;
2063 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01002064
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002065 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002066 if (!sblock->pagev[0]->have_csum)
Arne Jansena2de7332011-03-08 14:14:00 +01002067 return 0;
2068
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002069 on_disk_csum = sblock->pagev[0]->csum;
2070 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002071 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002072
David Sterba25cc1222017-05-16 19:10:41 +02002073 len = sctx->fs_info->sectorsize;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002074 index = 0;
2075 for (;;) {
2076 u64 l = min_t(u64, len, PAGE_SIZE);
2077
Liu Bob0496682013-03-14 14:57:45 +00002078 crc = btrfs_csum_data(buffer, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002079 kunmap_atomic(buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002080 len -= l;
2081 if (len == 0)
2082 break;
2083 index++;
2084 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002085 BUG_ON(!sblock->pagev[index]->page);
2086 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002087 buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002088 }
2089
Arne Jansena2de7332011-03-08 14:14:00 +01002090 btrfs_csum_final(crc, csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002091 if (memcmp(csum, on_disk_csum, sctx->csum_size))
Zhao Leiba7cf982015-08-24 21:18:02 +08002092 sblock->checksum_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002093
Zhao Leiba7cf982015-08-24 21:18:02 +08002094 return sblock->checksum_error;
Arne Jansena2de7332011-03-08 14:14:00 +01002095}
2096
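/*
 * Verify a tree block: check bytenr, generation, fsid and chunk tree
 * uuid in the header, then checksum the remaining
 * nodesize - BTRFS_CSUM_SIZE bytes page by page.
 */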
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002097static int scrub_checksum_tree_block(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01002098{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002099 struct scrub_ctx *sctx = sblock->sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01002100 struct btrfs_header *h;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002101 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002102 u8 calculated_csum[BTRFS_CSUM_SIZE];
2103 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2104 struct page *page;
2105 void *mapped_buffer;
2106 u64 mapped_size;
2107 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01002108 u32 crc = ~(u32)0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002109 u64 len;
2110 int index;
2111
2112 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002113 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002114 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002115 h = (struct btrfs_header *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002116 memcpy(on_disk_csum, h->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01002117
2118 /*
2119 * we don't use the getter functions here, as we
2120 * a) don't have an extent buffer and
2121 * b) the page is already kmapped
2122 */
Qu Wenruo3cae2102013-07-16 11:19:18 +08002123 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
Zhao Leiba7cf982015-08-24 21:18:02 +08002124 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002125
Zhao Leiba7cf982015-08-24 21:18:02 +08002126 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2127 sblock->header_error = 1;
2128 sblock->generation_error = 1;
2129 }
Arne Jansena2de7332011-03-08 14:14:00 +01002130
Miao Xie17a9be22014-07-24 11:37:08 +08002131 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
Zhao Leiba7cf982015-08-24 21:18:02 +08002132 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002133
2134 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2135 BTRFS_UUID_SIZE))
Zhao Leiba7cf982015-08-24 21:18:02 +08002136 sblock->header_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002137
David Sterba25cc1222017-05-16 19:10:41 +02002138 len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002139 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2140 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2141 index = 0;
2142 for (;;) {
2143 u64 l = min_t(u64, len, mapped_size);
2144
Liu Bob0496682013-03-14 14:57:45 +00002145 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002146 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002147 len -= l;
2148 if (len == 0)
2149 break;
2150 index++;
2151 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002152 BUG_ON(!sblock->pagev[index]->page);
2153 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002154 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002155 mapped_size = PAGE_SIZE;
2156 p = mapped_buffer;
2157 }
2158
2159 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002160 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Zhao Leiba7cf982015-08-24 21:18:02 +08002161 sblock->checksum_error = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002162
Zhao Leiba7cf982015-08-24 21:18:02 +08002163 return sblock->header_error || sblock->checksum_error;
Arne Jansena2de7332011-03-08 14:14:00 +01002164}
2165
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002166static int scrub_checksum_super(struct scrub_block *sblock)
Arne Jansena2de7332011-03-08 14:14:00 +01002167{
2168 struct btrfs_super_block *s;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002169 struct scrub_ctx *sctx = sblock->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002170 u8 calculated_csum[BTRFS_CSUM_SIZE];
2171 u8 on_disk_csum[BTRFS_CSUM_SIZE];
2172 struct page *page;
2173 void *mapped_buffer;
2174 u64 mapped_size;
2175 void *p;
Arne Jansena2de7332011-03-08 14:14:00 +01002176 u32 crc = ~(u32)0;
Stefan Behrens442a4f62012-05-25 16:06:08 +02002177 int fail_gen = 0;
2178 int fail_cor = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002179 u64 len;
2180 int index;
Arne Jansena2de7332011-03-08 14:14:00 +01002181
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002182 BUG_ON(sblock->page_count < 1);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002183 page = sblock->pagev[0]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002184 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002185 s = (struct btrfs_super_block *)mapped_buffer;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002186 memcpy(on_disk_csum, s->csum, sctx->csum_size);
Arne Jansena2de7332011-03-08 14:14:00 +01002187
Qu Wenruo3cae2102013-07-16 11:19:18 +08002188 if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002189 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002190
Qu Wenruo3cae2102013-07-16 11:19:18 +08002191 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002192 ++fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01002193
Miao Xie17a9be22014-07-24 11:37:08 +08002194 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002195 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002196
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002197 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2198 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2199 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2200 index = 0;
2201 for (;;) {
2202 u64 l = min_t(u64, len, mapped_size);
2203
Liu Bob0496682013-03-14 14:57:45 +00002204 crc = btrfs_csum_data(p, crc, l);
Linus Torvalds9613beb2012-03-30 12:44:29 -07002205 kunmap_atomic(mapped_buffer);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002206 len -= l;
2207 if (len == 0)
2208 break;
2209 index++;
2210 BUG_ON(index >= sblock->page_count);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002211 BUG_ON(!sblock->pagev[index]->page);
2212 page = sblock->pagev[index]->page;
Linus Torvalds9613beb2012-03-30 12:44:29 -07002213 mapped_buffer = kmap_atomic(page);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002214 mapped_size = PAGE_SIZE;
2215 p = mapped_buffer;
2216 }
2217
2218 btrfs_csum_final(crc, calculated_csum);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002219 if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
Stefan Behrens442a4f62012-05-25 16:06:08 +02002220 ++fail_cor;
Arne Jansena2de7332011-03-08 14:14:00 +01002221
Stefan Behrens442a4f62012-05-25 16:06:08 +02002222 if (fail_cor + fail_gen) {
Arne Jansena2de7332011-03-08 14:14:00 +01002223 /*
2224	 * if we find an error in a super block, we just report it.
2225	 * Super blocks get rewritten with the next transaction
2226	 * commit anyway.
2227 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002228 spin_lock(&sctx->stat_lock);
2229 ++sctx->stat.super_errors;
2230 spin_unlock(&sctx->stat_lock);
Stefan Behrens442a4f62012-05-25 16:06:08 +02002231 if (fail_cor)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002232 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002233 BTRFS_DEV_STAT_CORRUPTION_ERRS);
2234 else
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002235 btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
Stefan Behrens442a4f62012-05-25 16:06:08 +02002236 BTRFS_DEV_STAT_GENERATION_ERRS);
Arne Jansena2de7332011-03-08 14:14:00 +01002237 }
2238
Stefan Behrens442a4f62012-05-25 16:06:08 +02002239 return fail_cor + fail_gen;
Arne Jansena2de7332011-03-08 14:14:00 +01002240}
2241
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002242static void scrub_block_get(struct scrub_block *sblock)
2243{
Elena Reshetova186debd2017-03-03 10:55:23 +02002244 refcount_inc(&sblock->refs);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002245}
2246
2247static void scrub_block_put(struct scrub_block *sblock)
2248{
Elena Reshetova186debd2017-03-03 10:55:23 +02002249 if (refcount_dec_and_test(&sblock->refs)) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002250 int i;
2251
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002252 if (sblock->sparity)
2253 scrub_parity_put(sblock->sparity);
2254
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002255 for (i = 0; i < sblock->page_count; i++)
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002256 scrub_page_put(sblock->pagev[i]);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002257 kfree(sblock);
2258 }
2259}
2260
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002261static void scrub_page_get(struct scrub_page *spage)
2262{
Zhao Lei57019342015-01-20 15:11:45 +08002263 atomic_inc(&spage->refs);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002264}
2265
2266static void scrub_page_put(struct scrub_page *spage)
2267{
Zhao Lei57019342015-01-20 15:11:45 +08002268 if (atomic_dec_and_test(&spage->refs)) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002269 if (spage->page)
2270 __free_page(spage->page);
2271 kfree(spage);
2272 }
2273}
2274
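/* Submit the currently accumulated read bio, if there is one. */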
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002275static void scrub_submit(struct scrub_ctx *sctx)
Arne Jansena2de7332011-03-08 14:14:00 +01002276{
2277 struct scrub_bio *sbio;
2278
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002279 if (sctx->curr == -1)
Stefan Behrens1623ede2012-03-27 14:21:26 -04002280 return;
Arne Jansena2de7332011-03-08 14:14:00 +01002281
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002282 sbio = sctx->bios[sctx->curr];
2283 sctx->curr = -1;
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002284 scrub_pending_bio_inc(sctx);
Mike Christie4e49ea42016-06-05 14:31:41 -05002285 btrfsic_submit_bio(sbio->bio);
Arne Jansena2de7332011-03-08 14:14:00 +01002286}
2287
Stefan Behrensff023aa2012-11-06 11:43:11 +01002288static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2289 struct scrub_page *spage)
Arne Jansena2de7332011-03-08 14:14:00 +01002290{
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002291 struct scrub_block *sblock = spage->sblock;
Arne Jansena2de7332011-03-08 14:14:00 +01002292 struct scrub_bio *sbio;
Arne Jansen69f4cb52011-11-11 08:17:10 -05002293 int ret;
Arne Jansena2de7332011-03-08 14:14:00 +01002294
2295again:
2296 /*
2297 * grab a fresh bio or wait for one to become available
2298 */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002299 while (sctx->curr == -1) {
2300 spin_lock(&sctx->list_lock);
2301 sctx->curr = sctx->first_free;
2302 if (sctx->curr != -1) {
2303 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2304 sctx->bios[sctx->curr]->next_free = -1;
2305 sctx->bios[sctx->curr]->page_count = 0;
2306 spin_unlock(&sctx->list_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01002307 } else {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002308 spin_unlock(&sctx->list_lock);
2309 wait_event(sctx->list_wait, sctx->first_free != -1);
Arne Jansena2de7332011-03-08 14:14:00 +01002310 }
2311 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002312 sbio = sctx->bios[sctx->curr];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002313 if (sbio->page_count == 0) {
Arne Jansen69f4cb52011-11-11 08:17:10 -05002314 struct bio *bio;
2315
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002316 sbio->physical = spage->physical;
2317 sbio->logical = spage->logical;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002318 sbio->dev = spage->dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002319 bio = sbio->bio;
2320 if (!bio) {
David Sterba58c4e172016-02-11 10:49:42 +01002321 bio = btrfs_io_bio_alloc(GFP_KERNEL,
2322 sctx->pages_per_rd_bio);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002323 if (!bio)
2324 return -ENOMEM;
2325 sbio->bio = bio;
2326 }
Arne Jansen69f4cb52011-11-11 08:17:10 -05002327
2328 bio->bi_private = sbio;
2329 bio->bi_end_io = scrub_bio_end_io;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002330 bio->bi_bdev = sbio->dev->bdev;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002331 bio->bi_iter.bi_sector = sbio->physical >> 9;
Mike Christie37226b22016-06-05 14:31:52 -05002332 bio_set_op_attrs(bio, REQ_OP_READ, 0);
Arne Jansen69f4cb52011-11-11 08:17:10 -05002333 sbio->err = 0;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002334 } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2335 spage->physical ||
2336 sbio->logical + sbio->page_count * PAGE_SIZE !=
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002337 spage->logical ||
2338 sbio->dev != spage->dev) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002339 scrub_submit(sctx);
Arne Jansen69f4cb52011-11-11 08:17:10 -05002340 goto again;
2341 }
2342
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002343 sbio->pagev[sbio->page_count] = spage;
2344 ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2345 if (ret != PAGE_SIZE) {
2346 if (sbio->page_count < 1) {
2347 bio_put(sbio->bio);
2348 sbio->bio = NULL;
2349 return -EIO;
2350 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002351 scrub_submit(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002352 goto again;
Arne Jansena2de7332011-03-08 14:14:00 +01002353 }
Arne Jansen1bc87792011-05-28 21:57:55 +02002354
Stefan Behrensff023aa2012-11-06 11:43:11 +01002355 scrub_block_get(sblock); /* one for the page added to the bio */
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002356 atomic_inc(&sblock->outstanding_pages);
2357 sbio->page_count++;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002358 if (sbio->page_count == sctx->pages_per_rd_bio)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002359 scrub_submit(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01002360
2361 return 0;
2362}
2363
Linus Torvalds22365972015-09-05 15:14:43 -07002364static void scrub_missing_raid56_end_io(struct bio *bio)
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002365{
2366 struct scrub_block *sblock = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002367 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002368
Linus Torvalds22365972015-09-05 15:14:43 -07002369 if (bio->bi_error)
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002370 sblock->no_io_error_seen = 0;
2371
Scott Talbert46732722016-05-09 09:14:28 -04002372 bio_put(bio);
2373
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002374 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2375}
2376
2377static void scrub_missing_raid56_worker(struct btrfs_work *work)
2378{
2379 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2380 struct scrub_ctx *sctx = sblock->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002381 struct btrfs_fs_info *fs_info = sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002382 u64 logical;
2383 struct btrfs_device *dev;
2384
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002385 logical = sblock->pagev[0]->logical;
2386 dev = sblock->pagev[0]->dev;
2387
Zhao Leiaffe4a52015-08-24 21:32:06 +08002388 if (sblock->no_io_error_seen)
Zhao Leiba7cf982015-08-24 21:18:02 +08002389 scrub_recheck_block_checksum(sblock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002390
2391 if (!sblock->no_io_error_seen) {
2392 spin_lock(&sctx->stat_lock);
2393 sctx->stat.read_errors++;
2394 spin_unlock(&sctx->stat_lock);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002395 btrfs_err_rl_in_rcu(fs_info,
David Sterbab14af3b2015-10-08 10:43:10 +02002396 "IO error rebuilding logical %llu for dev %s",
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002397 logical, rcu_str_deref(dev->name));
2398 } else if (sblock->header_error || sblock->checksum_error) {
2399 spin_lock(&sctx->stat_lock);
2400 sctx->stat.uncorrectable_errors++;
2401 spin_unlock(&sctx->stat_lock);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002402 btrfs_err_rl_in_rcu(fs_info,
David Sterbab14af3b2015-10-08 10:43:10 +02002403 "failed to rebuild valid logical %llu for dev %s",
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002404 logical, rcu_str_deref(dev->name));
2405 } else {
2406 scrub_write_block_to_dev_replace(sblock);
2407 }
2408
2409 scrub_block_put(sblock);
2410
2411 if (sctx->is_dev_replace &&
David Sterba3fb99302017-05-16 19:10:32 +02002412 atomic_read(&sctx->flush_all_writes)) {
2413 mutex_lock(&sctx->wr_lock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002414 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02002415 mutex_unlock(&sctx->wr_lock);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002416 }
2417
2418 scrub_pending_bio_dec(sctx);
2419}
2420
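/*
 * The pages of this block sit on a missing device.  Let the raid56
 * code rebuild them from the remaining stripes; the result is verified
 * and written to the dev-replace target in scrub_missing_raid56_worker().
 */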
2421static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2422{
2423 struct scrub_ctx *sctx = sblock->sctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002424 struct btrfs_fs_info *fs_info = sctx->fs_info;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002425 u64 length = sblock->page_count * PAGE_SIZE;
2426 u64 logical = sblock->pagev[0]->logical;
Zhao Leif1fee652016-05-17 17:37:38 +08002427 struct btrfs_bio *bbio = NULL;
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002428 struct bio *bio;
2429 struct btrfs_raid_bio *rbio;
2430 int ret;
2431 int i;
2432
Qu Wenruoae6529c2017-03-29 09:33:21 +08002433 btrfs_bio_counter_inc_blocked(fs_info);
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02002434 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
David Sterba825ad4c2017-03-28 14:45:22 +02002435 &length, &bbio);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002436 if (ret || !bbio || !bbio->raid_map)
2437 goto bbio_out;
2438
2439 if (WARN_ON(!sctx->is_dev_replace ||
2440 !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2441 /*
2442 * We shouldn't be scrubbing a missing device. Even for dev
2443 * replace, we should only get here for RAID 5/6. We either
2444 * managed to mount something with no mirrors remaining or
2445 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2446 */
2447 goto bbio_out;
2448 }
2449
2450 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2451 if (!bio)
2452 goto bbio_out;
2453
2454 bio->bi_iter.bi_sector = logical >> 9;
2455 bio->bi_private = sblock;
2456 bio->bi_end_io = scrub_missing_raid56_end_io;
2457
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002458 rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002459 if (!rbio)
2460 goto rbio_out;
2461
2462 for (i = 0; i < sblock->page_count; i++) {
2463 struct scrub_page *spage = sblock->pagev[i];
2464
2465 raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2466 }
2467
2468 btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2469 scrub_missing_raid56_worker, NULL, NULL);
2470 scrub_block_get(sblock);
2471 scrub_pending_bio_inc(sctx);
2472 raid56_submit_missing_rbio(rbio);
2473 return;
2474
2475rbio_out:
2476 bio_put(bio);
2477bbio_out:
Qu Wenruoae6529c2017-03-29 09:33:21 +08002478 btrfs_bio_counter_dec(fs_info);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002479 btrfs_put_bbio(bbio);
2480 spin_lock(&sctx->stat_lock);
2481 sctx->stat.malloc_errors++;
2482 spin_unlock(&sctx->stat_lock);
2483}
2484
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002485static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002486 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002487 u64 gen, int mirror_num, u8 *csum, int force,
2488 u64 physical_for_dev_replace)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002489{
2490 struct scrub_block *sblock;
2491 int index;
2492
David Sterba58c4e172016-02-11 10:49:42 +01002493 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002494 if (!sblock) {
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002495 spin_lock(&sctx->stat_lock);
2496 sctx->stat.malloc_errors++;
2497 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002498 return -ENOMEM;
2499 }
2500
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002501 /* one ref inside this function, plus one for each page added to
2502 * a bio later on */
Elena Reshetova186debd2017-03-03 10:55:23 +02002503 refcount_set(&sblock->refs, 1);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002504 sblock->sctx = sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002505 sblock->no_io_error_seen = 1;
2506
2507 for (index = 0; len > 0; index++) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002508 struct scrub_page *spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002509 u64 l = min_t(u64, len, PAGE_SIZE);
2510
David Sterba58c4e172016-02-11 10:49:42 +01002511 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002512 if (!spage) {
2513leave_nomem:
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002514 spin_lock(&sctx->stat_lock);
2515 sctx->stat.malloc_errors++;
2516 spin_unlock(&sctx->stat_lock);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002517 scrub_block_put(sblock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002518 return -ENOMEM;
2519 }
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002520 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2521 scrub_page_get(spage);
2522 sblock->pagev[index] = spage;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002523 spage->sblock = sblock;
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002524 spage->dev = dev;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002525 spage->flags = flags;
2526 spage->generation = gen;
2527 spage->logical = logical;
2528 spage->physical = physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002529 spage->physical_for_dev_replace = physical_for_dev_replace;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002530 spage->mirror_num = mirror_num;
2531 if (csum) {
2532 spage->have_csum = 1;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002533 memcpy(spage->csum, csum, sctx->csum_size);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002534 } else {
2535 spage->have_csum = 0;
2536 }
2537 sblock->page_count++;
David Sterba58c4e172016-02-11 10:49:42 +01002538 spage->page = alloc_page(GFP_KERNEL);
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002539 if (!spage->page)
2540 goto leave_nomem;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002541 len -= l;
2542 logical += l;
2543 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002544 physical_for_dev_replace += l;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002545 }
2546
Stefan Behrens7a9e9982012-11-02 14:58:04 +01002547 WARN_ON(sblock->page_count == 0);
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002548 if (dev->missing) {
2549 /*
2550 * This case should only be hit for RAID 5/6 device replace. See
2551 * the comment in scrub_missing_raid56_pages() for details.
2552 */
2553 scrub_missing_raid56_pages(sblock);
2554 } else {
2555 for (index = 0; index < sblock->page_count; index++) {
2556 struct scrub_page *spage = sblock->pagev[index];
2557 int ret;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002558
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002559 ret = scrub_add_page_to_rd_bio(sctx, spage);
2560 if (ret) {
2561 scrub_block_put(sblock);
2562 return ret;
2563 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002564 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002565
Omar Sandoval73ff61d2015-06-19 11:52:51 -07002566 if (force)
2567 scrub_submit(sctx);
2568 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002569
2570 /* last one frees, either here or in bio completion for last page */
2571 scrub_block_put(sblock);
2572 return 0;
2573}
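/*
 * A rough example for scrub_pages(), assuming 4K pages, a 4K sectorsize
 * and a 16K nodesize: a metadata block arrives as a single 16K call and
 * is split into four scrub_pages, each backed by a freshly allocated
 * page, while a data call covers one sector and thus one page. With
 * @force set, the read bio is submitted right away instead of waiting
 * for it to fill up.
 */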
2574
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002575static void scrub_bio_end_io(struct bio *bio)
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002576{
2577 struct scrub_bio *sbio = bio->bi_private;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04002578 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002579
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002580 sbio->err = bio->bi_error;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002581 sbio->bio = bio;
2582
Qu Wenruo0339ef22014-02-28 10:46:17 +08002583 btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002584}
2585
2586static void scrub_bio_end_io_worker(struct btrfs_work *work)
2587{
2588 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002589 struct scrub_ctx *sctx = sbio->sctx;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002590 int i;
2591
Stefan Behrensff023aa2012-11-06 11:43:11 +01002592 BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002593 if (sbio->err) {
2594 for (i = 0; i < sbio->page_count; i++) {
2595 struct scrub_page *spage = sbio->pagev[i];
2596
2597 spage->io_error = 1;
2598 spage->sblock->no_io_error_seen = 0;
2599 }
2600 }
2601
2602 /* now complete the scrub_block items that have all pages completed */
2603 for (i = 0; i < sbio->page_count; i++) {
2604 struct scrub_page *spage = sbio->pagev[i];
2605 struct scrub_block *sblock = spage->sblock;
2606
2607 if (atomic_dec_and_test(&sblock->outstanding_pages))
2608 scrub_block_complete(sblock);
2609 scrub_block_put(sblock);
2610 }
2611
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002612 bio_put(sbio->bio);
2613 sbio->bio = NULL;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002614 spin_lock(&sctx->list_lock);
2615 sbio->next_free = sctx->first_free;
2616 sctx->first_free = sbio->index;
2617 spin_unlock(&sctx->list_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002618
2619 if (sctx->is_dev_replace &&
David Sterba3fb99302017-05-16 19:10:32 +02002620 atomic_read(&sctx->flush_all_writes)) {
2621 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002622 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02002623 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002624 }
2625
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01002626 scrub_pending_bio_dec(sctx);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002627}
2628
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002629static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2630 unsigned long *bitmap,
2631 u64 start, u64 len)
2632{
Liu Bo972d7212017-04-03 13:45:33 -07002633 u64 offset;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002634 int nsectors;
Jeff Mahoneyda170662016-06-15 09:22:56 -04002635 int sectorsize = sparity->sctx->fs_info->sectorsize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002636
2637 if (len >= sparity->stripe_len) {
2638 bitmap_set(bitmap, 0, sparity->nsectors);
2639 return;
2640 }
2641
2642 start -= sparity->logic_start;
Liu Bo972d7212017-04-03 13:45:33 -07002643 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2644 offset = div_u64(offset, sectorsize);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002645 nsectors = (int)len / sectorsize;
2646
2647 if (offset + nsectors <= sparity->nsectors) {
2648 bitmap_set(bitmap, offset, nsectors);
2649 return;
2650 }
2651
2652 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2653 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2654}
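/*
 * A worked example for __scrub_mark_bitmap(), assuming a 64K stripe_len
 * and a 4K sectorsize (so sparity->nsectors == 16): marking start ==
 * logic_start + 60K with len == 8K gives offset == 15 and two sectors to
 * set; since 15 + 2 > 16 the range wraps, so the first bitmap_set()
 * covers sector 15 and the second covers sector 0.
 */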
2655
2656static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2657 u64 start, u64 len)
2658{
2659 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2660}
2661
2662static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2663 u64 start, u64 len)
2664{
2665 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2666}
2667
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002668static void scrub_block_complete(struct scrub_block *sblock)
2669{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002670 int corrupted = 0;
2671
Stefan Behrensff023aa2012-11-06 11:43:11 +01002672 if (!sblock->no_io_error_seen) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002673 corrupted = 1;
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002674 scrub_handle_errored_block(sblock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01002675 } else {
2676 /*
 2677		 * if the block has a checksum error, it is written to the
 2678		 * replace target via the repair mechanism; otherwise it is
 2679		 * written here. Both paths only apply in the dev-replace case.
2680 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002681 corrupted = scrub_checksum(sblock);
2682 if (!corrupted && sblock->sctx->is_dev_replace)
Stefan Behrensff023aa2012-11-06 11:43:11 +01002683 scrub_write_block_to_dev_replace(sblock);
2684 }
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002685
2686 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2687 u64 start = sblock->pagev[0]->logical;
2688 u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2689 PAGE_SIZE;
2690
2691 scrub_parity_mark_sectors_error(sblock->sparity,
2692 start, end - start);
2693 }
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002694}
2695
Zhao Lei3b5753e2015-08-24 22:03:02 +08002696static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
Arne Jansena2de7332011-03-08 14:14:00 +01002697{
2698 struct btrfs_ordered_sum *sum = NULL;
Miao Xief51a4a12013-06-19 10:36:09 +08002699 unsigned long index;
Arne Jansena2de7332011-03-08 14:14:00 +01002700 unsigned long num_sectors;
Arne Jansena2de7332011-03-08 14:14:00 +01002701
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002702 while (!list_empty(&sctx->csum_list)) {
2703 sum = list_first_entry(&sctx->csum_list,
Arne Jansena2de7332011-03-08 14:14:00 +01002704 struct btrfs_ordered_sum, list);
2705 if (sum->bytenr > logical)
2706 return 0;
2707 if (sum->bytenr + sum->len > logical)
2708 break;
2709
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002710 ++sctx->stat.csum_discards;
Arne Jansena2de7332011-03-08 14:14:00 +01002711 list_del(&sum->list);
2712 kfree(sum);
2713 sum = NULL;
2714 }
2715 if (!sum)
2716 return 0;
2717
David Sterba25cc1222017-05-16 19:10:41 +02002718 index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize;
2719 num_sectors = sum->len / sctx->fs_info->sectorsize;
Miao Xief51a4a12013-06-19 10:36:09 +08002720 memcpy(csum, sum->sums + index, sctx->csum_size);
2721 if (index == num_sectors - 1) {
Arne Jansena2de7332011-03-08 14:14:00 +01002722 list_del(&sum->list);
2723 kfree(sum);
2724 }
Miao Xief51a4a12013-06-19 10:36:09 +08002725 return 1;
Arne Jansena2de7332011-03-08 14:14:00 +01002726}
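/*
 * Example for scrub_find_csum(), assuming a 4K sectorsize and 4-byte
 * crc32c csums: for an ordered sum with bytenr == 1M and len == 64K, a
 * lookup at logical == 1M + 20K copies entry 20K / 4K == 5 of the 16
 * stored csums; the sum item itself is only freed once its last sector
 * (index 15) has been consumed, or discarded once it falls entirely
 * below @logical (counted in csum_discards).
 */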
2727
2728/* scrub extent tries to collect up to 64 kB for each bio */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002729static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002730 u64 physical, struct btrfs_device *dev, u64 flags,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002731 u64 gen, int mirror_num, u64 physical_for_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01002732{
2733 int ret;
2734 u8 csum[BTRFS_CSUM_SIZE];
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002735 u32 blocksize;
2736
2737 if (flags & BTRFS_EXTENT_FLAG_DATA) {
David Sterba25cc1222017-05-16 19:10:41 +02002738 blocksize = sctx->fs_info->sectorsize;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002739 spin_lock(&sctx->stat_lock);
2740 sctx->stat.data_extents_scrubbed++;
2741 sctx->stat.data_bytes_scrubbed += len;
2742 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002743 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
David Sterba25cc1222017-05-16 19:10:41 +02002744 blocksize = sctx->fs_info->nodesize;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002745 spin_lock(&sctx->stat_lock);
2746 sctx->stat.tree_extents_scrubbed++;
2747 sctx->stat.tree_bytes_scrubbed += len;
2748 spin_unlock(&sctx->stat_lock);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002749 } else {
David Sterba25cc1222017-05-16 19:10:41 +02002750 blocksize = sctx->fs_info->sectorsize;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002751 WARN_ON(1);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002752 }
Arne Jansena2de7332011-03-08 14:14:00 +01002753
2754 while (len) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04002755 u64 l = min_t(u64, len, blocksize);
Arne Jansena2de7332011-03-08 14:14:00 +01002756 int have_csum = 0;
2757
2758 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2759 /* push csums to sbio */
Zhao Lei3b5753e2015-08-24 22:03:02 +08002760 have_csum = scrub_find_csum(sctx, logical, csum);
Arne Jansena2de7332011-03-08 14:14:00 +01002761 if (have_csum == 0)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01002762 ++sctx->stat.no_csum;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002763 if (sctx->is_dev_replace && !have_csum) {
2764 ret = copy_nocow_pages(sctx, logical, l,
2765 mirror_num,
2766 physical_for_dev_replace);
2767 goto behind_scrub_pages;
2768 }
Arne Jansena2de7332011-03-08 14:14:00 +01002769 }
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01002770 ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
Stefan Behrensff023aa2012-11-06 11:43:11 +01002771 mirror_num, have_csum ? csum : NULL, 0,
2772 physical_for_dev_replace);
2773behind_scrub_pages:
Arne Jansena2de7332011-03-08 14:14:00 +01002774 if (ret)
2775 return ret;
2776 len -= l;
2777 logical += l;
2778 physical += l;
Stefan Behrensff023aa2012-11-06 11:43:11 +01002779 physical_for_dev_replace += l;
Arne Jansena2de7332011-03-08 14:14:00 +01002780 }
2781 return 0;
2782}
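/*
 * Example for scrub_extent(), assuming a 4K sectorsize and a 16K
 * nodesize: a 128K data extent is cut into 32 sector-sized pieces, each
 * with its own csum lookup and scrub_pages() call, while a metadata
 * extent is handed to scrub_pages() in nodesize pieces. In dev-replace
 * mode, data without a csum is copied via copy_nocow_pages() instead of
 * being scrubbed.
 */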
2783
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002784static int scrub_pages_for_parity(struct scrub_parity *sparity,
2785 u64 logical, u64 len,
2786 u64 physical, struct btrfs_device *dev,
2787 u64 flags, u64 gen, int mirror_num, u8 *csum)
2788{
2789 struct scrub_ctx *sctx = sparity->sctx;
2790 struct scrub_block *sblock;
2791 int index;
2792
David Sterba58c4e172016-02-11 10:49:42 +01002793 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002794 if (!sblock) {
2795 spin_lock(&sctx->stat_lock);
2796 sctx->stat.malloc_errors++;
2797 spin_unlock(&sctx->stat_lock);
2798 return -ENOMEM;
2799 }
2800
2801 /* one ref inside this function, plus one for each page added to
2802 * a bio later on */
Elena Reshetova186debd2017-03-03 10:55:23 +02002803 refcount_set(&sblock->refs, 1);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002804 sblock->sctx = sctx;
2805 sblock->no_io_error_seen = 1;
2806 sblock->sparity = sparity;
2807 scrub_parity_get(sparity);
2808
2809 for (index = 0; len > 0; index++) {
2810 struct scrub_page *spage;
2811 u64 l = min_t(u64, len, PAGE_SIZE);
2812
David Sterba58c4e172016-02-11 10:49:42 +01002813 spage = kzalloc(sizeof(*spage), GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002814 if (!spage) {
2815leave_nomem:
2816 spin_lock(&sctx->stat_lock);
2817 sctx->stat.malloc_errors++;
2818 spin_unlock(&sctx->stat_lock);
2819 scrub_block_put(sblock);
2820 return -ENOMEM;
2821 }
2822 BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2823 /* For scrub block */
2824 scrub_page_get(spage);
2825 sblock->pagev[index] = spage;
2826 /* For scrub parity */
2827 scrub_page_get(spage);
2828 list_add_tail(&spage->list, &sparity->spages);
2829 spage->sblock = sblock;
2830 spage->dev = dev;
2831 spage->flags = flags;
2832 spage->generation = gen;
2833 spage->logical = logical;
2834 spage->physical = physical;
2835 spage->mirror_num = mirror_num;
2836 if (csum) {
2837 spage->have_csum = 1;
2838 memcpy(spage->csum, csum, sctx->csum_size);
2839 } else {
2840 spage->have_csum = 0;
2841 }
2842 sblock->page_count++;
David Sterba58c4e172016-02-11 10:49:42 +01002843 spage->page = alloc_page(GFP_KERNEL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002844 if (!spage->page)
2845 goto leave_nomem;
2846 len -= l;
2847 logical += l;
2848 physical += l;
2849 }
2850
2851 WARN_ON(sblock->page_count == 0);
2852 for (index = 0; index < sblock->page_count; index++) {
2853 struct scrub_page *spage = sblock->pagev[index];
2854 int ret;
2855
2856 ret = scrub_add_page_to_rd_bio(sctx, spage);
2857 if (ret) {
2858 scrub_block_put(sblock);
2859 return ret;
2860 }
2861 }
2862
2863 /* last one frees, either here or in bio completion for last page */
2864 scrub_block_put(sblock);
2865 return 0;
2866}
2867
2868static int scrub_extent_for_parity(struct scrub_parity *sparity,
2869 u64 logical, u64 len,
2870 u64 physical, struct btrfs_device *dev,
2871 u64 flags, u64 gen, int mirror_num)
2872{
2873 struct scrub_ctx *sctx = sparity->sctx;
2874 int ret;
2875 u8 csum[BTRFS_CSUM_SIZE];
2876 u32 blocksize;
2877
Omar Sandoval4a770892015-06-19 11:52:52 -07002878 if (dev->missing) {
2879 scrub_parity_mark_sectors_error(sparity, logical, len);
2880 return 0;
2881 }
2882
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002883 if (flags & BTRFS_EXTENT_FLAG_DATA) {
David Sterba25cc1222017-05-16 19:10:41 +02002884 blocksize = sctx->fs_info->sectorsize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002885 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
David Sterba25cc1222017-05-16 19:10:41 +02002886 blocksize = sctx->fs_info->nodesize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002887 } else {
David Sterba25cc1222017-05-16 19:10:41 +02002888 blocksize = sctx->fs_info->sectorsize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002889 WARN_ON(1);
2890 }
2891
2892 while (len) {
2893 u64 l = min_t(u64, len, blocksize);
2894 int have_csum = 0;
2895
2896 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2897 /* push csums to sbio */
Zhao Lei3b5753e2015-08-24 22:03:02 +08002898 have_csum = scrub_find_csum(sctx, logical, csum);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002899 if (have_csum == 0)
2900 goto skip;
2901 }
2902 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2903 flags, gen, mirror_num,
2904 have_csum ? csum : NULL);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002905 if (ret)
2906 return ret;
Dan Carpenter6b6d24b2014-12-12 22:30:00 +03002907skip:
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002908 len -= l;
2909 logical += l;
2910 physical += l;
2911 }
2912 return 0;
2913}
2914
Wang Shilong3b080b22014-04-01 18:01:43 +08002915/*
 2916 * Given a physical address, this will calculate its
 2917 * logical offset. If this is a parity stripe, it will return
 2918 * the leftmost data stripe's logical offset.
 2919 *
 2920 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2921 */
2922static int get_raid56_logic_offset(u64 physical, int num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002923 struct map_lookup *map, u64 *offset,
2924 u64 *stripe_start)
Wang Shilong3b080b22014-04-01 18:01:43 +08002925{
2926 int i;
2927 int j = 0;
2928 u64 stripe_nr;
2929 u64 last_offset;
David Sterba9d644a62015-02-20 18:42:11 +01002930 u32 stripe_index;
2931 u32 rot;
Wang Shilong3b080b22014-04-01 18:01:43 +08002932
2933 last_offset = (physical - map->stripes[num].physical) *
2934 nr_data_stripes(map);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002935 if (stripe_start)
2936 *stripe_start = last_offset;
2937
Wang Shilong3b080b22014-04-01 18:01:43 +08002938 *offset = last_offset;
2939 for (i = 0; i < nr_data_stripes(map); i++) {
2940 *offset = last_offset + i * map->stripe_len;
2941
Liu Bo42c61ab2017-04-03 13:45:24 -07002942 stripe_nr = div64_u64(*offset, map->stripe_len);
David Sterbab8b93ad2015-01-16 17:26:13 +01002943 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
Wang Shilong3b080b22014-04-01 18:01:43 +08002944
2945 /* Work out the disk rotation on this stripe-set */
David Sterba47c57132015-02-20 18:43:47 +01002946 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
Wang Shilong3b080b22014-04-01 18:01:43 +08002947		/* calculate which stripe this data is located on */
2948 rot += i;
Wang Shilonge4fbaee2014-04-11 18:32:25 +08002949 stripe_index = rot % map->num_stripes;
Wang Shilong3b080b22014-04-01 18:01:43 +08002950 if (stripe_index == num)
2951 return 0;
2952 if (stripe_index < num)
2953 j++;
2954 }
2955 *offset = last_offset + j * map->stripe_len;
2956 return 1;
2957}
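/*
 * A worked example for get_raid56_logic_offset(), assuming RAID5 over
 * three devices with a 64K stripe_len (two data stripes per full stripe)
 * and num == 0: at a physical offset of 128K into the device extent,
 * last_offset == 256K and the i == 1 iteration rotates onto device 0, so
 * the function returns 0 (data) with *offset == 320K. At a physical
 * offset of 256K, neither iteration lands on device 0, so it returns 1
 * (parity) with *offset == 512K, the leftmost data stripe of that full
 * stripe. Offsets are relative to the chunk start; the caller adds the
 * chunk base.
 */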
2958
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002959static void scrub_free_parity(struct scrub_parity *sparity)
2960{
2961 struct scrub_ctx *sctx = sparity->sctx;
2962 struct scrub_page *curr, *next;
2963 int nbits;
2964
2965 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2966 if (nbits) {
2967 spin_lock(&sctx->stat_lock);
2968 sctx->stat.read_errors += nbits;
2969 sctx->stat.uncorrectable_errors += nbits;
2970 spin_unlock(&sctx->stat_lock);
2971 }
2972
2973 list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2974 list_del_init(&curr->list);
2975 scrub_page_put(curr);
2976 }
2977
2978 kfree(sparity);
2979}
2980
Zhao Lei20b2e302015-06-04 20:09:15 +08002981static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
2982{
2983 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2984 work);
2985 struct scrub_ctx *sctx = sparity->sctx;
2986
2987 scrub_free_parity(sparity);
2988 scrub_pending_bio_dec(sctx);
2989}
2990
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002991static void scrub_parity_bio_endio(struct bio *bio)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002992{
2993 struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002994 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002995
Christoph Hellwig4246a0b2015-07-20 15:29:37 +02002996 if (bio->bi_error)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08002997 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2998 sparity->nsectors);
2999
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003000 bio_put(bio);
Zhao Lei20b2e302015-06-04 20:09:15 +08003001
3002 btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3003 scrub_parity_bio_endio_worker, NULL, NULL);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003004 btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003005}
3006
3007static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3008{
3009 struct scrub_ctx *sctx = sparity->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003010 struct btrfs_fs_info *fs_info = sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003011 struct bio *bio;
3012 struct btrfs_raid_bio *rbio;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003013 struct btrfs_bio *bbio = NULL;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003014 u64 length;
3015 int ret;
3016
3017 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3018 sparity->nsectors))
3019 goto out;
3020
Zhao Leia0dd59d2015-07-21 15:42:26 +08003021 length = sparity->logic_end - sparity->logic_start;
Qu Wenruoae6529c2017-03-29 09:33:21 +08003022
3023 btrfs_bio_counter_inc_blocked(fs_info);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003024 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
David Sterba825ad4c2017-03-28 14:45:22 +02003025 &length, &bbio);
Zhao Lei8e5cfb52015-01-20 15:11:33 +08003026 if (ret || !bbio || !bbio->raid_map)
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003027 goto bbio_out;
3028
3029 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
3030 if (!bio)
3031 goto bbio_out;
3032
3033 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3034 bio->bi_private = sparity;
3035 bio->bi_end_io = scrub_parity_bio_endio;
3036
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003037 rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
Zhao Lei8e5cfb52015-01-20 15:11:33 +08003038 length, sparity->scrub_dev,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003039 sparity->dbitmap,
3040 sparity->nsectors);
3041 if (!rbio)
3042 goto rbio_out;
3043
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003044 scrub_pending_bio_inc(sctx);
3045 raid56_parity_submit_scrub_rbio(rbio);
3046 return;
3047
3048rbio_out:
3049 bio_put(bio);
3050bbio_out:
Qu Wenruoae6529c2017-03-29 09:33:21 +08003051 btrfs_bio_counter_dec(fs_info);
Zhao Lei6e9606d2015-01-20 15:11:34 +08003052 btrfs_put_bbio(bbio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003053 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3054 sparity->nsectors);
3055 spin_lock(&sctx->stat_lock);
3056 sctx->stat.malloc_errors++;
3057 spin_unlock(&sctx->stat_lock);
3058out:
3059 scrub_free_parity(sparity);
3060}
3061
3062static inline int scrub_calc_parity_bitmap_len(int nsectors)
3063{
Zhao Leibfca9a62014-12-08 19:55:57 +08003064 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003065}
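/*
 * Example: with a 64K stripe_len and a 4K sectorsize, nsectors == 16, so
 * on a 64-bit machine (BITS_PER_LONG == 64) the bitmap length is
 * DIV_ROUND_UP(16, 64) * sizeof(long) == 8 bytes; scrub_raid56_parity()
 * allocates twice that, one bitmap for data sectors (dbitmap) and one
 * for error sectors (ebitmap).
 */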
3066
3067static void scrub_parity_get(struct scrub_parity *sparity)
3068{
Elena Reshetova78a76452017-03-03 10:55:24 +02003069 refcount_inc(&sparity->refs);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003070}
3071
3072static void scrub_parity_put(struct scrub_parity *sparity)
3073{
Elena Reshetova78a76452017-03-03 10:55:24 +02003074 if (!refcount_dec_and_test(&sparity->refs))
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003075 return;
3076
3077 scrub_parity_check_and_repair(sparity);
3078}
3079
3080static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3081 struct map_lookup *map,
3082 struct btrfs_device *sdev,
3083 struct btrfs_path *path,
3084 u64 logic_start,
3085 u64 logic_end)
3086{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003087 struct btrfs_fs_info *fs_info = sctx->fs_info;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003088 struct btrfs_root *root = fs_info->extent_root;
3089 struct btrfs_root *csum_root = fs_info->csum_root;
3090 struct btrfs_extent_item *extent;
Omar Sandoval4a770892015-06-19 11:52:52 -07003091 struct btrfs_bio *bbio = NULL;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003092 u64 flags;
3093 int ret;
3094 int slot;
3095 struct extent_buffer *l;
3096 struct btrfs_key key;
3097 u64 generation;
3098 u64 extent_logical;
3099 u64 extent_physical;
3100 u64 extent_len;
Omar Sandoval4a770892015-06-19 11:52:52 -07003101 u64 mapped_length;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003102 struct btrfs_device *extent_dev;
3103 struct scrub_parity *sparity;
3104 int nsectors;
3105 int bitmap_len;
3106 int extent_mirror_num;
3107 int stop_loop = 0;
3108
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003109 nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003110 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3111 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3112 GFP_NOFS);
3113 if (!sparity) {
3114 spin_lock(&sctx->stat_lock);
3115 sctx->stat.malloc_errors++;
3116 spin_unlock(&sctx->stat_lock);
3117 return -ENOMEM;
3118 }
3119
3120 sparity->stripe_len = map->stripe_len;
3121 sparity->nsectors = nsectors;
3122 sparity->sctx = sctx;
3123 sparity->scrub_dev = sdev;
3124 sparity->logic_start = logic_start;
3125 sparity->logic_end = logic_end;
Elena Reshetova78a76452017-03-03 10:55:24 +02003126 refcount_set(&sparity->refs, 1);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003127 INIT_LIST_HEAD(&sparity->spages);
3128 sparity->dbitmap = sparity->bitmap;
3129 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3130
3131 ret = 0;
3132 while (logic_start < logic_end) {
3133 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3134 key.type = BTRFS_METADATA_ITEM_KEY;
3135 else
3136 key.type = BTRFS_EXTENT_ITEM_KEY;
3137 key.objectid = logic_start;
3138 key.offset = (u64)-1;
3139
3140 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3141 if (ret < 0)
3142 goto out;
3143
3144 if (ret > 0) {
3145 ret = btrfs_previous_extent_item(root, path, 0);
3146 if (ret < 0)
3147 goto out;
3148 if (ret > 0) {
3149 btrfs_release_path(path);
3150 ret = btrfs_search_slot(NULL, root, &key,
3151 path, 0, 0);
3152 if (ret < 0)
3153 goto out;
3154 }
3155 }
3156
3157 stop_loop = 0;
3158 while (1) {
3159 u64 bytes;
3160
3161 l = path->nodes[0];
3162 slot = path->slots[0];
3163 if (slot >= btrfs_header_nritems(l)) {
3164 ret = btrfs_next_leaf(root, path);
3165 if (ret == 0)
3166 continue;
3167 if (ret < 0)
3168 goto out;
3169
3170 stop_loop = 1;
3171 break;
3172 }
3173 btrfs_item_key_to_cpu(l, &key, slot);
3174
Zhao Leid7cad232015-07-22 13:14:48 +08003175 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3176 key.type != BTRFS_METADATA_ITEM_KEY)
3177 goto next;
3178
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003179 if (key.type == BTRFS_METADATA_ITEM_KEY)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003180 bytes = fs_info->nodesize;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003181 else
3182 bytes = key.offset;
3183
3184 if (key.objectid + bytes <= logic_start)
3185 goto next;
3186
Zhao Leia0dd59d2015-07-21 15:42:26 +08003187 if (key.objectid >= logic_end) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003188 stop_loop = 1;
3189 break;
3190 }
3191
3192 while (key.objectid >= logic_start + map->stripe_len)
3193 logic_start += map->stripe_len;
3194
3195 extent = btrfs_item_ptr(l, slot,
3196 struct btrfs_extent_item);
3197 flags = btrfs_extent_flags(l, extent);
3198 generation = btrfs_extent_generation(l, extent);
3199
Zhao Leia323e812015-07-23 12:29:49 +08003200 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3201 (key.objectid < logic_start ||
3202 key.objectid + bytes >
3203 logic_start + map->stripe_len)) {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003204 btrfs_err(fs_info,
3205 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
Zhao Leia323e812015-07-23 12:29:49 +08003206 key.objectid, logic_start);
Zhao Lei9799d2c32015-08-25 21:31:40 +08003207 spin_lock(&sctx->stat_lock);
3208 sctx->stat.uncorrectable_errors++;
3209 spin_unlock(&sctx->stat_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003210 goto next;
3211 }
3212again:
3213 extent_logical = key.objectid;
3214 extent_len = bytes;
3215
3216 if (extent_logical < logic_start) {
3217 extent_len -= logic_start - extent_logical;
3218 extent_logical = logic_start;
3219 }
3220
3221 if (extent_logical + extent_len >
3222 logic_start + map->stripe_len)
3223 extent_len = logic_start + map->stripe_len -
3224 extent_logical;
3225
3226 scrub_parity_mark_sectors_data(sparity, extent_logical,
3227 extent_len);
3228
Omar Sandoval4a770892015-06-19 11:52:52 -07003229 mapped_length = extent_len;
Zhao Leif1fee652016-05-17 17:37:38 +08003230 bbio = NULL;
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02003231 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3232 extent_logical, &mapped_length, &bbio,
3233 0);
Omar Sandoval4a770892015-06-19 11:52:52 -07003234 if (!ret) {
3235 if (!bbio || mapped_length < extent_len)
3236 ret = -EIO;
3237 }
3238 if (ret) {
3239 btrfs_put_bbio(bbio);
3240 goto out;
3241 }
3242 extent_physical = bbio->stripes[0].physical;
3243 extent_mirror_num = bbio->mirror_num;
3244 extent_dev = bbio->stripes[0].dev;
3245 btrfs_put_bbio(bbio);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003246
3247 ret = btrfs_lookup_csums_range(csum_root,
3248 extent_logical,
3249 extent_logical + extent_len - 1,
3250 &sctx->csum_list, 1);
3251 if (ret)
3252 goto out;
3253
3254 ret = scrub_extent_for_parity(sparity, extent_logical,
3255 extent_len,
3256 extent_physical,
3257 extent_dev, flags,
3258 generation,
3259 extent_mirror_num);
Zhao Lei6fa96d72015-07-21 12:22:30 +08003260
3261 scrub_free_csums(sctx);
3262
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003263 if (ret)
3264 goto out;
3265
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003266 if (extent_logical + extent_len <
3267 key.objectid + bytes) {
3268 logic_start += map->stripe_len;
3269
3270 if (logic_start >= logic_end) {
3271 stop_loop = 1;
3272 break;
3273 }
3274
3275 if (logic_start < key.objectid + bytes) {
3276 cond_resched();
3277 goto again;
3278 }
3279 }
3280next:
3281 path->slots[0]++;
3282 }
3283
3284 btrfs_release_path(path);
3285
3286 if (stop_loop)
3287 break;
3288
3289 logic_start += map->stripe_len;
3290 }
3291out:
3292 if (ret < 0)
3293 scrub_parity_mark_sectors_error(sparity, logic_start,
Zhao Leia0dd59d2015-07-21 15:42:26 +08003294 logic_end - logic_start);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003295 scrub_parity_put(sparity);
3296 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003297 mutex_lock(&sctx->wr_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003298 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003299 mutex_unlock(&sctx->wr_lock);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003300
3301 btrfs_release_path(path);
3302 return ret < 0 ? ret : 0;
3303}
3304
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003305static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003306 struct map_lookup *map,
3307 struct btrfs_device *scrub_dev,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003308 int num, u64 base, u64 length,
3309 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003310{
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003311 struct btrfs_path *path, *ppath;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003312 struct btrfs_fs_info *fs_info = sctx->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01003313 struct btrfs_root *root = fs_info->extent_root;
3314 struct btrfs_root *csum_root = fs_info->csum_root;
3315 struct btrfs_extent_item *extent;
Arne Jansene7786c32011-05-28 20:58:38 +00003316 struct blk_plug plug;
Arne Jansena2de7332011-03-08 14:14:00 +01003317 u64 flags;
3318 int ret;
3319 int slot;
Arne Jansena2de7332011-03-08 14:14:00 +01003320 u64 nstripes;
Arne Jansena2de7332011-03-08 14:14:00 +01003321 struct extent_buffer *l;
Arne Jansena2de7332011-03-08 14:14:00 +01003322 u64 physical;
3323 u64 logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00003324 u64 logic_end;
Wang Shilong3b080b22014-04-01 18:01:43 +08003325 u64 physical_end;
Arne Jansena2de7332011-03-08 14:14:00 +01003326 u64 generation;
Jan Schmidte12fa9c2011-06-17 15:55:21 +02003327 int mirror_num;
Arne Jansen7a262852011-06-10 12:39:23 +02003328 struct reada_control *reada1;
3329 struct reada_control *reada2;
David Sterbae6c11f92016-03-24 18:00:53 +01003330 struct btrfs_key key;
Arne Jansen7a262852011-06-10 12:39:23 +02003331 struct btrfs_key key_end;
Arne Jansena2de7332011-03-08 14:14:00 +01003332 u64 increment = map->stripe_len;
3333 u64 offset;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003334 u64 extent_logical;
3335 u64 extent_physical;
3336 u64 extent_len;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003337 u64 stripe_logical;
3338 u64 stripe_end;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003339 struct btrfs_device *extent_dev;
3340 int extent_mirror_num;
Wang Shilong3b080b22014-04-01 18:01:43 +08003341 int stop_loop = 0;
David Woodhouse53b381b2013-01-29 18:40:14 -05003342
Wang Shilong3b080b22014-04-01 18:01:43 +08003343 physical = map->stripes[num].physical;
Arne Jansena2de7332011-03-08 14:14:00 +01003344 offset = 0;
Liu Bo42c61ab2017-04-03 13:45:24 -07003345 nstripes = div64_u64(length, map->stripe_len);
Arne Jansena2de7332011-03-08 14:14:00 +01003346 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3347 offset = map->stripe_len * num;
3348 increment = map->stripe_len * map->num_stripes;
Jan Schmidt193ea742011-06-13 19:56:54 +02003349 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003350 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3351 int factor = map->num_stripes / map->sub_stripes;
3352 offset = map->stripe_len * (num / map->sub_stripes);
3353 increment = map->stripe_len * factor;
Jan Schmidt193ea742011-06-13 19:56:54 +02003354 mirror_num = num % map->sub_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003355 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3356 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003357 mirror_num = num % map->num_stripes + 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003358 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3359 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003360 mirror_num = num % map->num_stripes + 1;
Zhao Leiffe2d202015-01-20 15:11:44 +08003361 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003362 get_raid56_logic_offset(physical, num, map, &offset, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003363 increment = map->stripe_len * nr_data_stripes(map);
3364 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003365 } else {
3366 increment = map->stripe_len;
Jan Schmidt193ea742011-06-13 19:56:54 +02003367 mirror_num = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003368 }
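/*
 * Example for the layout math above, assuming RAID10 over four devices
 * with two sub-stripes and a 64K stripe_len: for num == 3, factor == 2,
 * so offset == 64K, increment == 128K and mirror_num == 2; device 3
 * holds the second copy of every other 64K stripe, starting 64K into
 * the chunk.
 */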
3369
3370 path = btrfs_alloc_path();
3371 if (!path)
3372 return -ENOMEM;
3373
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003374 ppath = btrfs_alloc_path();
3375 if (!ppath) {
Tsutomu Itoh379d6852015-01-09 17:37:52 +09003376 btrfs_free_path(path);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003377 return -ENOMEM;
3378 }
3379
Stefan Behrensb5d67f62012-03-27 14:21:27 -04003380 /*
 3381	 * work on the commit root. The related disk blocks are static as
 3382	 * long as COW is applied. This means it is safe to rewrite
 3383	 * them to repair disk errors without any race conditions.
3384 */
Arne Jansena2de7332011-03-08 14:14:00 +01003385 path->search_commit_root = 1;
3386 path->skip_locking = 1;
3387
Gui Hecheng063c54d2015-01-09 09:39:40 +08003388 ppath->search_commit_root = 1;
3389 ppath->skip_locking = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003390 /*
Arne Jansen7a262852011-06-10 12:39:23 +02003391	 * trigger readahead for the extent tree and the csum tree and wait
 3392	 * for completion. During readahead, the scrub is officially paused
 3393	 * so that it does not hold off transaction commits.
Arne Jansena2de7332011-03-08 14:14:00 +01003394 */
3395 logical = base + offset;
Wang Shilong3b080b22014-04-01 18:01:43 +08003396 physical_end = physical + nstripes * map->stripe_len;
Zhao Leiffe2d202015-01-20 15:11:44 +08003397 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Wang Shilong3b080b22014-04-01 18:01:43 +08003398 get_raid56_logic_offset(physical_end, num,
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003399 map, &logic_end, NULL);
Wang Shilong3b080b22014-04-01 18:01:43 +08003400 logic_end += base;
3401 } else {
3402 logic_end = logical + increment * nstripes;
3403 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003404 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003405 atomic_read(&sctx->bios_in_flight) == 0);
Wang Shilongcb7ab022013-12-04 21:16:53 +08003406 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003407
Arne Jansen7a262852011-06-10 12:39:23 +02003408 /* FIXME it might be better to start readahead at commit root */
David Sterbae6c11f92016-03-24 18:00:53 +01003409 key.objectid = logical;
3410 key.type = BTRFS_EXTENT_ITEM_KEY;
3411 key.offset = (u64)0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003412 key_end.objectid = logic_end;
Josef Bacik3173a182013-03-07 14:22:04 -05003413 key_end.type = BTRFS_METADATA_ITEM_KEY;
3414 key_end.offset = (u64)-1;
David Sterbae6c11f92016-03-24 18:00:53 +01003415 reada1 = btrfs_reada_add(root, &key, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003416
David Sterbae6c11f92016-03-24 18:00:53 +01003417 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3418 key.type = BTRFS_EXTENT_CSUM_KEY;
3419 key.offset = logical;
Arne Jansen7a262852011-06-10 12:39:23 +02003420 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3421 key_end.type = BTRFS_EXTENT_CSUM_KEY;
Wang Shilong3b080b22014-04-01 18:01:43 +08003422 key_end.offset = logic_end;
David Sterbae6c11f92016-03-24 18:00:53 +01003423 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
Arne Jansena2de7332011-03-08 14:14:00 +01003424
Arne Jansen7a262852011-06-10 12:39:23 +02003425 if (!IS_ERR(reada1))
3426 btrfs_reada_wait(reada1);
3427 if (!IS_ERR(reada2))
3428 btrfs_reada_wait(reada2);
Arne Jansena2de7332011-03-08 14:14:00 +01003429
Arne Jansena2de7332011-03-08 14:14:00 +01003430
3431 /*
3432 * collect all data csums for the stripe to avoid seeking during
 3433	 * the scrub. This might currently (crc32) end up being about 1MB.
3434 */
Arne Jansene7786c32011-05-28 20:58:38 +00003435 blk_start_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003436
Arne Jansena2de7332011-03-08 14:14:00 +01003437 /*
3438 * now find all extents for each stripe and scrub them
3439 */
Arne Jansena2de7332011-03-08 14:14:00 +01003440 ret = 0;
Wang Shilong3b080b22014-04-01 18:01:43 +08003441 while (physical < physical_end) {
Arne Jansena2de7332011-03-08 14:14:00 +01003442 /*
3443 * canceled?
3444 */
3445 if (atomic_read(&fs_info->scrub_cancel_req) ||
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003446 atomic_read(&sctx->cancel_req)) {
Arne Jansena2de7332011-03-08 14:14:00 +01003447 ret = -ECANCELED;
3448 goto out;
3449 }
3450 /*
3451 * check to see if we have to pause
3452 */
3453 if (atomic_read(&fs_info->scrub_pause_req)) {
3454 /* push queued extents */
David Sterba3fb99302017-05-16 19:10:32 +02003455 atomic_set(&sctx->flush_all_writes, 1);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003456 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003457 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003458 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003459 mutex_unlock(&sctx->wr_lock);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003460 wait_event(sctx->list_wait,
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01003461 atomic_read(&sctx->bios_in_flight) == 0);
David Sterba3fb99302017-05-16 19:10:32 +02003462 atomic_set(&sctx->flush_all_writes, 0);
Wang Shilong3cb09292013-12-04 21:15:19 +08003463 scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01003464 }
3465
Zhao Leif2f66a22015-07-21 12:22:29 +08003466 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3467 ret = get_raid56_logic_offset(physical, num, map,
3468 &logical,
3469 &stripe_logical);
3470 logical += base;
3471 if (ret) {
Zhao Lei79553232015-08-18 17:54:30 +08003472				/* it is a parity stripe */
Zhao Leif2f66a22015-07-21 12:22:29 +08003473 stripe_logical += base;
Zhao Leia0dd59d2015-07-21 15:42:26 +08003474 stripe_end = stripe_logical + increment;
Zhao Leif2f66a22015-07-21 12:22:29 +08003475 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3476 ppath, stripe_logical,
3477 stripe_end);
3478 if (ret)
3479 goto out;
3480 goto skip;
3481 }
3482 }
3483
Wang Shilong7c76edb2014-01-12 21:38:32 +08003484 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3485 key.type = BTRFS_METADATA_ITEM_KEY;
3486 else
3487 key.type = BTRFS_EXTENT_ITEM_KEY;
Arne Jansena2de7332011-03-08 14:14:00 +01003488 key.objectid = logical;
Liu Bo625f1c8d2013-04-27 02:56:57 +00003489 key.offset = (u64)-1;
Arne Jansena2de7332011-03-08 14:14:00 +01003490
3491 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3492 if (ret < 0)
3493 goto out;
Josef Bacik3173a182013-03-07 14:22:04 -05003494
Arne Jansen8c510322011-06-03 10:09:26 +02003495 if (ret > 0) {
Wang Shilongade2e0b2014-01-12 21:38:33 +08003496 ret = btrfs_previous_extent_item(root, path, 0);
Arne Jansena2de7332011-03-08 14:14:00 +01003497 if (ret < 0)
3498 goto out;
Arne Jansen8c510322011-06-03 10:09:26 +02003499 if (ret > 0) {
3500 /* there's no smaller item, so stick with the
3501 * larger one */
3502 btrfs_release_path(path);
3503 ret = btrfs_search_slot(NULL, root, &key,
3504 path, 0, 0);
3505 if (ret < 0)
3506 goto out;
3507 }
Arne Jansena2de7332011-03-08 14:14:00 +01003508 }
3509
Liu Bo625f1c8d2013-04-27 02:56:57 +00003510 stop_loop = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003511 while (1) {
Josef Bacik3173a182013-03-07 14:22:04 -05003512 u64 bytes;
3513
Arne Jansena2de7332011-03-08 14:14:00 +01003514 l = path->nodes[0];
3515 slot = path->slots[0];
3516 if (slot >= btrfs_header_nritems(l)) {
3517 ret = btrfs_next_leaf(root, path);
3518 if (ret == 0)
3519 continue;
3520 if (ret < 0)
3521 goto out;
3522
Liu Bo625f1c8d2013-04-27 02:56:57 +00003523 stop_loop = 1;
Arne Jansena2de7332011-03-08 14:14:00 +01003524 break;
3525 }
3526 btrfs_item_key_to_cpu(l, &key, slot);
3527
Zhao Leid7cad232015-07-22 13:14:48 +08003528 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3529 key.type != BTRFS_METADATA_ITEM_KEY)
3530 goto next;
3531
Josef Bacik3173a182013-03-07 14:22:04 -05003532 if (key.type == BTRFS_METADATA_ITEM_KEY)
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003533 bytes = fs_info->nodesize;
Josef Bacik3173a182013-03-07 14:22:04 -05003534 else
3535 bytes = key.offset;
3536
3537 if (key.objectid + bytes <= logical)
Arne Jansena2de7332011-03-08 14:14:00 +01003538 goto next;
3539
Liu Bo625f1c8d2013-04-27 02:56:57 +00003540 if (key.objectid >= logical + map->stripe_len) {
3541 /* out of this device extent */
3542 if (key.objectid >= logic_end)
3543 stop_loop = 1;
3544 break;
3545 }
Arne Jansena2de7332011-03-08 14:14:00 +01003546
3547 extent = btrfs_item_ptr(l, slot,
3548 struct btrfs_extent_item);
3549 flags = btrfs_extent_flags(l, extent);
3550 generation = btrfs_extent_generation(l, extent);
3551
Zhao Leia323e812015-07-23 12:29:49 +08003552 if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3553 (key.objectid < logical ||
3554 key.objectid + bytes >
3555 logical + map->stripe_len)) {
Frank Holtonefe120a2013-12-20 11:37:06 -05003556 btrfs_err(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003557 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
Geert Uytterhoevenc1c9ff72013-08-20 13:20:07 +02003558 key.objectid, logical);
Zhao Lei9799d2c32015-08-25 21:31:40 +08003559 spin_lock(&sctx->stat_lock);
3560 sctx->stat.uncorrectable_errors++;
3561 spin_unlock(&sctx->stat_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003562 goto next;
3563 }
3564
Liu Bo625f1c8d2013-04-27 02:56:57 +00003565again:
3566 extent_logical = key.objectid;
3567 extent_len = bytes;
3568
Arne Jansena2de7332011-03-08 14:14:00 +01003569 /*
3570 * trim extent to this stripe
3571 */
Liu Bo625f1c8d2013-04-27 02:56:57 +00003572 if (extent_logical < logical) {
3573 extent_len -= logical - extent_logical;
3574 extent_logical = logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003575 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003576 if (extent_logical + extent_len >
Arne Jansena2de7332011-03-08 14:14:00 +01003577 logical + map->stripe_len) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003578 extent_len = logical + map->stripe_len -
3579 extent_logical;
Arne Jansena2de7332011-03-08 14:14:00 +01003580 }
3581
Liu Bo625f1c8d2013-04-27 02:56:57 +00003582 extent_physical = extent_logical - logical + physical;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003583 extent_dev = scrub_dev;
3584 extent_mirror_num = mirror_num;
3585 if (is_dev_replace)
3586 scrub_remap_extent(fs_info, extent_logical,
3587 extent_len, &extent_physical,
3588 &extent_dev,
3589 &extent_mirror_num);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003590
Zhao Leife8cf652015-07-22 13:14:47 +08003591 ret = btrfs_lookup_csums_range(csum_root,
3592 extent_logical,
3593 extent_logical +
3594 extent_len - 1,
3595 &sctx->csum_list, 1);
Arne Jansena2de7332011-03-08 14:14:00 +01003596 if (ret)
3597 goto out;
3598
Liu Bo625f1c8d2013-04-27 02:56:57 +00003599 ret = scrub_extent(sctx, extent_logical, extent_len,
3600 extent_physical, extent_dev, flags,
3601 generation, extent_mirror_num,
Stefan Behrens115930c2013-07-04 16:14:23 +02003602 extent_logical - logical + physical);
Zhao Lei6fa96d72015-07-21 12:22:30 +08003603
3604 scrub_free_csums(sctx);
3605
Liu Bo625f1c8d2013-04-27 02:56:57 +00003606 if (ret)
3607 goto out;
3608
3609 if (extent_logical + extent_len <
3610 key.objectid + bytes) {
Zhao Leiffe2d202015-01-20 15:11:44 +08003611 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
Wang Shilong3b080b22014-04-01 18:01:43 +08003612 /*
 3613					 * loop until we find the next data stripe
 3614					 * or we have finished all stripes.
3615 */
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003616loop:
3617 physical += map->stripe_len;
3618 ret = get_raid56_logic_offset(physical,
3619 num, map, &logical,
3620 &stripe_logical);
3621 logical += base;
3622
3623 if (ret && physical < physical_end) {
3624 stripe_logical += base;
3625 stripe_end = stripe_logical +
Zhao Leia0dd59d2015-07-21 15:42:26 +08003626 increment;
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003627 ret = scrub_raid56_parity(sctx,
3628 map, scrub_dev, ppath,
3629 stripe_logical,
3630 stripe_end);
3631 if (ret)
3632 goto out;
3633 goto loop;
3634 }
Wang Shilong3b080b22014-04-01 18:01:43 +08003635 } else {
3636 physical += map->stripe_len;
3637 logical += increment;
3638 }
Liu Bo625f1c8d2013-04-27 02:56:57 +00003639 if (logical < key.objectid + bytes) {
3640 cond_resched();
3641 goto again;
3642 }
3643
Wang Shilong3b080b22014-04-01 18:01:43 +08003644 if (physical >= physical_end) {
Liu Bo625f1c8d2013-04-27 02:56:57 +00003645 stop_loop = 1;
3646 break;
3647 }
3648 }
Arne Jansena2de7332011-03-08 14:14:00 +01003649next:
3650 path->slots[0]++;
3651 }
Chris Mason71267332011-05-23 06:30:52 -04003652 btrfs_release_path(path);
Wang Shilong3b080b22014-04-01 18:01:43 +08003653skip:
Arne Jansena2de7332011-03-08 14:14:00 +01003654 logical += increment;
3655 physical += map->stripe_len;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003656 spin_lock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003657 if (stop_loop)
3658 sctx->stat.last_physical = map->stripes[num].physical +
3659 length;
3660 else
3661 sctx->stat.last_physical = physical;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003662 spin_unlock(&sctx->stat_lock);
Liu Bo625f1c8d2013-04-27 02:56:57 +00003663 if (stop_loop)
3664 break;
Arne Jansena2de7332011-03-08 14:14:00 +01003665 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01003666out:
Arne Jansena2de7332011-03-08 14:14:00 +01003667 /* push queued extents */
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003668 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003669 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003670 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003671 mutex_unlock(&sctx->wr_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01003672
Arne Jansene7786c32011-05-28 20:58:38 +00003673 blk_finish_plug(&plug);
Arne Jansena2de7332011-03-08 14:14:00 +01003674 btrfs_free_path(path);
Miao Xie5a6ac9e2014-11-06 17:20:58 +08003675 btrfs_free_path(ppath);
Arne Jansena2de7332011-03-08 14:14:00 +01003676 return ret < 0 ? ret : 0;
3677}
3678
Stefan Behrensd9d181c2012-11-02 09:58:09 +01003679static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003680 struct btrfs_device *scrub_dev,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003681 u64 chunk_offset, u64 length,
Filipe Manana020d5b72015-11-19 10:57:20 +00003682 u64 dev_offset,
3683 struct btrfs_block_group_cache *cache,
3684 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003685{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04003686 struct btrfs_fs_info *fs_info = sctx->fs_info;
3687 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
Arne Jansena2de7332011-03-08 14:14:00 +01003688 struct map_lookup *map;
3689 struct extent_map *em;
3690 int i;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003691 int ret = 0;
Arne Jansena2de7332011-03-08 14:14:00 +01003692
3693 read_lock(&map_tree->map_tree.lock);
3694 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3695 read_unlock(&map_tree->map_tree.lock);
3696
Filipe Manana020d5b72015-11-19 10:57:20 +00003697 if (!em) {
3698 /*
3699 * Might have been an unused block group deleted by the cleaner
3700 * kthread or relocation.
3701 */
3702 spin_lock(&cache->lock);
3703 if (!cache->removed)
3704 ret = -EINVAL;
3705 spin_unlock(&cache->lock);
3706
3707 return ret;
3708 }
Arne Jansena2de7332011-03-08 14:14:00 +01003709
Jeff Mahoney95617d62015-06-03 10:55:48 -04003710 map = em->map_lookup;
Arne Jansena2de7332011-03-08 14:14:00 +01003711 if (em->start != chunk_offset)
3712 goto out;
3713
3714 if (em->len < length)
3715 goto out;
3716
3717 for (i = 0; i < map->num_stripes; ++i) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003718 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
Arne Jansen859acaf2012-02-09 15:09:02 +01003719 map->stripes[i].physical == dev_offset) {
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003720 ret = scrub_stripe(sctx, map, scrub_dev, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003721 chunk_offset, length,
3722 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01003723 if (ret)
3724 goto out;
3725 }
3726 }
3727out:
3728 free_extent_map(em);
3729
3730 return ret;
3731}
3732
3733static noinline_for_stack
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003734int scrub_enumerate_chunks(struct scrub_ctx *sctx,
Stefan Behrensff023aa2012-11-06 11:43:11 +01003735 struct btrfs_device *scrub_dev, u64 start, u64 end,
3736 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01003737{
3738 struct btrfs_dev_extent *dev_extent = NULL;
3739 struct btrfs_path *path;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003740 struct btrfs_fs_info *fs_info = sctx->fs_info;
3741 struct btrfs_root *root = fs_info->dev_root;
Arne Jansena2de7332011-03-08 14:14:00 +01003742 u64 length;
Arne Jansena2de7332011-03-08 14:14:00 +01003743 u64 chunk_offset;
Zhaolei55e3a602015-08-05 16:43:30 +08003744 int ret = 0;
Zhaolei76a8efa2015-11-17 18:46:17 +08003745 int ro_set;
Arne Jansena2de7332011-03-08 14:14:00 +01003746 int slot;
3747 struct extent_buffer *l;
3748 struct btrfs_key key;
3749 struct btrfs_key found_key;
3750 struct btrfs_block_group_cache *cache;
Stefan Behrensff023aa2012-11-06 11:43:11 +01003751 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
Arne Jansena2de7332011-03-08 14:14:00 +01003752
3753 path = btrfs_alloc_path();
3754 if (!path)
3755 return -ENOMEM;
3756
David Sterbae4058b52015-11-27 16:31:35 +01003757 path->reada = READA_FORWARD;
Arne Jansena2de7332011-03-08 14:14:00 +01003758 path->search_commit_root = 1;
3759 path->skip_locking = 1;
3760
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003761 key.objectid = scrub_dev->devid;
Arne Jansena2de7332011-03-08 14:14:00 +01003762 key.offset = 0ull;
3763 key.type = BTRFS_DEV_EXTENT_KEY;
3764
Arne Jansena2de7332011-03-08 14:14:00 +01003765 while (1) {
3766 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3767 if (ret < 0)
Arne Jansen8c510322011-06-03 10:09:26 +02003768 break;
3769 if (ret > 0) {
3770 if (path->slots[0] >=
3771 btrfs_header_nritems(path->nodes[0])) {
3772 ret = btrfs_next_leaf(root, path);
Zhaolei55e3a602015-08-05 16:43:30 +08003773 if (ret < 0)
Arne Jansen8c510322011-06-03 10:09:26 +02003774 break;
Zhaolei55e3a602015-08-05 16:43:30 +08003775 if (ret > 0) {
3776 ret = 0;
3777 break;
3778 }
3779 } else {
3780 ret = 0;
Arne Jansen8c510322011-06-03 10:09:26 +02003781 }
3782 }
Arne Jansena2de7332011-03-08 14:14:00 +01003783
3784 l = path->nodes[0];
3785 slot = path->slots[0];
3786
3787 btrfs_item_key_to_cpu(l, &found_key, slot);
3788
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003789 if (found_key.objectid != scrub_dev->devid)
Arne Jansena2de7332011-03-08 14:14:00 +01003790 break;
3791
David Sterba962a2982014-06-04 18:41:45 +02003792 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
Arne Jansena2de7332011-03-08 14:14:00 +01003793 break;
3794
3795 if (found_key.offset >= end)
3796 break;
3797
3798 if (found_key.offset < key.offset)
3799 break;
3800
3801 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3802 length = btrfs_dev_extent_length(l, dev_extent);
3803
Qu Wenruoced96ed2014-06-19 10:42:51 +08003804 if (found_key.offset + length <= start)
3805 goto skip;
Arne Jansena2de7332011-03-08 14:14:00 +01003806
Arne Jansena2de7332011-03-08 14:14:00 +01003807 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3808
3809 /*
3810 * get a reference on the corresponding block group to prevent
3811 * the chunk from going away while we scrub it
3812 */
3813 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
Qu Wenruoced96ed2014-06-19 10:42:51 +08003814
 3815		/* some chunks were removed but not yet committed to disk,
 3816		 * continue scrubbing */
3817 if (!cache)
3818 goto skip;
3819
Zhaolei55e3a602015-08-05 16:43:30 +08003820 /*
3821 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3822 * to avoid deadlock caused by:
3823 * btrfs_inc_block_group_ro()
3824 * -> btrfs_wait_for_commit()
3825 * -> btrfs_commit_transaction()
3826 * -> btrfs_scrub_pause()
3827 */
3828 scrub_pause_on(fs_info);
Jeff Mahoney5e00f192017-02-15 16:28:29 -05003829 ret = btrfs_inc_block_group_ro(fs_info, cache);
Filipe Mananaf0e9b7d2016-05-14 09:12:53 +01003830 if (!ret && is_dev_replace) {
3831 /*
3832 * If we are doing a device replace wait for any tasks
 3833		 * that started delalloc right before we set the block
3834 * group to RO mode, as they might have just allocated
3835 * an extent from it or decided they could do a nocow
3836 * write. And if any such tasks did that, wait for their
3837 * ordered extents to complete and then commit the
3838 * current transaction, so that we can later see the new
3839 * extent items in the extent tree - the ordered extents
3840 * create delayed data references (for cow writes) when
3841 * they complete, which will be run and insert the
3842 * corresponding extent items into the extent tree when
3843 * we commit the transaction they used when running
3844 * inode.c:btrfs_finish_ordered_io(). We later use
3845 * the commit root of the extent tree to find extents
3846 * to copy from the srcdev into the tgtdev, and we don't
3847 * want to miss any new extents.
3848 */
3849 btrfs_wait_block_group_reservations(cache);
3850 btrfs_wait_nocow_writers(cache);
3851 ret = btrfs_wait_ordered_roots(fs_info, -1,
3852 cache->key.objectid,
3853 cache->key.offset);
3854 if (ret > 0) {
3855 struct btrfs_trans_handle *trans;
3856
3857 trans = btrfs_join_transaction(root);
3858 if (IS_ERR(trans))
3859 ret = PTR_ERR(trans);
3860 else
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04003861 ret = btrfs_commit_transaction(trans);
Filipe Mananaf0e9b7d2016-05-14 09:12:53 +01003862 if (ret) {
3863 scrub_pause_off(fs_info);
3864 btrfs_put_block_group(cache);
3865 break;
3866 }
3867 }
3868 }
Zhaolei55e3a602015-08-05 16:43:30 +08003869 scrub_pause_off(fs_info);
Zhaolei76a8efa2015-11-17 18:46:17 +08003870
3871 if (ret == 0) {
3872 ro_set = 1;
3873 } else if (ret == -ENOSPC) {
3874 /*
3875			 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3876			 * fails to create a new chunk for metadata.
3877			 * This is not a problem for scrub/replace, because
3878			 * metadata is always COWed, and our scrub has paused
3879			 * transaction commits.
3880 */
3881 ro_set = 0;
3882 } else {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04003883 btrfs_warn(fs_info,
3884				   "failed setting block group ro, ret=%d",
Zhaolei76a8efa2015-11-17 18:46:17 +08003885 ret);
Zhaolei55e3a602015-08-05 16:43:30 +08003886 btrfs_put_block_group(cache);
3887 break;
3888 }
3889
Filipe Manana81e87a72016-05-14 16:32:35 +01003890 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003891 dev_replace->cursor_right = found_key.offset + length;
3892 dev_replace->cursor_left = found_key.offset;
3893 dev_replace->item_needs_writeback = 1;
Filipe Manana81e87a72016-05-14 16:32:35 +01003894 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
Zhao Lei8c204c92015-08-19 15:02:40 +08003895 ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
Filipe Manana020d5b72015-11-19 10:57:20 +00003896 found_key.offset, cache, is_dev_replace);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003897
3898 /*
3899		 * flush and submit all pending read and write bios, then
3900		 * wait for them to complete.
3901 * Note that in the dev replace case, a read request causes
3902 * write requests that are submitted in the read completion
3903 * worker. Therefore in the current situation, it is required
3904 * that all write requests are flushed, so that all read and
3905 * write requests are really completed when bios_in_flight
3906 * changes to 0.
3907 */
David Sterba3fb99302017-05-16 19:10:32 +02003908 atomic_set(&sctx->flush_all_writes, 1);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003909 scrub_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003910 mutex_lock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003911 scrub_wr_submit(sctx);
David Sterba3fb99302017-05-16 19:10:32 +02003912 mutex_unlock(&sctx->wr_lock);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003913
3914 wait_event(sctx->list_wait,
3915 atomic_read(&sctx->bios_in_flight) == 0);
Zhaoleib708ce92015-08-05 16:43:29 +08003916
3917 scrub_pause_on(fs_info);
Wang Shilong12cf9372014-02-19 19:24:17 +08003918
3919 /*
3920 * must be called before we decrease @scrub_paused.
3921 * make sure we don't block transaction commit while
3922		 * we are waiting for pending workers to finish.
3923 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01003924 wait_event(sctx->list_wait,
3925 atomic_read(&sctx->workers_pending) == 0);
David Sterba3fb99302017-05-16 19:10:32 +02003926 atomic_set(&sctx->flush_all_writes, 0);
Wang Shilong12cf9372014-02-19 19:24:17 +08003927
Zhaoleib708ce92015-08-05 16:43:29 +08003928 scrub_pause_off(fs_info);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003929
Filipe Manana1a1a8b72016-05-14 19:44:40 +01003930 btrfs_dev_replace_lock(&fs_info->dev_replace, 1);
3931 dev_replace->cursor_left = dev_replace->cursor_right;
3932 dev_replace->item_needs_writeback = 1;
3933 btrfs_dev_replace_unlock(&fs_info->dev_replace, 1);
3934
Zhaolei76a8efa2015-11-17 18:46:17 +08003935 if (ro_set)
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003936 btrfs_dec_block_group_ro(cache);
Stefan Behrensff023aa2012-11-06 11:43:11 +01003937
Filipe Manana758f2df2015-11-19 11:45:48 +00003938 /*
3939 * We might have prevented the cleaner kthread from deleting
3940 * this block group if it was already unused because we raced
3941 * and set it to RO mode first. So add it back to the unused
3942 * list, otherwise it might not ever be deleted unless a manual
3943 * balance is triggered or it becomes used and unused again.
3944 */
3945 spin_lock(&cache->lock);
3946 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3947 btrfs_block_group_used(&cache->item) == 0) {
3948 spin_unlock(&cache->lock);
3949 spin_lock(&fs_info->unused_bgs_lock);
3950 if (list_empty(&cache->bg_list)) {
3951 btrfs_get_block_group(cache);
3952 list_add_tail(&cache->bg_list,
3953 &fs_info->unused_bgs);
3954 }
3955 spin_unlock(&fs_info->unused_bgs_lock);
3956 } else {
3957 spin_unlock(&cache->lock);
3958 }
3959
Arne Jansena2de7332011-03-08 14:14:00 +01003960 btrfs_put_block_group(cache);
3961 if (ret)
3962 break;
Stefan Behrensaf1be4f2012-11-27 17:39:51 +00003963 if (is_dev_replace &&
3964 atomic64_read(&dev_replace->num_write_errors) > 0) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01003965 ret = -EIO;
3966 break;
3967 }
3968 if (sctx->stat.malloc_errors > 0) {
3969 ret = -ENOMEM;
3970 break;
3971 }
Qu Wenruoced96ed2014-06-19 10:42:51 +08003972skip:
Arne Jansena2de7332011-03-08 14:14:00 +01003973 key.offset = found_key.offset + length;
Chris Mason71267332011-05-23 06:30:52 -04003974 btrfs_release_path(path);
Arne Jansena2de7332011-03-08 14:14:00 +01003975 }
3976
Arne Jansena2de7332011-03-08 14:14:00 +01003977 btrfs_free_path(path);
Arne Jansen8c510322011-06-03 10:09:26 +02003978
Zhaolei55e3a602015-08-05 16:43:30 +08003979 return ret;
Arne Jansena2de7332011-03-08 14:14:00 +01003980}
3981
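/*
 * Scrub all super block copies of @scrub_dev that lie within the device's
 * committed size, then wait for the submitted bios to complete.
 */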
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01003982static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3983 struct btrfs_device *scrub_dev)
Arne Jansena2de7332011-03-08 14:14:00 +01003984{
3985 int i;
3986 u64 bytenr;
3987 u64 gen;
3988 int ret;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003989 struct btrfs_fs_info *fs_info = sctx->fs_info;
Arne Jansena2de7332011-03-08 14:14:00 +01003990
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003991 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003992 return -EIO;
3993
Miao Xie5f546062014-07-24 11:37:09 +08003994	/* Seed devices of a new filesystem have their own generation. */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003995 if (scrub_dev->fs_devices != fs_info->fs_devices)
Miao Xie5f546062014-07-24 11:37:09 +08003996 gen = scrub_dev->generation;
3997 else
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003998 gen = fs_info->last_trans_committed;
Arne Jansena2de7332011-03-08 14:14:00 +01003999
4000 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4001 bytenr = btrfs_sb_offset(i);
Miao Xie935e5cc2014-09-03 21:35:33 +08004002 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4003 scrub_dev->commit_total_bytes)
Arne Jansena2de7332011-03-08 14:14:00 +01004004 break;
4005
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004006 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
Stefan Behrensa36cf8b2012-11-02 13:26:57 +01004007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
Stefan Behrensff023aa2012-11-06 11:43:11 +01004008 NULL, 1, bytenr);
Arne Jansena2de7332011-03-08 14:14:00 +01004009 if (ret)
4010 return ret;
4011 }
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004013
4014 return 0;
4015}
4016
4017/*
4018 * take a reference on fs_info->scrub_workers; start the workers if necessary
4019 */
Stefan Behrensff023aa2012-11-06 11:43:11 +01004020static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4021 int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01004022{
David Sterba6f011052015-02-16 18:34:01 +01004023 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
Qu Wenruo0339ef22014-02-28 10:46:17 +08004024 int max_active = fs_info->thread_pool_size;
Arne Jansena2de7332011-03-08 14:14:00 +01004025
Arne Jansen632dd772011-06-10 12:07:07 +02004026 if (fs_info->scrub_workers_refcnt == 0) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01004027 if (is_dev_replace)
Qu Wenruo0339ef22014-02-28 10:46:17 +08004028 fs_info->scrub_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004029 btrfs_alloc_workqueue(fs_info, "scrub", flags,
Qu Wenruo0339ef22014-02-28 10:46:17 +08004030 1, 4);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004031 else
Qu Wenruo0339ef22014-02-28 10:46:17 +08004032 fs_info->scrub_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004033 btrfs_alloc_workqueue(fs_info, "scrub", flags,
Qu Wenruo0339ef22014-02-28 10:46:17 +08004034 max_active, 4);
Zhao Leie82afc52015-06-12 20:36:58 +08004035 if (!fs_info->scrub_workers)
4036 goto fail_scrub_workers;
4037
Qu Wenruo0339ef22014-02-28 10:46:17 +08004038 fs_info->scrub_wr_completion_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004039 btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
Qu Wenruo0339ef22014-02-28 10:46:17 +08004040 max_active, 2);
Zhao Leie82afc52015-06-12 20:36:58 +08004041 if (!fs_info->scrub_wr_completion_workers)
4042 goto fail_scrub_wr_completion_workers;
4043
Qu Wenruo0339ef22014-02-28 10:46:17 +08004044 fs_info->scrub_nocow_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004045 btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
Zhao Leie82afc52015-06-12 20:36:58 +08004046 if (!fs_info->scrub_nocow_workers)
4047 goto fail_scrub_nocow_workers;
Zhao Lei20b2e302015-06-04 20:09:15 +08004048 fs_info->scrub_parity_workers =
Jeff Mahoneycb001092016-06-09 16:22:11 -04004049 btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
Zhao Lei20b2e302015-06-04 20:09:15 +08004050 max_active, 2);
Zhao Leie82afc52015-06-12 20:36:58 +08004051 if (!fs_info->scrub_parity_workers)
4052 goto fail_scrub_parity_workers;
Arne Jansen632dd772011-06-10 12:07:07 +02004053 }
Arne Jansena2de7332011-03-08 14:14:00 +01004054 ++fs_info->scrub_workers_refcnt;
Zhao Leie82afc52015-06-12 20:36:58 +08004055 return 0;
4056
4057fail_scrub_parity_workers:
4058 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4059fail_scrub_nocow_workers:
4060 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4061fail_scrub_wr_completion_workers:
4062 btrfs_destroy_workqueue(fs_info->scrub_workers);
4063fail_scrub_workers:
4064 return -ENOMEM;
Arne Jansena2de7332011-03-08 14:14:00 +01004065}
4066
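/*
 * Drop a reference on fs_info->scrub_workers and destroy the scrub
 * workqueues when the last reference is gone.
 */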
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004067static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004068{
Stefan Behrensff023aa2012-11-06 11:43:11 +01004069 if (--fs_info->scrub_workers_refcnt == 0) {
Qu Wenruo0339ef22014-02-28 10:46:17 +08004070 btrfs_destroy_workqueue(fs_info->scrub_workers);
4071 btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4072 btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
Zhao Lei20b2e302015-06-04 20:09:15 +08004073 btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004074 }
Arne Jansena2de7332011-03-08 14:14:00 +01004075 WARN_ON(fs_info->scrub_workers_refcnt < 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004076}
4077
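/*
 * Start a scrub or device replace run on one device: check the size
 * assumptions scrub relies on, look up and validate the device, set up the
 * workers and the scrub context, scrub the super blocks (regular scrub
 * only) and then walk the device's chunks.
 */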
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004078int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4079 u64 end, struct btrfs_scrub_progress *progress,
Stefan Behrens63a212a2012-11-05 18:29:28 +01004080 int readonly, int is_dev_replace)
Arne Jansena2de7332011-03-08 14:14:00 +01004081{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004082 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01004083 int ret;
4084 struct btrfs_device *dev;
Miao Xie5d68da32014-07-24 11:37:07 +08004085 struct rcu_string *name;
Arne Jansena2de7332011-03-08 14:14:00 +01004086
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004087 if (btrfs_fs_closing(fs_info))
Arne Jansena2de7332011-03-08 14:14:00 +01004088 return -EINVAL;
4089
Jeff Mahoneyda170662016-06-15 09:22:56 -04004090 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004091 /*
4092		 * in this case scrub is unable to calculate the checksum,
4093		 * given the way scrub is implemented. Do not handle this
4094		 * situation at all because it won't ever happen.
4095 */
Frank Holtonefe120a2013-12-20 11:37:06 -05004096 btrfs_err(fs_info,
4097 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004098 fs_info->nodesize,
4099 BTRFS_STRIPE_LEN);
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004100 return -EINVAL;
4101 }
4102
Jeff Mahoneyda170662016-06-15 09:22:56 -04004103 if (fs_info->sectorsize != PAGE_SIZE) {
Stefan Behrensb5d67f62012-03-27 14:21:27 -04004104 /* not supported for data w/o checksums */
Chandan Rajendra751bebbe2016-07-04 10:04:39 +05304105 btrfs_err_rl(fs_info,
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004106 "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004107 fs_info->sectorsize, PAGE_SIZE);
Arne Jansena2de7332011-03-08 14:14:00 +01004108 return -EINVAL;
4109 }
4110
Jeff Mahoneyda170662016-06-15 09:22:56 -04004111 if (fs_info->nodesize >
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004112 PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
Jeff Mahoneyda170662016-06-15 09:22:56 -04004113 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004114 /*
4115 * would exhaust the array bounds of pagev member in
4116 * struct scrub_block
4117 */
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004118 btrfs_err(fs_info,
4119 "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
Jeff Mahoneyda170662016-06-15 09:22:56 -04004120 fs_info->nodesize,
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004121 SCRUB_MAX_PAGES_PER_BLOCK,
Jeff Mahoneyda170662016-06-15 09:22:56 -04004122 fs_info->sectorsize,
Stefan Behrens7a9e9982012-11-02 14:58:04 +01004123 SCRUB_MAX_PAGES_PER_BLOCK);
4124 return -EINVAL;
4125 }
4126
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004128 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4129 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
Stefan Behrens63a212a2012-11-05 18:29:28 +01004130 if (!dev || (dev->missing && !is_dev_replace)) {
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004131 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004132 return -ENODEV;
4133 }
Arne Jansena2de7332011-03-08 14:14:00 +01004134
Miao Xie5d68da32014-07-24 11:37:07 +08004135 if (!is_dev_replace && !readonly && !dev->writeable) {
4136 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4137 rcu_read_lock();
4138 name = rcu_dereference(dev->name);
4139 btrfs_err(fs_info, "scrub: device %s is not writable",
4140 name->str);
4141 rcu_read_unlock();
4142 return -EROFS;
4143 }
4144
Wang Shilong3b7a0162013-10-12 02:11:12 +08004145 mutex_lock(&fs_info->scrub_lock);
Stefan Behrens63a212a2012-11-05 18:29:28 +01004146 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
Arne Jansena2de7332011-03-08 14:14:00 +01004147 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004148 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004149 return -EIO;
Arne Jansena2de7332011-03-08 14:14:00 +01004150 }
4151
Liu Bo73beece2015-07-17 16:49:19 +08004152 btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
Stefan Behrens8dabb742012-11-06 13:15:27 +01004153 if (dev->scrub_device ||
4154 (!is_dev_replace &&
4155 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
Liu Bo73beece2015-07-17 16:49:19 +08004156 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004157 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004158 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004159 return -EINPROGRESS;
4160 }
Liu Bo73beece2015-07-17 16:49:19 +08004161 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
Wang Shilong3b7a0162013-10-12 02:11:12 +08004162
4163 ret = scrub_workers_get(fs_info, is_dev_replace);
4164 if (ret) {
4165 mutex_unlock(&fs_info->scrub_lock);
4166 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4167 return ret;
4168 }
4169
Stefan Behrens63a212a2012-11-05 18:29:28 +01004170 sctx = scrub_setup_ctx(dev, is_dev_replace);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004171 if (IS_ERR(sctx)) {
Arne Jansena2de7332011-03-08 14:14:00 +01004172 mutex_unlock(&fs_info->scrub_lock);
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004173 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4174 scrub_workers_put(fs_info);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004175 return PTR_ERR(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01004176 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004177 sctx->readonly = readonly;
4178 dev->scrub_device = sctx;
Wang Shilong3cb09292013-12-04 21:15:19 +08004179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004180
Wang Shilong3cb09292013-12-04 21:15:19 +08004181 /*
4182	 * by checking @scrub_pause_req here, we can avoid a
4183	 * race between transaction commit and scrubbing.
4184 */
Wang Shilongcb7ab022013-12-04 21:16:53 +08004185 __scrub_blocked_if_needed(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01004186 atomic_inc(&fs_info->scrubs_running);
4187 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01004188
Stefan Behrensff023aa2012-11-06 11:43:11 +01004189 if (!is_dev_replace) {
Wang Shilong9b011ad2013-10-25 19:12:02 +08004190 /*
4191		 * by holding the device list mutex, we avoid racing with the
4192		 * super block writes that a log tree sync kicks off.
4193 */
Wang Shilong3cb09292013-12-04 21:15:19 +08004194 mutex_lock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004195 ret = scrub_supers(sctx, dev);
Wang Shilong3cb09292013-12-04 21:15:19 +08004196 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004197 }
Arne Jansena2de7332011-03-08 14:14:00 +01004198
4199 if (!ret)
Stefan Behrensff023aa2012-11-06 11:43:11 +01004200 ret = scrub_enumerate_chunks(sctx, dev, start, end,
4201 is_dev_replace);
Arne Jansena2de7332011-03-08 14:14:00 +01004202
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004203 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
Arne Jansena2de7332011-03-08 14:14:00 +01004204 atomic_dec(&fs_info->scrubs_running);
4205 wake_up(&fs_info->scrub_pause_wait);
4206
Stefan Behrensb6bfebc2012-11-02 16:44:58 +01004207 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
Jan Schmidt0ef8e452011-06-13 20:04:15 +02004208
Arne Jansena2de7332011-03-08 14:14:00 +01004209 if (progress)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004210 memcpy(progress, &sctx->stat, sizeof(*progress));
Arne Jansena2de7332011-03-08 14:14:00 +01004211
4212 mutex_lock(&fs_info->scrub_lock);
4213 dev->scrub_device = NULL;
Wang Shilong3b7a0162013-10-12 02:11:12 +08004214 scrub_workers_put(fs_info);
Arne Jansena2de7332011-03-08 14:14:00 +01004215 mutex_unlock(&fs_info->scrub_lock);
4216
Filipe Mananaf55985f2015-02-09 21:14:24 +00004217 scrub_put_ctx(sctx);
Arne Jansena2de7332011-03-08 14:14:00 +01004218
4219 return ret;
4220}
4221
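/*
 * Request all running scrubs to pause and wait until all of them have
 * reached the paused state.
 */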
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004222void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004223{
Arne Jansena2de7332011-03-08 14:14:00 +01004224 mutex_lock(&fs_info->scrub_lock);
4225 atomic_inc(&fs_info->scrub_pause_req);
4226 while (atomic_read(&fs_info->scrubs_paused) !=
4227 atomic_read(&fs_info->scrubs_running)) {
4228 mutex_unlock(&fs_info->scrub_lock);
4229 wait_event(fs_info->scrub_pause_wait,
4230 atomic_read(&fs_info->scrubs_paused) ==
4231 atomic_read(&fs_info->scrubs_running));
4232 mutex_lock(&fs_info->scrub_lock);
4233 }
4234 mutex_unlock(&fs_info->scrub_lock);
Arne Jansena2de7332011-03-08 14:14:00 +01004235}
4236
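/* Let paused scrubs resume. */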
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004237void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004238{
Arne Jansena2de7332011-03-08 14:14:00 +01004239 atomic_dec(&fs_info->scrub_pause_req);
4240 wake_up(&fs_info->scrub_pause_wait);
Arne Jansena2de7332011-03-08 14:14:00 +01004241}
4242
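/*
 * Cancel all running scrubs on this filesystem and wait until they have
 * finished.
 */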
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004243int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
Arne Jansena2de7332011-03-08 14:14:00 +01004244{
Arne Jansena2de7332011-03-08 14:14:00 +01004245 mutex_lock(&fs_info->scrub_lock);
4246 if (!atomic_read(&fs_info->scrubs_running)) {
4247 mutex_unlock(&fs_info->scrub_lock);
4248 return -ENOTCONN;
4249 }
4250
4251 atomic_inc(&fs_info->scrub_cancel_req);
4252 while (atomic_read(&fs_info->scrubs_running)) {
4253 mutex_unlock(&fs_info->scrub_lock);
4254 wait_event(fs_info->scrub_pause_wait,
4255 atomic_read(&fs_info->scrubs_running) == 0);
4256 mutex_lock(&fs_info->scrub_lock);
4257 }
4258 atomic_dec(&fs_info->scrub_cancel_req);
4259 mutex_unlock(&fs_info->scrub_lock);
4260
4261 return 0;
4262}
4263
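/*
 * Cancel the scrub running on @dev, if any, and wait until it has finished.
 */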
Stefan Behrensaa1b8cd2012-11-05 17:03:39 +01004264int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4265 struct btrfs_device *dev)
Jeff Mahoney49b25e02012-03-01 17:24:58 +01004266{
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004267 struct scrub_ctx *sctx;
Arne Jansena2de7332011-03-08 14:14:00 +01004268
4269 mutex_lock(&fs_info->scrub_lock);
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004270 sctx = dev->scrub_device;
4271 if (!sctx) {
Arne Jansena2de7332011-03-08 14:14:00 +01004272 mutex_unlock(&fs_info->scrub_lock);
4273 return -ENOTCONN;
4274 }
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004275 atomic_inc(&sctx->cancel_req);
Arne Jansena2de7332011-03-08 14:14:00 +01004276 while (dev->scrub_device) {
4277 mutex_unlock(&fs_info->scrub_lock);
4278 wait_event(fs_info->scrub_pause_wait,
4279 dev->scrub_device == NULL);
4280 mutex_lock(&fs_info->scrub_lock);
4281 }
4282 mutex_unlock(&fs_info->scrub_lock);
4283
4284 return 0;
4285}
Stefan Behrens1623ede2012-03-27 14:21:26 -04004286
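/*
 * Copy the progress counters of the scrub running on @devid, if any,
 * into @progress.
 */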
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04004287int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
Arne Jansena2de7332011-03-08 14:14:00 +01004288 struct btrfs_scrub_progress *progress)
4289{
4290 struct btrfs_device *dev;
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004291 struct scrub_ctx *sctx = NULL;
Arne Jansena2de7332011-03-08 14:14:00 +01004292
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004293 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4294 dev = btrfs_find_device(fs_info, devid, NULL, NULL);
Arne Jansena2de7332011-03-08 14:14:00 +01004295 if (dev)
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004296 sctx = dev->scrub_device;
4297 if (sctx)
4298 memcpy(progress, &sctx->stat, sizeof(*progress));
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004299 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
Arne Jansena2de7332011-03-08 14:14:00 +01004300
Stefan Behrensd9d181c2012-11-02 09:58:09 +01004301 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
Arne Jansena2de7332011-03-08 14:14:00 +01004302}
Stefan Behrensff023aa2012-11-06 11:43:11 +01004303
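/*
 * Map a logical extent to the physical location, device and mirror number
 * of its first stripe.
 */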
4304static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4305 u64 extent_logical, u64 extent_len,
4306 u64 *extent_physical,
4307 struct btrfs_device **extent_dev,
4308 int *extent_mirror_num)
4309{
4310 u64 mapped_length;
4311 struct btrfs_bio *bbio = NULL;
4312 int ret;
4313
4314 mapped_length = extent_len;
Christoph Hellwigcf8cddd2016-10-27 09:27:36 +02004315 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
Stefan Behrensff023aa2012-11-06 11:43:11 +01004316 &mapped_length, &bbio, 0);
4317 if (ret || !bbio || mapped_length < extent_len ||
4318 !bbio->stripes[0].dev->bdev) {
Zhao Lei6e9606d2015-01-20 15:11:34 +08004319 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004320 return;
4321 }
4322
4323 *extent_physical = bbio->stripes[0].physical;
4324 *extent_mirror_num = bbio->mirror_num;
4325 *extent_dev = bbio->stripes[0].dev;
Zhao Lei6e9606d2015-01-20 15:11:34 +08004326 btrfs_put_bbio(bbio);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004327}
4328
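/*
 * For dev-replace: queue a copy_nocow_pages_worker() that copies a nocow
 * extent from the page cache of its owning inodes to the target device.
 */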
Stefan Behrensff023aa2012-11-06 11:43:11 +01004329static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4330 int mirror_num, u64 physical_for_dev_replace)
4331{
4332 struct scrub_copy_nocow_ctx *nocow_ctx;
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004333 struct btrfs_fs_info *fs_info = sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004334
4335 nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4336 if (!nocow_ctx) {
4337 spin_lock(&sctx->stat_lock);
4338 sctx->stat.malloc_errors++;
4339 spin_unlock(&sctx->stat_lock);
4340 return -ENOMEM;
4341 }
4342
4343 scrub_pending_trans_workers_inc(sctx);
4344
4345 nocow_ctx->sctx = sctx;
4346 nocow_ctx->logical = logical;
4347 nocow_ctx->len = len;
4348 nocow_ctx->mirror_num = mirror_num;
4349 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
Liu Bo9e0af232014-08-15 23:36:53 +08004350 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4351 copy_nocow_pages_worker, NULL, NULL);
Josef Bacik652f25a2013-09-12 16:58:28 -04004352 INIT_LIST_HEAD(&nocow_ctx->inodes);
Qu Wenruo0339ef22014-02-28 10:46:17 +08004353 btrfs_queue_work(fs_info->scrub_nocow_workers,
4354 &nocow_ctx->work);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004355
4356 return 0;
4357}
4358
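/*
 * Callback for iterate_inodes_from_logical(): record each inode that
 * references the nocow extent on nocow_ctx->inodes for later processing.
 */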
Josef Bacik652f25a2013-09-12 16:58:28 -04004359static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4360{
4361 struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4362 struct scrub_nocow_inode *nocow_inode;
4363
4364 nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4365 if (!nocow_inode)
4366 return -ENOMEM;
4367 nocow_inode->inum = inum;
4368 nocow_inode->offset = offset;
4369 nocow_inode->root = root;
4370 list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4371 return 0;
4372}
4373
4374#define COPY_COMPLETE 1
4375
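/*
 * Worker for copy_nocow_pages(): collect all inodes referencing the nocow
 * extent and copy their pages to the dev-replace target, counting an
 * uncorrectable read error if nothing could be written.
 */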
Stefan Behrensff023aa2012-11-06 11:43:11 +01004376static void copy_nocow_pages_worker(struct btrfs_work *work)
4377{
4378 struct scrub_copy_nocow_ctx *nocow_ctx =
4379 container_of(work, struct scrub_copy_nocow_ctx, work);
4380 struct scrub_ctx *sctx = nocow_ctx->sctx;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004381 struct btrfs_fs_info *fs_info = sctx->fs_info;
4382 struct btrfs_root *root = fs_info->extent_root;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004383 u64 logical = nocow_ctx->logical;
4384 u64 len = nocow_ctx->len;
4385 int mirror_num = nocow_ctx->mirror_num;
4386 u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4387 int ret;
4388 struct btrfs_trans_handle *trans = NULL;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004389 struct btrfs_path *path;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004390 int not_written = 0;
4391
Stefan Behrensff023aa2012-11-06 11:43:11 +01004392 path = btrfs_alloc_path();
4393 if (!path) {
4394 spin_lock(&sctx->stat_lock);
4395 sctx->stat.malloc_errors++;
4396 spin_unlock(&sctx->stat_lock);
4397 not_written = 1;
4398 goto out;
4399 }
4400
4401 trans = btrfs_join_transaction(root);
4402 if (IS_ERR(trans)) {
4403 not_written = 1;
4404 goto out;
4405 }
4406
4407 ret = iterate_inodes_from_logical(logical, fs_info, path,
Josef Bacik652f25a2013-09-12 16:58:28 -04004408 record_inode_for_nocow, nocow_ctx);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004409 if (ret != 0 && ret != -ENOENT) {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04004410 btrfs_warn(fs_info,
4411 "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4412 logical, physical_for_dev_replace, len, mirror_num,
4413 ret);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004414 not_written = 1;
4415 goto out;
4416 }
4417
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04004418 btrfs_end_transaction(trans);
Josef Bacik652f25a2013-09-12 16:58:28 -04004419 trans = NULL;
4420 while (!list_empty(&nocow_ctx->inodes)) {
4421 struct scrub_nocow_inode *entry;
4422 entry = list_first_entry(&nocow_ctx->inodes,
4423 struct scrub_nocow_inode,
4424 list);
4425 list_del_init(&entry->list);
4426 ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4427 entry->root, nocow_ctx);
4428 kfree(entry);
4429 if (ret == COPY_COMPLETE) {
4430 ret = 0;
4431 break;
4432 } else if (ret) {
4433 break;
4434 }
4435 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004436out:
Josef Bacik652f25a2013-09-12 16:58:28 -04004437 while (!list_empty(&nocow_ctx->inodes)) {
4438 struct scrub_nocow_inode *entry;
4439 entry = list_first_entry(&nocow_ctx->inodes,
4440 struct scrub_nocow_inode,
4441 list);
4442 list_del_init(&entry->list);
4443 kfree(entry);
4444 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004445 if (trans && !IS_ERR(trans))
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04004446 btrfs_end_transaction(trans);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004447 if (not_written)
4448 btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4449 num_uncorrectable_read_errors);
4450
4451 btrfs_free_path(path);
4452 kfree(nocow_ctx);
4453
4454 scrub_pending_trans_workers_dec(sctx);
4455}
4456
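/*
 * Check that the given file range still maps to the logical extent being
 * copied. Returns 0 if it does, 1 if it does not (or if ordered IO is
 * still pending on the range), or a negative error code.
 */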
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004457static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
Gui Hecheng32159242014-11-10 15:36:08 +08004458 u64 logical)
4459{
4460 struct extent_state *cached_state = NULL;
4461 struct btrfs_ordered_extent *ordered;
4462 struct extent_io_tree *io_tree;
4463 struct extent_map *em;
4464 u64 lockstart = start, lockend = start + len - 1;
4465 int ret = 0;
4466
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004467 io_tree = &inode->io_tree;
Gui Hecheng32159242014-11-10 15:36:08 +08004468
David Sterbaff13db42015-12-03 14:30:40 +01004469 lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004470 ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
Gui Hecheng32159242014-11-10 15:36:08 +08004471 if (ordered) {
4472 btrfs_put_ordered_extent(ordered);
4473 ret = 1;
4474 goto out_unlock;
4475 }
4476
4477 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4478 if (IS_ERR(em)) {
4479 ret = PTR_ERR(em);
4480 goto out_unlock;
4481 }
4482
4483 /*
4484 * This extent does not actually cover the logical extent anymore,
4485 * move on to the next inode.
4486 */
4487 if (em->block_start > logical ||
4488 em->block_start + em->block_len < logical + len) {
4489 free_extent_map(em);
4490 ret = 1;
4491 goto out_unlock;
4492 }
4493 free_extent_map(em);
4494
4495out_unlock:
4496 unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4497 GFP_NOFS);
4498 return ret;
4499}
4500
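/*
 * Copy the pages backing one inode's reference to the nocow extent: read
 * them through the page cache and write them to the target position with
 * write_page_nocow().
 */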
Josef Bacik652f25a2013-09-12 16:58:28 -04004501static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4502 struct scrub_copy_nocow_ctx *nocow_ctx)
Stefan Behrensff023aa2012-11-06 11:43:11 +01004503{
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004504 struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004505 struct btrfs_key key;
Miao Xie826aa0a2013-06-27 18:50:59 +08004506 struct inode *inode;
4507 struct page *page;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004508 struct btrfs_root *local_root;
Josef Bacik652f25a2013-09-12 16:58:28 -04004509 struct extent_io_tree *io_tree;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004510 u64 physical_for_dev_replace;
Gui Hecheng32159242014-11-10 15:36:08 +08004511 u64 nocow_ctx_logical;
Josef Bacik652f25a2013-09-12 16:58:28 -04004512 u64 len = nocow_ctx->len;
Miao Xie826aa0a2013-06-27 18:50:59 +08004513 unsigned long index;
Liu Bo6f1c3602013-01-29 03:22:10 +00004514 int srcu_index;
Josef Bacik652f25a2013-09-12 16:58:28 -04004515 int ret = 0;
4516 int err = 0;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004517
4518 key.objectid = root;
4519 key.type = BTRFS_ROOT_ITEM_KEY;
4520 key.offset = (u64)-1;
Liu Bo6f1c3602013-01-29 03:22:10 +00004521
4522 srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4523
Stefan Behrensff023aa2012-11-06 11:43:11 +01004524 local_root = btrfs_read_fs_root_no_name(fs_info, &key);
Liu Bo6f1c3602013-01-29 03:22:10 +00004525 if (IS_ERR(local_root)) {
4526 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004527 return PTR_ERR(local_root);
Liu Bo6f1c3602013-01-29 03:22:10 +00004528 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004529
4530 key.type = BTRFS_INODE_ITEM_KEY;
4531 key.objectid = inum;
4532 key.offset = 0;
4533 inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
Liu Bo6f1c3602013-01-29 03:22:10 +00004534 srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004535 if (IS_ERR(inode))
4536 return PTR_ERR(inode);
4537
Miao Xieedd14002013-06-27 18:51:00 +08004538 /* Avoid truncate/dio/punch hole.. */
Al Viro59551022016-01-22 15:40:57 -05004539 inode_lock(inode);
Miao Xieedd14002013-06-27 18:51:00 +08004540 inode_dio_wait(inode);
4541
Stefan Behrensff023aa2012-11-06 11:43:11 +01004542 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
Josef Bacik652f25a2013-09-12 16:58:28 -04004543 io_tree = &BTRFS_I(inode)->io_tree;
Gui Hecheng32159242014-11-10 15:36:08 +08004544 nocow_ctx_logical = nocow_ctx->logical;
Josef Bacik652f25a2013-09-12 16:58:28 -04004545
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004546 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4547 nocow_ctx_logical);
Gui Hecheng32159242014-11-10 15:36:08 +08004548 if (ret) {
4549 ret = ret > 0 ? 0 : ret;
4550 goto out;
Josef Bacik652f25a2013-09-12 16:58:28 -04004551 }
4552
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004553 while (len >= PAGE_SIZE) {
4554 index = offset >> PAGE_SHIFT;
Miao Xieedd14002013-06-27 18:51:00 +08004555again:
Stefan Behrensff023aa2012-11-06 11:43:11 +01004556 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4557 if (!page) {
Frank Holtonefe120a2013-12-20 11:37:06 -05004558 btrfs_err(fs_info, "find_or_create_page() failed");
Stefan Behrensff023aa2012-11-06 11:43:11 +01004559 ret = -ENOMEM;
Miao Xie826aa0a2013-06-27 18:50:59 +08004560 goto out;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004561 }
4562
4563 if (PageUptodate(page)) {
4564 if (PageDirty(page))
4565 goto next_page;
4566 } else {
4567 ClearPageError(page);
Gui Hecheng32159242014-11-10 15:36:08 +08004568 err = extent_read_full_page(io_tree, page,
Josef Bacik652f25a2013-09-12 16:58:28 -04004569 btrfs_get_extent,
4570 nocow_ctx->mirror_num);
Miao Xie826aa0a2013-06-27 18:50:59 +08004571 if (err) {
4572 ret = err;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004573 goto next_page;
4574 }
Miao Xieedd14002013-06-27 18:51:00 +08004575
Miao Xie26b258912013-06-27 18:50:58 +08004576 lock_page(page);
Miao Xieedd14002013-06-27 18:51:00 +08004577 /*
4578			 * If the page has been removed from the page cache,
4579			 * the data on it is meaningless, because it may be
4580			 * stale; the new data may have been written into a new
4581			 * page in the page cache.
4582 */
4583 if (page->mapping != inode->i_mapping) {
Josef Bacik652f25a2013-09-12 16:58:28 -04004584 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004585 put_page(page);
Miao Xieedd14002013-06-27 18:51:00 +08004586 goto again;
4587 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004588 if (!PageUptodate(page)) {
4589 ret = -EIO;
4590 goto next_page;
4591 }
4592 }
Gui Hecheng32159242014-11-10 15:36:08 +08004593
Nikolay Borisov1c8c9c52017-02-20 13:51:05 +02004594 ret = check_extent_to_block(BTRFS_I(inode), offset, len,
Gui Hecheng32159242014-11-10 15:36:08 +08004595 nocow_ctx_logical);
4596 if (ret) {
4597 ret = ret > 0 ? 0 : ret;
4598 goto next_page;
4599 }
4600
Miao Xie826aa0a2013-06-27 18:50:59 +08004601 err = write_page_nocow(nocow_ctx->sctx,
4602 physical_for_dev_replace, page);
4603 if (err)
4604 ret = err;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004605next_page:
Miao Xie826aa0a2013-06-27 18:50:59 +08004606 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004607 put_page(page);
Miao Xie826aa0a2013-06-27 18:50:59 +08004608
4609 if (ret)
4610 break;
4611
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004612 offset += PAGE_SIZE;
4613 physical_for_dev_replace += PAGE_SIZE;
4614 nocow_ctx_logical += PAGE_SIZE;
4615 len -= PAGE_SIZE;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004616 }
Josef Bacik652f25a2013-09-12 16:58:28 -04004617 ret = COPY_COMPLETE;
Miao Xie826aa0a2013-06-27 18:50:59 +08004618out:
Al Viro59551022016-01-22 15:40:57 -05004619 inode_unlock(inode);
Miao Xie826aa0a2013-06-27 18:50:59 +08004620 iput(inode);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004621 return ret;
4622}
4623
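/*
 * Synchronously write one page to the given physical offset on the write
 * target device (sctx->wr_tgtdev).
 */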
4624static int write_page_nocow(struct scrub_ctx *sctx,
4625 u64 physical_for_dev_replace, struct page *page)
4626{
4627 struct bio *bio;
4628 struct btrfs_device *dev;
4629 int ret;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004630
David Sterba3fb99302017-05-16 19:10:32 +02004631 dev = sctx->wr_tgtdev;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004632 if (!dev)
4633 return -EIO;
4634 if (!dev->bdev) {
Jeff Mahoneyfb456252016-06-22 18:54:56 -04004635 btrfs_warn_rl(dev->fs_info,
David Sterba94647322015-10-08 11:01:36 +02004636 "scrub write_page_nocow(bdev == NULL) is unexpected");
Stefan Behrensff023aa2012-11-06 11:43:11 +01004637 return -EIO;
4638 }
Chris Mason9be33952013-05-17 18:30:14 -04004639 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
Stefan Behrensff023aa2012-11-06 11:43:11 +01004640 if (!bio) {
4641 spin_lock(&sctx->stat_lock);
4642 sctx->stat.malloc_errors++;
4643 spin_unlock(&sctx->stat_lock);
4644 return -ENOMEM;
4645 }
Kent Overstreet4f024f32013-10-11 15:44:27 -07004646 bio->bi_iter.bi_size = 0;
4647 bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
Stefan Behrensff023aa2012-11-06 11:43:11 +01004648 bio->bi_bdev = dev->bdev;
Christoph Hellwig70fd7612016-11-01 07:40:10 -06004649 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03004650 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
4651 if (ret != PAGE_SIZE) {
Stefan Behrensff023aa2012-11-06 11:43:11 +01004652leave_with_eio:
4653 bio_put(bio);
4654 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4655 return -EIO;
4656 }
Stefan Behrensff023aa2012-11-06 11:43:11 +01004657
Mike Christie4e49ea42016-06-05 14:31:41 -05004658 if (btrfsic_submit_bio_wait(bio))
Stefan Behrensff023aa2012-11-06 11:43:11 +01004659 goto leave_with_eio;
4660
4661 bio_put(bio);
4662 return 0;
4663}