/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and will set *fixed to true if it thinks it repaired anything.
 */
int
xfs_repair_attempt(
	struct xfs_inode		*ip,
	struct xfs_scrub_context	*sc,
	bool				*fixed)
{
	int				error = 0;

	trace_xfs_repair_attempt(ip, sc->sm, error);

	xfs_scrub_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	error = sc->ops->repair(sc);
	trace_xfs_repair_done(ip, sc->sm, error);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		*fixed = true;
		return -EAGAIN;
	case -EDEADLOCK:
	case -EAGAIN:
		/* Tell the caller to try again, having grabbed all the locks. */
		if (!sc->try_harder) {
			sc->try_harder = true;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so report back to userspace.
		 */
		return -EFSCORRUPTED;
	default:
		return error;
	}
}
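
/*
 * A minimal caller sketch (not part of this file) of how a scrub
 * dispatcher might drive the -EAGAIN convention above.  The
 * xfs_scrub_metadata_once() name is hypothetical; XFS_SCRUB_OFLAG_CORRUPT
 * is the real userspace-visible corruption flag:
 *
 *	bool			fixed = false;
 *	int			error;
 *
 *	error = xfs_scrub_metadata_once(sc);
 *	while (!error && (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
 *		error = xfs_repair_attempt(ip, sc, &fixed);
 *		if (error != -EAGAIN)
 *			break;
 *		// Re-run the scrub to verify (and report) the fix.
 *		error = xfs_scrub_metadata_once(sc);
 *	}
 */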

/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xfs_repair_failure(
	struct xfs_mount		*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}

/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xfs_repair_probe(
	struct xfs_scrub_context	*sc)
{
	int				error = 0;

	if (xfs_scrub_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xfs_repair_roll_ag_trans(
	struct xfs_scrub_context	*sc)
{
	int				error;

	/* Keep the AG header buffers locked so we can keep going. */
	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);

	/* Roll the transaction. */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out_release;

	/* Join the AG headers to the new transaction. */
	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);

	return 0;

out_release:
	/*
	 * Rolling failed, so release the hold on the buffers.  The
	 * buffers will be released during teardown on our way out
	 * of the kernel.
	 */
	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);

	return error;
}
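
/*
 * A hedged sketch of how a rebuild loop might use the helper above to
 * keep its transaction reservation from overflowing.  The record
 * iterator and insert helper names are hypothetical:
 *
 *	unsigned int		inserted = 0;
 *
 *	for_each_rebuilt_record(rec) {
 *		error = insert_one_record(sc, rec);
 *		if (error)
 *			return error;
 *		// Roll periodically; the AG headers stay locked across
 *		// the roll, so the rebuild can simply keep going.
 *		if (++inserted % 16 == 0) {
 *			error = xfs_repair_roll_ag_trans(sc);
 *			if (error)
 *				return error;
 *		}
 *	}
 */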

/*
 * Does the given AG have enough space to rebuild a btree?  Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xfs_repair_ag_has_space(
	struct xfs_perag		*pag,
	xfs_extlen_t			nr_blocks,
	enum xfs_ag_resv_type		type)
{
	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xfs_repair_calc_ag_resblks(
	struct xfs_scrub_context	*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = 0;
	xfs_extlen_t			aglen = 0;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	/* Use in-core counters if possible. */
	pag = xfs_perag_get(mp, sm->sm_agno);
	if (pag->pagi_init)
		icount = pag->pagi_count;

	/*
	 * Otherwise try to get the actual counters from disk; if not, make
	 * some worst case assumptions.
	 */
	if (icount == 0) {
		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
		if (error) {
			icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
		} else {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
	if (error) {
		aglen = mp->m_sb.sb_agblocks;
		freelen = aglen;
		usedlen = aglen;
	} else {
		aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
		freelen = pag->pagf_freeblks;
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}
	xfs_perag_put(pag);

	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		inobt_sz *= 2;
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_sb_version_hasreflink(&mp->m_sb))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}

/* Allocate a block in an AG. */
int
xfs_repair_alloc_ag_block(
	struct xfs_scrub_context	*sc,
	struct xfs_owner_info		*oinfo,
	xfs_fsblock_t			*fsbno,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_alloc_arg		args = {0};
	xfs_agblock_t			bno;
	int				error;

	switch (resv) {
	case XFS_AG_RESV_AGFL:
	case XFS_AG_RESV_RMAPBT:
		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
		if (error)
			return error;
		if (bno == NULLAGBLOCK)
			return -ENOSPC;
		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
				1, false);
		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
		if (resv == XFS_AG_RESV_RMAPBT)
			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
		return 0;
	default:
		break;
	}

	args.tp = sc->tp;
	args.mp = sc->mp;
	args.oinfo = *oinfo;
	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.type = XFS_ALLOCTYPE_THIS_AG;
	args.resv = resv;

	error = xfs_alloc_vextent(&args);
	if (error)
		return error;
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	ASSERT(args.len == 1);
	*fsbno = args.fsbno;

	return 0;
}
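
/*
 * Design note (an inference, not stated in this file): the AGFL and
 * RMAPBT reservation types above take blocks straight off the free
 * list because rmapbt blocks must be obtainable without triggering
 * further allocbt/rmapbt updates, which a regular xfs_alloc_vextent()
 * allocation could require.
 */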

/* Initialize a new AG btree root block with zero entries. */
int
xfs_repair_init_btblock(
	struct xfs_scrub_context	*sc,
	xfs_fsblock_t			fsb,
	struct xfs_buf			**bpp,
	xfs_btnum_t			btnum,
	const struct xfs_buf_ops	*ops)
{
	struct xfs_trans		*tp = sc->tp;
	struct xfs_mount		*mp = sc->mp;
	struct xfs_buf			*bp;

	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
			XFS_FSB_TO_AGBNO(mp, fsb), btnum);

	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
			XFS_FSB_TO_BB(mp, 1), 0);
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
	bp->b_ops = ops;
	*bpp = bp;

	return 0;
}
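
/*
 * Sketch of how the two helpers above might combine to start a btree
 * rebuild; the owner/btnum/buf_ops choices here are illustrative only:
 *
 *	struct xfs_owner_info	oinfo;
 *	xfs_fsblock_t		fsb;
 *	struct xfs_buf		*bp;
 *
 *	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 *	error = xfs_repair_alloc_ag_block(sc, &oinfo, &fsb,
 *			XFS_AG_RESV_NONE);
 *	if (error)
 *		return error;
 *	error = xfs_repair_init_btblock(sc, fsb, &bp, XFS_BTNUM_BNO,
 *			&xfs_allocbt_buf_ops);
 */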

/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure.  For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (exlist); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist).  We then subtract sublist from exlist to
 * derive the blocks that were used by the old btree.  These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection.  First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 * records are collected as exlist.  The bnobt records are collected as
 * sublist.  As with the other btrees we subtract sublist from exlist, and the
 * result (since the rmapbt lives in the free space) is the set of blocks from
 * the old rmapbt.
 */
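
/*
 * In pseudocode, the reaping sequence sketched above; the two
 * iteration steps are hypothetical placeholders for the real btree
 * walks, while the helper functions are the ones defined below:
 *
 *	for each rmapbt record matching the owner:
 *		xfs_repair_collect_btree_extent(sc, &exlist, fsbno, len);
 *	for each block of the surviving structures with that owner:
 *		xfs_repair_collect_btree_extent(sc, &sublist, fsbno, 1);
 *	xfs_repair_subtract_extents(sc, &exlist, &sublist);
 *	// reap everything left in exlist
 */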

/* Collect a dead btree extent for later disposal. */
int
xfs_repair_collect_btree_extent(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist,
	xfs_fsblock_t			fsbno,
	xfs_extlen_t			len)
{
	struct xfs_repair_extent	*rex;

	trace_xfs_repair_collect_btree_extent(sc->mp,
			XFS_FSB_TO_AGNO(sc->mp, fsbno),
			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);

	rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
	if (!rex)
		return -ENOMEM;

	INIT_LIST_HEAD(&rex->list);
	rex->fsbno = fsbno;
	rex->len = len;
	list_add_tail(&rex->list, &exlist->list);

	return 0;
}

/*
 * An error happened during the rebuild so the transaction will be cancelled.
 * The fs will shut down, and the administrator has to unmount and run repair.
 * Therefore, free all the memory associated with the list so we can die.
 */
void
xfs_repair_cancel_btree_extents(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist)
{
	struct xfs_repair_extent	*rex;
	struct xfs_repair_extent	*n;

	for_each_xfs_repair_extent_safe(rex, n, exlist) {
		list_del(&rex->list);
		kmem_free(rex);
	}
}

/* Compare two btree extents. */
static int
xfs_repair_btree_extent_cmp(
	void				*priv,
	struct list_head		*a,
	struct list_head		*b)
{
	struct xfs_repair_extent	*ap;
	struct xfs_repair_extent	*bp;

	ap = container_of(a, struct xfs_repair_extent, list);
	bp = container_of(b, struct xfs_repair_extent, list);

	if (ap->fsbno > bp->fsbno)
		return 1;
	if (ap->fsbno < bp->fsbno)
		return -1;
	return 0;
}

/*
 * Remove all the blocks mentioned in @sublist from the extents in @exlist.
 *
 * The intent is that callers will iterate the rmapbt for all of its records
 * for a given owner to generate @exlist; and iterate all the blocks of the
 * metadata structures that are not being rebuilt and have the same rmapbt
 * owner to generate @sublist.  This routine subtracts all the extents
 * mentioned in sublist from all the extents linked in @exlist, which leaves
 * @exlist as the list of blocks that are not accounted for, which we assume
 * are the dead blocks of the old metadata structure.  The blocks mentioned in
 * @exlist can be reaped.
 */
#define LEFT_ALIGNED	(1 << 0)
#define RIGHT_ALIGNED	(1 << 1)
int
xfs_repair_subtract_extents(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist,
	struct xfs_repair_extent_list	*sublist)
{
	struct list_head		*lp;
	struct xfs_repair_extent	*ex;
	struct xfs_repair_extent	*newex;
	struct xfs_repair_extent	*subex;
	xfs_fsblock_t			sub_fsb;
	xfs_extlen_t			sub_len;
	int				state;
	int				error = 0;

	if (list_empty(&exlist->list) || list_empty(&sublist->list))
		return 0;
	ASSERT(!list_empty(&sublist->list));

	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);

	/*
	 * Now that we've sorted both lists, we iterate exlist once, rolling
	 * forward through sublist and/or exlist as necessary until we find an
	 * overlap or reach the end of either list.  We do not reset lp to the
	 * head of exlist nor do we reset subex to the head of sublist.  The
	 * list traversal is similar to merge sort, but we're deleting
	 * instead.  In this manner we avoid O(n^2) operations.
	 */
	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
			list);
	lp = exlist->list.next;
	while (lp != &exlist->list) {
		ex = list_entry(lp, struct xfs_repair_extent, list);

		/*
		 * Advance subex and/or ex until we find a pair that
		 * intersects or we run out of extents.
		 */
		while (subex->fsbno + subex->len <= ex->fsbno) {
			if (list_is_last(&subex->list, &sublist->list))
				goto out;
			subex = list_next_entry(subex, list);
		}
		if (subex->fsbno >= ex->fsbno + ex->len) {
			lp = lp->next;
			continue;
		}

		/* trim subex to fit the extent we have */
		sub_fsb = subex->fsbno;
		sub_len = subex->len;
		if (subex->fsbno < ex->fsbno) {
			sub_len -= ex->fsbno - subex->fsbno;
			sub_fsb = ex->fsbno;
		}
		if (sub_len > ex->len)
			sub_len = ex->len;

		state = 0;
		if (sub_fsb == ex->fsbno)
			state |= LEFT_ALIGNED;
		if (sub_fsb + sub_len == ex->fsbno + ex->len)
			state |= RIGHT_ALIGNED;
		switch (state) {
		case LEFT_ALIGNED:
			/* Coincides with only the left. */
			ex->fsbno += sub_len;
			ex->len -= sub_len;
			break;
		case RIGHT_ALIGNED:
			/* Coincides with only the right. */
			ex->len -= sub_len;
			lp = lp->next;
			break;
		case LEFT_ALIGNED | RIGHT_ALIGNED:
			/* Total overlap, just delete ex. */
			lp = lp->next;
			list_del(&ex->list);
			kmem_free(ex);
			break;
		case 0:
			/*
			 * Deleting from the middle: add the new right extent
			 * and then shrink the left extent.
			 */
			newex = kmem_alloc(sizeof(struct xfs_repair_extent),
					KM_MAYFAIL);
			if (!newex) {
				error = -ENOMEM;
				goto out;
			}
			INIT_LIST_HEAD(&newex->list);
			newex->fsbno = sub_fsb + sub_len;
			newex->len = ex->fsbno + ex->len - newex->fsbno;
			list_add(&newex->list, &ex->list);
			ex->len = sub_fsb - ex->fsbno;
			lp = lp->next;
			break;
		default:
			ASSERT(0);
			break;
		}
	}

out:
	return error;
}
#undef LEFT_ALIGNED
#undef RIGHT_ALIGNED
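
/*
 * Worked example of the subtraction above, using half-open [start, end)
 * notation with made-up block numbers: if exlist holds a single extent
 * [10, 20) and sublist holds [12, 15), the switch takes the
 * "deleting from the middle" case and leaves exlist holding [10, 12)
 * and [15, 20).  If sublist instead held [10, 13), the LEFT_ALIGNED
 * case would trim the extent to [13, 20).
 */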