Blame - fs/xfs/scrub/repair.c - SHIFTPHONES/mainline/linux

blob: e3e8fba1c99cc41d64a95122aa04b847360ae71e [file] [log] [blame]

Darrick J. Wong	84d42ea	2018-05-14 06:34:36 -0700	[diff] [blame]	1	/*
				2	* Copyright (C) 2018 Oracle. All Rights Reserved.
				3	*
				4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public License
				8	* as published by the Free Software Foundation; either version 2
				9	* of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it would be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				14	* GNU General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public License
				17	* along with this program; if not, write the Free Software Foundation,
				18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
				19	*/
				20	#include "xfs.h"
				21	#include "xfs_fs.h"
				22	#include "xfs_shared.h"
				23	#include "xfs_format.h"
				24	#include "xfs_trans_resv.h"
				25	#include "xfs_mount.h"
				26	#include "xfs_defer.h"
				27	#include "xfs_btree.h"
				28	#include "xfs_bit.h"
				29	#include "xfs_log_format.h"
				30	#include "xfs_trans.h"
				31	#include "xfs_sb.h"
				32	#include "xfs_inode.h"
				33	#include "xfs_icache.h"
				34	#include "xfs_alloc.h"
				35	#include "xfs_alloc_btree.h"
				36	#include "xfs_ialloc.h"
				37	#include "xfs_ialloc_btree.h"
				38	#include "xfs_rmap.h"
				39	#include "xfs_rmap_btree.h"
				40	#include "xfs_refcount.h"
				41	#include "xfs_refcount_btree.h"
				42	#include "xfs_extent_busy.h"
				43	#include "xfs_ag_resv.h"
				44	#include "xfs_trans_space.h"
Darrick J. Wong	7e85bc6	2018-05-29 22:18:11 -0700	[diff] [blame^]	45	#include "xfs_quota.h"
Darrick J. Wong	84d42ea	2018-05-14 06:34:36 -0700	[diff] [blame]	46	#include "scrub/xfs_scrub.h"
				47	#include "scrub/scrub.h"
				48	#include "scrub/common.h"
				49	#include "scrub/trace.h"
				50	#include "scrub/repair.h"
				51
				52	/*
				53	* Attempt to repair some metadata, if the metadata is corrupt and userspace
				54	* told us to fix it. This function returns -EAGAIN to mean "re-run scrub",
				55	* and will set *fixed to true if it thinks it repaired anything.
				56	*/
				57	int
				58	xfs_repair_attempt(
				59	struct xfs_inode *ip,
				60	struct xfs_scrub_context *sc,
				61	bool *fixed)
				62	{
				63	int error = 0;
				64
				65	trace_xfs_repair_attempt(ip, sc->sm, error);
				66
				67	xfs_scrub_ag_btcur_free(&sc->sa);
				68
				69	/* Repair whatever's broken. */
				70	ASSERT(sc->ops->repair);
				71	error = sc->ops->repair(sc);
				72	trace_xfs_repair_done(ip, sc->sm, error);
				73	switch (error) {
				74	case 0:
				75	/*
				76	* Repair succeeded. Commit the fixes and perform a second
				77	* scrub so that we can tell userspace if we fixed the problem.
				78	*/
				79	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
				80	*fixed = true;
				81	return -EAGAIN;
				82	case -EDEADLOCK:
				83	case -EAGAIN:
				84	/* Tell the caller to try again having grabbed all the locks. */
				85	if (!sc->try_harder) {
				86	sc->try_harder = true;
				87	return -EAGAIN;
				88	}
				89	/*
				90	* We tried harder but still couldn't grab all the resources
				91	* we needed to fix it. The corruption has not been fixed,
				92	* so report back to userspace.
				93	*/
				94	return -EFSCORRUPTED;
				95	default:
				96	return error;
				97	}
				98	}
				99
				100	/*
				101	* Complain about unfixable problems in the filesystem. We don't log
				102	* corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
				103	* program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
				104	* administrator isn't running xfs_scrub in no-repairs mode.
				105	*
				106	* Use this helper function because _ratelimited silently declares a static
				107	* structure to track rate limiting information.
				108	*/
				109	void
				110	xfs_repair_failure(
				111	struct xfs_mount *mp)
				112	{
				113	xfs_alert_ratelimited(mp,
				114	"Corruption not fixed during online repair. Unmount and run xfs_repair.");
				115	}
				116
				117	/*
				118	* Repair probe -- userspace uses this to probe if we're willing to repair a
				119	* given mountpoint.
				120	*/
				121	int
				122	xfs_repair_probe(
				123	struct xfs_scrub_context *sc)
				124	{
				125	int error = 0;
				126
				127	if (xfs_scrub_should_terminate(sc, &error))
				128	return error;
				129
				130	return 0;
				131	}
Darrick J. Wong	0a9633f	2018-05-29 22:18:08 -0700	[diff] [blame]	132
				133	/*
				134	* Roll a transaction, keeping the AG headers locked and reinitializing
				135	* the btree cursors.
				136	*/
				137	int
				138	xfs_repair_roll_ag_trans(
				139	struct xfs_scrub_context *sc)
				140	{
				141	int error;
				142
				143	/* Keep the AG header buffers locked so we can keep going. */
				144	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
				145	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
				146	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
				147
				148	/* Roll the transaction. */
				149	error = xfs_trans_roll(&sc->tp);
				150	if (error)
				151	goto out_release;
				152
				153	/* Join AG headers to the new transaction. */
				154	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
				155	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
				156	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
				157
				158	return 0;
				159
				160	out_release:
				161	/*
				162	* Rolling failed, so release the hold on the buffers. The
				163	* buffers will be released during teardown on our way out
				164	* of the kernel.
				165	*/
				166	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
				167	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
				168	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
				169
				170	return error;
				171	}
				172
				173	/*
				174	* Does the given AG have enough space to rebuild a btree? Neither AG
				175	* reservation can be critical, and we must have enough space (factoring
				176	* in AG reservations) to construct a whole btree.
				177	*/
				178	bool
				179	xfs_repair_ag_has_space(
				180	struct xfs_perag *pag,
				181	xfs_extlen_t nr_blocks,
				182	enum xfs_ag_resv_type type)
				183	{
				184	return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
				185	!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
				186	pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
				187	}
				188
				189	/*
				190	* Figure out how many blocks to reserve for an AG repair. We calculate the
				191	* worst case estimate for the number of blocks we'd need to rebuild one of
				192	* any type of per-AG btree.
				193	*/
				194	xfs_extlen_t
				195	xfs_repair_calc_ag_resblks(
				196	struct xfs_scrub_context *sc)
				197	{
				198	struct xfs_mount *mp = sc->mp;
				199	struct xfs_scrub_metadata *sm = sc->sm;
				200	struct xfs_perag *pag;
				201	struct xfs_buf *bp;
				202	xfs_agino_t icount = 0;
				203	xfs_extlen_t aglen = 0;
				204	xfs_extlen_t usedlen;
				205	xfs_extlen_t freelen;
				206	xfs_extlen_t bnobt_sz;
				207	xfs_extlen_t inobt_sz;
				208	xfs_extlen_t rmapbt_sz;
				209	xfs_extlen_t refcbt_sz;
				210	int error;
				211
				212	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
				213	return 0;
				214
				215	/* Use in-core counters if possible. */
				216	pag = xfs_perag_get(mp, sm->sm_agno);
				217	if (pag->pagi_init)
				218	icount = pag->pagi_count;
				219
				220	/*
				221	* Otherwise try to get the actual counters from disk; if not, make
				222	* some worst case assumptions.
				223	*/
				224	if (icount == 0) {
				225	error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
				226	if (error) {
				227	icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
				228	} else {
				229	icount = pag->pagi_count;
				230	xfs_buf_relse(bp);
				231	}
				232	}
				233
				234	/* Now grab the block counters from the AGF. */
				235	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
				236	if (error) {
				237	aglen = mp->m_sb.sb_agblocks;
				238	freelen = aglen;
				239	usedlen = aglen;
				240	} else {
				241	aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
				242	freelen = pag->pagf_freeblks;
				243	usedlen = aglen - freelen;
				244	xfs_buf_relse(bp);
				245	}
				246	xfs_perag_put(pag);
				247
				248	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
				249	freelen, usedlen);
				250
				251	/*
				252	* Figure out how many blocks we'd need worst case to rebuild
				253	* each type of btree. Note that we can only rebuild the
				254	* bnobt/cntbt or inobt/finobt as pairs.
				255	*/
				256	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
				257	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
				258	inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				259	XFS_INODES_PER_HOLEMASK_BIT);
				260	else
				261	inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				262	XFS_INODES_PER_CHUNK);
				263	if (xfs_sb_version_hasfinobt(&mp->m_sb))
				264	inobt_sz *= 2;
				265	if (xfs_sb_version_hasreflink(&mp->m_sb))
				266	refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
				267	else
				268	refcbt_sz = 0;
				269	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
				270	/*
				271	* Guess how many blocks we need to rebuild the rmapbt.
				272	* For non-reflink filesystems we can't have more records than
				273	* used blocks. However, with reflink it's possible to have
				274	* more than one rmap record per AG block. We don't know how
				275	* many rmaps there could be in the AG, so we start off with
				276	* what we hope is an generous over-estimation.
				277	*/
				278	if (xfs_sb_version_hasreflink(&mp->m_sb))
				279	rmapbt_sz = xfs_rmapbt_calc_size(mp,
				280	(unsigned long long)aglen * 2);
				281	else
				282	rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
				283	} else {
				284	rmapbt_sz = 0;
				285	}
				286
				287	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
				288	inobt_sz, rmapbt_sz, refcbt_sz);
				289
				290	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
				291	}
Darrick J. Wong	73d6b42	2018-05-29 22:18:09 -0700	[diff] [blame]	292
				293	/* Allocate a block in an AG. */
				294	int
				295	xfs_repair_alloc_ag_block(
				296	struct xfs_scrub_context *sc,
				297	struct xfs_owner_info *oinfo,
				298	xfs_fsblock_t *fsbno,
				299	enum xfs_ag_resv_type resv)
				300	{
				301	struct xfs_alloc_arg args = {0};
				302	xfs_agblock_t bno;
				303	int error;
				304
				305	switch (resv) {
				306	case XFS_AG_RESV_AGFL:
				307	case XFS_AG_RESV_RMAPBT:
				308	error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
				309	if (error)
				310	return error;
				311	if (bno == NULLAGBLOCK)
				312	return -ENOSPC;
				313	xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
				314	1, false);
				315	*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
				316	if (resv == XFS_AG_RESV_RMAPBT)
				317	xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
				318	return 0;
				319	default:
				320	break;
				321	}
				322
				323	args.tp = sc->tp;
				324	args.mp = sc->mp;
				325	args.oinfo = *oinfo;
				326	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
				327	args.minlen = 1;
				328	args.maxlen = 1;
				329	args.prod = 1;
				330	args.type = XFS_ALLOCTYPE_THIS_AG;
				331	args.resv = resv;
				332
				333	error = xfs_alloc_vextent(&args);
				334	if (error)
				335	return error;
				336	if (args.fsbno == NULLFSBLOCK)
				337	return -ENOSPC;
				338	ASSERT(args.len == 1);
				339	*fsbno = args.fsbno;
				340
				341	return 0;
				342	}
				343
				344	/* Initialize a new AG btree root block with zero entries. */
				345	int
				346	xfs_repair_init_btblock(
				347	struct xfs_scrub_context *sc,
				348	xfs_fsblock_t fsb,
				349	struct xfs_buf **bpp,
				350	xfs_btnum_t btnum,
				351	const struct xfs_buf_ops *ops)
				352	{
				353	struct xfs_trans *tp = sc->tp;
				354	struct xfs_mount *mp = sc->mp;
				355	struct xfs_buf *bp;
				356
				357	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
				358	XFS_FSB_TO_AGBNO(mp, fsb), btnum);
				359
				360	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
				361	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
				362	XFS_FSB_TO_BB(mp, 1), 0);
				363	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
				364	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
				365	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
				366	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
				367	bp->b_ops = ops;
				368	*bpp = bp;
				369
				370	return 0;
				371	}
Darrick J. Wong	64a39d8	2018-05-29 22:18:09 -0700	[diff] [blame]	372
				373	/*
				374	* Reconstructing per-AG Btrees
				375	*
				376	* When a space btree is corrupt, we don't bother trying to fix it. Instead,
				377	* we scan secondary space metadata to derive the records that should be in
				378	* the damaged btree, initialize a fresh btree root, and insert the records.
				379	* Note that for rebuilding the rmapbt we scan all the primary data to
				380	* generate the new records.
				381	*
				382	* However, that leaves the matter of removing all the metadata describing the
				383	* old broken structure. For primary metadata we use the rmap data to collect
				384	* every extent with a matching rmap owner (exlist); we then iterate all other
				385	* metadata structures with the same rmap owner to collect the extents that
				386	* cannot be removed (sublist). We then subtract sublist from exlist to
				387	* derive the blocks that were used by the old btree. These blocks can be
				388	* reaped.
				389	*
				390	* For rmapbt reconstructions we must use different tactics for extent
				391	* collection. First we iterate all primary metadata (this excludes the old
				392	* rmapbt, obviously) to generate new rmap records. The gaps in the rmap
				393	* records are collected as exlist. The bnobt records are collected as
				394	* sublist. As with the other btrees we subtract sublist from exlist, and the
				395	* result (since the rmapbt lives in the free space) are the blocks from the
				396	* old rmapbt.
				397	*/
				398
				399	/* Collect a dead btree extent for later disposal. */
				400	int
				401	xfs_repair_collect_btree_extent(
				402	struct xfs_scrub_context *sc,
				403	struct xfs_repair_extent_list *exlist,
				404	xfs_fsblock_t fsbno,
				405	xfs_extlen_t len)
				406	{
				407	struct xfs_repair_extent *rex;
				408
				409	trace_xfs_repair_collect_btree_extent(sc->mp,
				410	XFS_FSB_TO_AGNO(sc->mp, fsbno),
				411	XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
				412
				413	rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
				414	if (!rex)
				415	return -ENOMEM;
				416
				417	INIT_LIST_HEAD(&rex->list);
				418	rex->fsbno = fsbno;
				419	rex->len = len;
				420	list_add_tail(&rex->list, &exlist->list);
				421
				422	return 0;
				423	}
				424
				425	/*
				426	* An error happened during the rebuild so the transaction will be cancelled.
				427	* The fs will shut down, and the administrator has to unmount and run repair.
				428	* Therefore, free all the memory associated with the list so we can die.
				429	*/
				430	void
				431	xfs_repair_cancel_btree_extents(
				432	struct xfs_scrub_context *sc,
				433	struct xfs_repair_extent_list *exlist)
				434	{
				435	struct xfs_repair_extent *rex;
				436	struct xfs_repair_extent *n;
				437
				438	for_each_xfs_repair_extent_safe(rex, n, exlist) {
				439	list_del(&rex->list);
				440	kmem_free(rex);
				441	}
				442	}
				443
				444	/* Compare two btree extents. */
				445	static int
				446	xfs_repair_btree_extent_cmp(
				447	void *priv,
				448	struct list_head *a,
				449	struct list_head *b)
				450	{
				451	struct xfs_repair_extent *ap;
				452	struct xfs_repair_extent *bp;
				453
				454	ap = container_of(a, struct xfs_repair_extent, list);
				455	bp = container_of(b, struct xfs_repair_extent, list);
				456
				457	if (ap->fsbno > bp->fsbno)
				458	return 1;
				459	if (ap->fsbno < bp->fsbno)
				460	return -1;
				461	return 0;
				462	}
				463
				464	/*
				465	* Remove all the blocks mentioned in @sublist from the extents in @exlist.
				466	*
				467	* The intent is that callers will iterate the rmapbt for all of its records
				468	* for a given owner to generate @exlist; and iterate all the blocks of the
				469	* metadata structures that are not being rebuilt and have the same rmapbt
				470	* owner to generate @sublist. This routine subtracts all the extents
				471	* mentioned in sublist from all the extents linked in @exlist, which leaves
				472	* @exlist as the list of blocks that are not accounted for, which we assume
				473	* are the dead blocks of the old metadata structure. The blocks mentioned in
				474	* @exlist can be reaped.
				475	*/
				476	#define LEFT_ALIGNED (1 << 0)
				477	#define RIGHT_ALIGNED (1 << 1)
				478	int
				479	xfs_repair_subtract_extents(
				480	struct xfs_scrub_context *sc,
				481	struct xfs_repair_extent_list *exlist,
				482	struct xfs_repair_extent_list *sublist)
				483	{
				484	struct list_head *lp;
				485	struct xfs_repair_extent *ex;
				486	struct xfs_repair_extent *newex;
				487	struct xfs_repair_extent *subex;
				488	xfs_fsblock_t sub_fsb;
				489	xfs_extlen_t sub_len;
				490	int state;
				491	int error = 0;
				492
				493	if (list_empty(&exlist->list) \|\| list_empty(&sublist->list))
				494	return 0;
				495	ASSERT(!list_empty(&sublist->list));
				496
				497	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
				498	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
				499
				500	/*
				501	* Now that we've sorted both lists, we iterate exlist once, rolling
				502	* forward through sublist and/or exlist as necessary until we find an
				503	* overlap or reach the end of either list. We do not reset lp to the
				504	* head of exlist nor do we reset subex to the head of sublist. The
				505	* list traversal is similar to merge sort, but we're deleting
				506	* instead. In this manner we avoid O(n^2) operations.
				507	*/
				508	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
				509	list);
				510	lp = exlist->list.next;
				511	while (lp != &exlist->list) {
				512	ex = list_entry(lp, struct xfs_repair_extent, list);
				513
				514	/*
				515	* Advance subex and/or ex until we find a pair that
				516	* intersect or we run out of extents.
				517	*/
				518	while (subex->fsbno + subex->len <= ex->fsbno) {
				519	if (list_is_last(&subex->list, &sublist->list))
				520	goto out;
				521	subex = list_next_entry(subex, list);
				522	}
				523	if (subex->fsbno >= ex->fsbno + ex->len) {
				524	lp = lp->next;
				525	continue;
				526	}
				527
				528	/* trim subex to fit the extent we have */
				529	sub_fsb = subex->fsbno;
				530	sub_len = subex->len;
				531	if (subex->fsbno < ex->fsbno) {
				532	sub_len -= ex->fsbno - subex->fsbno;
				533	sub_fsb = ex->fsbno;
				534	}
				535	if (sub_len > ex->len)
				536	sub_len = ex->len;
				537
				538	state = 0;
				539	if (sub_fsb == ex->fsbno)
				540	state \|= LEFT_ALIGNED;
				541	if (sub_fsb + sub_len == ex->fsbno + ex->len)
				542	state \|= RIGHT_ALIGNED;
				543	switch (state) {
				544	case LEFT_ALIGNED:
				545	/* Coincides with only the left. */
				546	ex->fsbno += sub_len;
				547	ex->len -= sub_len;
				548	break;
				549	case RIGHT_ALIGNED:
				550	/* Coincides with only the right. */
				551	ex->len -= sub_len;
				552	lp = lp->next;
				553	break;
				554	case LEFT_ALIGNED \| RIGHT_ALIGNED:
				555	/* Total overlap, just delete ex. */
				556	lp = lp->next;
				557	list_del(&ex->list);
				558	kmem_free(ex);
				559	break;
				560	case 0:
				561	/*
				562	* Deleting from the middle: add the new right extent
				563	* and then shrink the left extent.
				564	*/
				565	newex = kmem_alloc(sizeof(struct xfs_repair_extent),
				566	KM_MAYFAIL);
				567	if (!newex) {
				568	error = -ENOMEM;
				569	goto out;
				570	}
				571	INIT_LIST_HEAD(&newex->list);
				572	newex->fsbno = sub_fsb + sub_len;
				573	newex->len = ex->fsbno + ex->len - newex->fsbno;
				574	list_add(&newex->list, &ex->list);
				575	ex->len = sub_fsb - ex->fsbno;
				576	lp = lp->next;
				577	break;
				578	default:
				579	ASSERT(0);
				580	break;
				581	}
				582	}
				583
				584	out:
				585	return error;
				586	}
				587	#undef LEFT_ALIGNED
				588	#undef RIGHT_ALIGNED
Darrick J. Wong	12c6510e	2018-05-29 22:18:10 -0700	[diff] [blame]	589
				590	/*
				591	* Disposal of Blocks from Old per-AG Btrees
				592	*
				593	* Now that we've constructed a new btree to replace the damaged one, we want
				594	* to dispose of the blocks that (we think) the old btree was using.
				595	* Previously, we used the rmapbt to collect the extents (exlist) with the
				596	* rmap owner corresponding to the tree we rebuilt, collected extents for any
				597	* blocks with the same rmap owner that are owned by another data structure
				598	* (sublist), and subtracted sublist from exlist. In theory the extents
				599	* remaining in exlist are the old btree's blocks.
				600	*
				601	* Unfortunately, it's possible that the btree was crosslinked with other
				602	* blocks on disk. The rmap data can tell us if there are multiple owners, so
				603	* if the rmapbt says there is an owner of this block other than @oinfo, then
				604	* the block is crosslinked. Remove the reverse mapping and continue.
				605	*
				606	* If there is one rmap record, we can free the block, which removes the
				607	* reverse mapping but doesn't add the block to the free space. Our repair
				608	* strategy is to hope the other metadata objects crosslinked on this block
				609	* will be rebuilt (atop different blocks), thereby removing all the cross
				610	* links.
				611	*
				612	* If there are no rmap records at all, we also free the block. If the btree
				613	* being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
				614	* supposed to be a rmap record and everything is ok. For other btrees there
				615	* had to have been an rmap entry for the block to have ended up on @exlist,
				616	* so if it's gone now there's something wrong and the fs will shut down.
				617	*
				618	* Note: If there are multiple rmap records with only the same rmap owner as
				619	* the btree we're trying to rebuild and the block is indeed owned by another
				620	* data structure with the same rmap owner, then the block will be in sublist
				621	* and therefore doesn't need disposal. If there are multiple rmap records
				622	* with only the same rmap owner but the block is not owned by something with
				623	* the same rmap owner, the block will be freed.
				624	*
				625	* The caller is responsible for locking the AG headers for the entire rebuild
				626	* operation so that nothing else can sneak in and change the AG state while
				627	* we're not looking. We also assume that the caller already invalidated any
				628	* buffers associated with @exlist.
				629	*/
				630
				631	/*
				632	* Invalidate buffers for per-AG btree blocks we're dumping. This function
				633	* is not intended for use with file data repairs; we have bunmapi for that.
				634	*/
				635	int
				636	xfs_repair_invalidate_blocks(
				637	struct xfs_scrub_context *sc,
				638	struct xfs_repair_extent_list *exlist)
				639	{
				640	struct xfs_repair_extent *rex;
				641	struct xfs_repair_extent *n;
				642	struct xfs_buf *bp;
				643	xfs_fsblock_t fsbno;
				644	xfs_agblock_t i;
				645
				646	/*
				647	* For each block in each extent, see if there's an incore buffer for
				648	* exactly that block; if so, invalidate it. The buffer cache only
				649	* lets us look for one buffer at a time, so we have to look one block
				650	* at a time. Avoid invalidating AG headers and post-EOFS blocks
				651	* because we never own those; and if we can't TRYLOCK the buffer we
				652	* assume it's owned by someone else.
				653	*/
				654	for_each_xfs_repair_extent_safe(rex, n, exlist) {
				655	for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) {
				656	/* Skip AG headers and post-EOFS blocks */
				657	if (!xfs_verify_fsbno(sc->mp, fsbno))
				658	continue;
				659	bp = xfs_buf_incore(sc->mp->m_ddev_targp,
				660	XFS_FSB_TO_DADDR(sc->mp, fsbno),
				661	XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
				662	if (bp) {
				663	xfs_trans_bjoin(sc->tp, bp);
				664	xfs_trans_binval(sc->tp, bp);
				665	}
				666	}
				667	}
				668
				669	return 0;
				670	}
				671
				672	/* Ensure the freelist is the correct size. */
				673	int
				674	xfs_repair_fix_freelist(
				675	struct xfs_scrub_context *sc,
				676	bool can_shrink)
				677	{
				678	struct xfs_alloc_arg args = {0};
				679
				680	args.mp = sc->mp;
				681	args.tp = sc->tp;
				682	args.agno = sc->sa.agno;
				683	args.alignment = 1;
				684	args.pag = sc->sa.pag;
				685
				686	return xfs_alloc_fix_freelist(&args,
				687	can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
				688	}
				689
				690	/*
				691	* Put a block back on the AGFL.
				692	*/
				693	STATIC int
				694	xfs_repair_put_freelist(
				695	struct xfs_scrub_context *sc,
				696	xfs_agblock_t agbno)
				697	{
				698	struct xfs_owner_info oinfo;
				699	int error;
				700
				701	/* Make sure there's space on the freelist. */
				702	error = xfs_repair_fix_freelist(sc, true);
				703	if (error)
				704	return error;
				705
				706	/*
				707	* Since we're "freeing" a lost block onto the AGFL, we have to
				708	* create an rmap for the block prior to merging it or else other
				709	* parts will break.
				710	*/
				711	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
				712	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
				713	&oinfo);
				714	if (error)
				715	return error;
				716
				717	/* Put the block on the AGFL. */
				718	error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
				719	agbno, 0);
				720	if (error)
				721	return error;
				722	xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
				723	XFS_EXTENT_BUSY_SKIP_DISCARD);
				724
				725	return 0;
				726	}
				727
				728	/* Dispose of a single metadata block. */
				729	STATIC int
				730	xfs_repair_dispose_btree_block(
				731	struct xfs_scrub_context *sc,
				732	xfs_fsblock_t fsbno,
				733	struct xfs_owner_info *oinfo,
				734	enum xfs_ag_resv_type resv)
				735	{
				736	struct xfs_btree_cur *cur;
				737	struct xfs_buf *agf_bp = NULL;
				738	xfs_agnumber_t agno;
				739	xfs_agblock_t agbno;
				740	bool has_other_rmap;
				741	int error;
				742
				743	agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
				744	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
				745
				746	/*
				747	* If we are repairing per-inode metadata, we need to read in the AGF
				748	* buffer. Otherwise, we're repairing a per-AG structure, so reuse
				749	* the AGF buffer that the setup functions already grabbed.
				750	*/
				751	if (sc->ip) {
				752	error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
				753	if (error)
				754	return error;
				755	if (!agf_bp)
				756	return -ENOMEM;
				757	} else {
				758	agf_bp = sc->sa.agf_bp;
				759	}
				760	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
				761
				762	/* Can we find any other rmappings? */
				763	error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
				764	if (error)
				765	goto out_cur;
				766	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
				767
				768	/*
				769	* If there are other rmappings, this block is cross linked and must
				770	* not be freed. Remove the reverse mapping and move on. Otherwise,
				771	* we were the only owner of the block, so free the extent, which will
				772	* also remove the rmap.
				773	*
				774	* XXX: XFS doesn't support detecting the case where a single block
				775	* metadata structure is crosslinked with a multi-block structure
				776	* because the buffer cache doesn't detect aliasing problems, so we
				777	* can't fix 100% of crosslinking problems (yet). The verifiers will
				778	* blow on writeout, the filesystem will shut down, and the admin gets
				779	* to run xfs_repair.
				780	*/
				781	if (has_other_rmap)
				782	error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
				783	else if (resv == XFS_AG_RESV_AGFL)
				784	error = xfs_repair_put_freelist(sc, agbno);
				785	else
				786	error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
				787	if (agf_bp != sc->sa.agf_bp)
				788	xfs_trans_brelse(sc->tp, agf_bp);
				789	if (error)
				790	return error;
				791
				792	if (sc->ip)
				793	return xfs_trans_roll_inode(&sc->tp, sc->ip);
				794	return xfs_repair_roll_ag_trans(sc);
				795
				796	out_cur:
				797	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
				798	if (agf_bp != sc->sa.agf_bp)
				799	xfs_trans_brelse(sc->tp, agf_bp);
				800	return error;
				801	}
				802
				803	/* Dispose of btree blocks from an old per-AG btree. */
				804	int
				805	xfs_repair_reap_btree_extents(
				806	struct xfs_scrub_context *sc,
				807	struct xfs_repair_extent_list *exlist,
				808	struct xfs_owner_info *oinfo,
				809	enum xfs_ag_resv_type type)
				810	{
				811	struct xfs_repair_extent *rex;
				812	struct xfs_repair_extent *n;
				813	int error = 0;
				814
				815	ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
				816
				817	/* Dispose of every block from the old btree. */
				818	for_each_xfs_repair_extent_safe(rex, n, exlist) {
				819	ASSERT(sc->ip != NULL \|\|
				820	XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
				821
				822	trace_xfs_repair_dispose_btree_extent(sc->mp,
				823	XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
				824	XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
				825
				826	for (; rex->len > 0; rex->len--, rex->fsbno++) {
				827	error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
				828	oinfo, type);
				829	if (error)
				830	goto out;
				831	}
				832	list_del(&rex->list);
				833	kmem_free(rex);
				834	}
				835
				836	out:
				837	xfs_repair_cancel_btree_extents(sc, exlist);
				838	return error;
				839	}
Darrick J. Wong	04a2b7b	2018-05-29 22:18:10 -0700	[diff] [blame]	840
				841	/*
				842	* Finding per-AG Btree Roots for AGF/AGI Reconstruction
				843	*
				844	* If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
				845	* the AG headers by using the rmap data to rummage through the AG looking for
				846	* btree roots. This is not guaranteed to work if the AG is heavily damaged
				847	* or the rmap data are corrupt.
				848	*
				849	* Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
				850	* buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
				851	* AGI is being rebuilt. It must maintain these locks until it's safe for
				852	* other threads to change the btrees' shapes. The caller provides
				853	* information about the btrees to look for by passing in an array of
				854	* xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
				855	* The (root, height) fields will be set on return if anything is found. The
				856	* last element of the array should have a NULL buf_ops to mark the end of the
				857	* array.
				858	*
				859	* For every rmapbt record matching any of the rmap owners in btree_info,
				860	* read each block referenced by the rmap record. If the block is a btree
				861	* block from this filesystem matching any of the magic numbers and has a
				862	* level higher than what we've already seen, remember the block and the
				863	* height of the tree required to have such a block. When the call completes,
				864	* we return the highest block we've found for each btree description; those
				865	* should be the roots.
				866	*/
				867
				/* State shared with the rmap walker while searching an AG for btree roots. */
				struct xfs_repair_findroot {
					/* Scrub context; supplies the mount, transaction and AG number. */
					struct xfs_scrub_context	*sc;

					/* AGFL buffer that is walked to filter out free-list blocks. */
					struct xfs_buf			*agfl_bp;

					/* AGF header, passed along to the AGFL walk. */
					struct xfs_agf			*agf;

					/* Btree descriptions to match; ends at an entry with NULL buf_ops. */
					struct xfs_repair_find_ag_btree	*btree_info;
				};
				874
				875	/* See if our block is in the AGFL. */
				876	STATIC int
				877	xfs_repair_findroot_agfl_walk(
				878	struct xfs_mount *mp,
				879	xfs_agblock_t bno,
				880	void *priv)
				881	{
				882	xfs_agblock_t *agbno = priv;
				883
				884	return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
				885	}
				886
				887	/* Does this block match the btree information passed in? */
				888	STATIC int
				889	xfs_repair_findroot_block(
				890	struct xfs_repair_findroot *ri,
				891	struct xfs_repair_find_ag_btree *fab,
				892	uint64_t owner,
				893	xfs_agblock_t agbno,
				894	bool *found_it)
				895	{
				896	struct xfs_mount *mp = ri->sc->mp;
				897	struct xfs_buf *bp;
				898	struct xfs_btree_block *btblock;
				899	xfs_daddr_t daddr;
				900	int error;
				901
				902	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
				903
				904	/*
				905	* Blocks in the AGFL have stale contents that might just happen to
				906	* have a matching magic and uuid. We don't want to pull these blocks
				907	* in as part of a tree root, so we have to filter out the AGFL stuff
				908	* here. If the AGFL looks insane we'll just refuse to repair.
				909	*/
				910	if (owner == XFS_RMAP_OWN_AG) {
				911	error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				912	xfs_repair_findroot_agfl_walk, &agbno);
				913	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
				914	return 0;
				915	if (error)
				916	return error;
				917	}
				918
				919	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
				920	mp->m_bsize, 0, &bp, NULL);
				921	if (error)
				922	return error;
				923
				924	/*
				925	* Does this look like a block matching our fs and higher than any
				926	* other block we've found so far? If so, reattach buffer verifiers
				927	* so the AIL won't complain if the buffer is also dirty.
				928	*/
				929	btblock = XFS_BUF_TO_BLOCK(bp);
				930	if (be32_to_cpu(btblock->bb_magic) != fab->magic)
				931	goto out;
				932	if (xfs_sb_version_hascrc(&mp->m_sb) &&
				933	!uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
				934	goto out;
				935	bp->b_ops = fab->buf_ops;
				936
				937	/* Ignore this block if it's lower in the tree than we've seen. */
				938	if (fab->root != NULLAGBLOCK &&
				939	xfs_btree_get_level(btblock) < fab->height)
				940	goto out;
				941
				942	/* Make sure we pass the verifiers. */
				943	bp->b_ops->verify_read(bp);
				944	if (bp->b_error)
				945	goto out;
				946	fab->root = agbno;
				947	fab->height = xfs_btree_get_level(btblock) + 1;
				948	*found_it = true;
				949
				950	trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
				951	be32_to_cpu(btblock->bb_magic), fab->height - 1);
				952	out:
				953	xfs_trans_brelse(ri->sc->tp, bp);
				954	return error;
				955	}
				956
				957	/*
				958	* Do any of the blocks in this rmap record match one of the btrees we're
				959	* looking for?
				960	*/
				961	STATIC int
				962	xfs_repair_findroot_rmap(
				963	struct xfs_btree_cur *cur,
				964	struct xfs_rmap_irec *rec,
				965	void *priv)
				966	{
				967	struct xfs_repair_findroot *ri = priv;
				968	struct xfs_repair_find_ag_btree *fab;
				969	xfs_agblock_t b;
				970	bool found_it;
				971	int error = 0;
				972
				973	/* Ignore anything that isn't AG metadata. */
				974	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
				975	return 0;
				976
				977	/* Otherwise scan each block + btree type. */
				978	for (b = 0; b < rec->rm_blockcount; b++) {
				979	found_it = false;
				980	for (fab = ri->btree_info; fab->buf_ops; fab++) {
				981	if (rec->rm_owner != fab->rmap_owner)
				982	continue;
				983	error = xfs_repair_findroot_block(ri, fab,
				984	rec->rm_owner, rec->rm_startblock + b,
				985	&found_it);
				986	if (error)
				987	return error;
				988	if (found_it)
				989	break;
				990	}
				991	}
				992
				993	return 0;
				994	}
				995
				996	/* Find the roots of the per-AG btrees described in btree_info. */
				997	int
				998	xfs_repair_find_ag_btree_roots(
				999	struct xfs_scrub_context *sc,
				1000	struct xfs_buf *agf_bp,
				1001	struct xfs_repair_find_ag_btree *btree_info,
				1002	struct xfs_buf *agfl_bp)
				1003	{
				1004	struct xfs_mount *mp = sc->mp;
				1005	struct xfs_repair_findroot ri;
				1006	struct xfs_repair_find_ag_btree *fab;
				1007	struct xfs_btree_cur *cur;
				1008	int error;
				1009
				1010	ASSERT(xfs_buf_islocked(agf_bp));
				1011	ASSERT(agfl_bp == NULL \|\| xfs_buf_islocked(agfl_bp));
				1012
				1013	ri.sc = sc;
				1014	ri.btree_info = btree_info;
				1015	ri.agf = XFS_BUF_TO_AGF(agf_bp);
				1016	ri.agfl_bp = agfl_bp;
				1017	for (fab = btree_info; fab->buf_ops; fab++) {
				1018	ASSERT(agfl_bp \|\| fab->rmap_owner != XFS_RMAP_OWN_AG);
				1019	ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
				1020	fab->root = NULLAGBLOCK;
				1021	fab->height = 0;
				1022	}
				1023
				1024	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
				1025	error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
				1026	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
				1027
				1028	return error;
				1029	}
Darrick J. Wong	7e85bc6	2018-05-29 22:18:11 -0700	[diff] [blame^]	1030
				1031	/* Force a quotacheck the next time we mount. */
				1032	void
				1033	xfs_repair_force_quotacheck(
				1034	struct xfs_scrub_context *sc,
				1035	uint dqtype)
				1036	{
				1037	uint flag;
				1038
				1039	flag = xfs_quota_chkd_flag(dqtype);
				1040	if (!(flag & sc->mp->m_qflags))
				1041	return;
				1042
				1043	sc->mp->m_qflags &= ~flag;
				1044	spin_lock(&sc->mp->m_sb_lock);
				1045	sc->mp->m_sb.sb_qflags &= ~flag;
				1046	spin_unlock(&sc->mp->m_sb_lock);
				1047	xfs_log_sb(sc->tp);
				1048	}
				1049
				1050	/*
				1051	* Attach dquots to this inode, or schedule quotacheck to fix them.
				1052	*
				1053	* This function ensures that the appropriate dquots are attached to an inode.
				1054	* We cannot allow the dquot code to allocate an on-disk dquot block here
				1055	* because we're already in transaction context with the inode locked. The
				1056	* on-disk dquot should already exist anyway. If the quota code signals
				1057	* corruption or missing quota information, schedule quotacheck, which will
				1058	* repair corruptions in the quota metadata.
				1059	*/
				1060	int
				1061	xfs_repair_ino_dqattach(
				1062	struct xfs_scrub_context *sc)
				1063	{
				1064	int error;
				1065
				1066	error = xfs_qm_dqattach_locked(sc->ip, false);
				1067	switch (error) {
				1068	case -EFSBADCRC:
				1069	case -EFSCORRUPTED:
				1070	case -ENOENT:
				1071	xfs_err_ratelimited(sc->mp,
				1072	"inode %llu repair encountered quota error %d, quotacheck forced.",
				1073	(unsigned long long)sc->ip->i_ino, error);
				1074	if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
				1075	xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
				1076	if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
				1077	xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
				1078	if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
				1079	xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
				1080	/* fall through */
				1081	case -ESRCH:
				1082	error = 0;
				1083	break;
				1084	default:
				1085	break;
				1086	}
				1087
				1088	return error;
				1089	}