/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"

/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and will set *fixed to true if it thinks it repaired anything.
 */
int
xfs_repair_attempt(
	struct xfs_inode		*ip,
	struct xfs_scrub_context	*sc,
	bool				*fixed)
{
	int				error = 0;

	trace_xfs_repair_attempt(ip, sc->sm, error);

	xfs_scrub_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	error = sc->ops->repair(sc);
	trace_xfs_repair_done(ip, sc->sm, error);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		*fixed = true;
		return -EAGAIN;
	case -EDEADLOCK:
	case -EAGAIN:
		/* Tell the caller to try again, having grabbed all the locks. */
		if (!sc->try_harder) {
			sc->try_harder = true;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so report back to userspace.
		 */
		return -EFSCORRUPTED;
	default:
		return error;
	}
}
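
/*
 * A minimal caller sketch (not part of this file) of how a scrub
 * dispatcher might drive the -EAGAIN convention above.  The
 * xfs_scrub_metadata_once() name is hypothetical; XFS_SCRUB_OFLAG_CORRUPT
 * is the real userspace-visible corruption flag:
 *
 *	bool			fixed = false;
 *	int			error;
 *
 *	error = xfs_scrub_metadata_once(sc);
 *	while (!error && (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
 *		error = xfs_repair_attempt(ip, sc, &fixed);
 *		if (error != -EAGAIN)
 *			break;
 *		// Re-run the scrub to verify (and report) the fix.
 *		error = xfs_scrub_metadata_once(sc);
 *	}
 */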

/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xfs_repair_failure(
	struct xfs_mount		*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}

/*
 * Repair probe -- userspace uses this to probe if we're willing to repair a
 * given mountpoint.
 */
int
xfs_repair_probe(
	struct xfs_scrub_context	*sc)
{
	int				error = 0;

	if (xfs_scrub_should_terminate(sc, &error))
		return error;

	return 0;
}

/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xfs_repair_roll_ag_trans(
	struct xfs_scrub_context	*sc)
{
	int				error;

	/* Keep the AG header buffers locked so we can keep going. */
	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);

	/* Roll the transaction. */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		goto out_release;

	/* Join the AG headers to the new transaction. */
	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);

	return 0;

out_release:
	/*
	 * Rolling failed, so release the hold on the buffers.  The
	 * buffers will be released during teardown on our way out
	 * of the kernel.
	 */
	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);

	return error;
}
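
/*
 * A hedged sketch of how a rebuild loop might use the helper above to
 * keep its transaction reservation from overflowing.  The record
 * iterator and insert helper names are hypothetical:
 *
 *	unsigned int		inserted = 0;
 *
 *	for_each_rebuilt_record(rec) {
 *		error = insert_one_record(sc, rec);
 *		if (error)
 *			return error;
 *		// Roll periodically; the AG headers stay locked across
 *		// the roll, so the rebuild can simply keep going.
 *		if (++inserted % 16 == 0) {
 *			error = xfs_repair_roll_ag_trans(sc);
 *			if (error)
 *				return error;
 *		}
 *	}
 */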

/*
 * Does the given AG have enough space to rebuild a btree?  Neither AG
 * reservation can be critical, and we must have enough space (factoring
 * in AG reservations) to construct a whole btree.
 */
bool
xfs_repair_ag_has_space(
	struct xfs_perag		*pag,
	xfs_extlen_t			nr_blocks,
	enum xfs_ag_resv_type		type)
{
	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
}

/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xfs_repair_calc_ag_resblks(
	struct xfs_scrub_context	*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = 0;
	xfs_extlen_t			aglen = 0;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	/* Use in-core counters if possible. */
	pag = xfs_perag_get(mp, sm->sm_agno);
	if (pag->pagi_init)
		icount = pag->pagi_count;

	/*
	 * Otherwise try to get the actual counters from disk; if not, make
	 * some worst case assumptions.
	 */
	if (icount == 0) {
		error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
		if (error) {
			icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
		} else {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
	if (error) {
		aglen = mp->m_sb.sb_agblocks;
		freelen = aglen;
		usedlen = aglen;
	} else {
		aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
		freelen = pag->pagf_freeblks;
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}
	xfs_perag_put(pag);

	trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		inobt_sz *= 2;
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_sb_version_hasreflink(&mp->m_sb))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}

/* Allocate a block in an AG. */
int
xfs_repair_alloc_ag_block(
	struct xfs_scrub_context	*sc,
	struct xfs_owner_info		*oinfo,
	xfs_fsblock_t			*fsbno,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_alloc_arg		args = {0};
	xfs_agblock_t			bno;
	int				error;

	switch (resv) {
	case XFS_AG_RESV_AGFL:
	case XFS_AG_RESV_RMAPBT:
		error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
		if (error)
			return error;
		if (bno == NULLAGBLOCK)
			return -ENOSPC;
		xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
				1, false);
		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
		if (resv == XFS_AG_RESV_RMAPBT)
			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
		return 0;
	default:
		break;
	}

	args.tp = sc->tp;
	args.mp = sc->mp;
	args.oinfo = *oinfo;
	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.type = XFS_ALLOCTYPE_THIS_AG;
	args.resv = resv;

	error = xfs_alloc_vextent(&args);
	if (error)
		return error;
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	ASSERT(args.len == 1);
	*fsbno = args.fsbno;

	return 0;
}
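
/*
 * Design note (an inference, not stated in this file): the AGFL and
 * RMAPBT reservation types above take blocks straight off the free
 * list because rmapbt blocks must be obtainable without triggering
 * further allocbt/rmapbt updates, which a regular xfs_alloc_vextent()
 * allocation could require.
 */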

/* Initialize a new AG btree root block with zero entries. */
int
xfs_repair_init_btblock(
	struct xfs_scrub_context	*sc,
	xfs_fsblock_t			fsb,
	struct xfs_buf			**bpp,
	xfs_btnum_t			btnum,
	const struct xfs_buf_ops	*ops)
{
	struct xfs_trans		*tp = sc->tp;
	struct xfs_mount		*mp = sc->mp;
	struct xfs_buf			*bp;

	trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
			XFS_FSB_TO_AGBNO(mp, fsb), btnum);

	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
			XFS_FSB_TO_BB(mp, 1), 0);
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(tp, bp, 0, bp->b_length);
	bp->b_ops = ops;
	*bpp = bp;

	return 0;
}
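
/*
 * Sketch of how the two helpers above might combine to start a btree
 * rebuild; the owner/btnum/buf_ops choices here are illustrative only:
 *
 *	struct xfs_owner_info	oinfo;
 *	xfs_fsblock_t		fsb;
 *	struct xfs_buf		*bp;
 *
 *	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 *	error = xfs_repair_alloc_ag_block(sc, &oinfo, &fsb,
 *			XFS_AG_RESV_NONE);
 *	if (error)
 *		return error;
 *	error = xfs_repair_init_btblock(sc, fsb, &bp, XFS_BTNUM_BNO,
 *			&xfs_allocbt_buf_ops);
 */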

/*
 * Reconstructing per-AG Btrees
 *
 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 * we scan secondary space metadata to derive the records that should be in
 * the damaged btree, initialize a fresh btree root, and insert the records.
 * Note that for rebuilding the rmapbt we scan all the primary data to
 * generate the new records.
 *
 * However, that leaves the matter of removing all the metadata describing the
 * old broken structure.  For primary metadata we use the rmap data to collect
 * every extent with a matching rmap owner (exlist); we then iterate all other
 * metadata structures with the same rmap owner to collect the extents that
 * cannot be removed (sublist).  We then subtract sublist from exlist to
 * derive the blocks that were used by the old btree.  These blocks can be
 * reaped.
 *
 * For rmapbt reconstructions we must use different tactics for extent
 * collection.  First we iterate all primary metadata (this excludes the old
 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 * records are collected as exlist.  The bnobt records are collected as
 * sublist.  As with the other btrees we subtract sublist from exlist, and the
 * result (since the rmapbt lives in the free space) is the set of blocks from
 * the old rmapbt.
 */
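
/*
 * In pseudocode, the reaping sequence sketched above; the two
 * iteration steps are hypothetical placeholders for the real btree
 * walks, while the helper functions are the ones defined below:
 *
 *	for each rmapbt record matching the owner:
 *		xfs_repair_collect_btree_extent(sc, &exlist, fsbno, len);
 *	for each block of the surviving structures with that owner:
 *		xfs_repair_collect_btree_extent(sc, &sublist, fsbno, 1);
 *	xfs_repair_subtract_extents(sc, &exlist, &sublist);
 *	// reap everything left in exlist
 */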

/* Collect a dead btree extent for later disposal. */
int
xfs_repair_collect_btree_extent(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist,
	xfs_fsblock_t			fsbno,
	xfs_extlen_t			len)
{
	struct xfs_repair_extent	*rex;

	trace_xfs_repair_collect_btree_extent(sc->mp,
			XFS_FSB_TO_AGNO(sc->mp, fsbno),
			XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);

	rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
	if (!rex)
		return -ENOMEM;

	INIT_LIST_HEAD(&rex->list);
	rex->fsbno = fsbno;
	rex->len = len;
	list_add_tail(&rex->list, &exlist->list);

	return 0;
}

/*
 * An error happened during the rebuild so the transaction will be cancelled.
 * The fs will shut down, and the administrator has to unmount and run repair.
 * Therefore, free all the memory associated with the list so we can die.
 */
void
xfs_repair_cancel_btree_extents(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist)
{
	struct xfs_repair_extent	*rex;
	struct xfs_repair_extent	*n;

	for_each_xfs_repair_extent_safe(rex, n, exlist) {
		list_del(&rex->list);
		kmem_free(rex);
	}
}

/* Compare two btree extents. */
static int
xfs_repair_btree_extent_cmp(
	void				*priv,
	struct list_head		*a,
	struct list_head		*b)
{
	struct xfs_repair_extent	*ap;
	struct xfs_repair_extent	*bp;

	ap = container_of(a, struct xfs_repair_extent, list);
	bp = container_of(b, struct xfs_repair_extent, list);

	if (ap->fsbno > bp->fsbno)
		return 1;
	if (ap->fsbno < bp->fsbno)
		return -1;
	return 0;
}

/*
 * Remove all the blocks mentioned in @sublist from the extents in @exlist.
 *
 * The intent is that callers will iterate the rmapbt for all of its records
 * for a given owner to generate @exlist; and iterate all the blocks of the
 * metadata structures that are not being rebuilt and have the same rmapbt
 * owner to generate @sublist.  This routine subtracts all the extents
 * mentioned in sublist from all the extents linked in @exlist, which leaves
 * @exlist as the list of blocks that are not accounted for, which we assume
 * are the dead blocks of the old metadata structure.  The blocks mentioned in
 * @exlist can be reaped.
 */
#define LEFT_ALIGNED	(1 << 0)
#define RIGHT_ALIGNED	(1 << 1)
int
xfs_repair_subtract_extents(
	struct xfs_scrub_context	*sc,
	struct xfs_repair_extent_list	*exlist,
	struct xfs_repair_extent_list	*sublist)
{
	struct list_head		*lp;
	struct xfs_repair_extent	*ex;
	struct xfs_repair_extent	*newex;
	struct xfs_repair_extent	*subex;
	xfs_fsblock_t			sub_fsb;
	xfs_extlen_t			sub_len;
	int				state;
	int				error = 0;

	if (list_empty(&exlist->list) || list_empty(&sublist->list))
		return 0;
	ASSERT(!list_empty(&sublist->list));

	list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
	list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);

	/*
	 * Now that we've sorted both lists, we iterate exlist once, rolling
	 * forward through sublist and/or exlist as necessary until we find an
	 * overlap or reach the end of either list.  We do not reset lp to the
	 * head of exlist nor do we reset subex to the head of sublist.  The
	 * list traversal is similar to merge sort, but we're deleting
	 * instead.  In this manner we avoid O(n^2) operations.
	 */
	subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
			list);
	lp = exlist->list.next;
	while (lp != &exlist->list) {
		ex = list_entry(lp, struct xfs_repair_extent, list);

		/*
		 * Advance subex and/or ex until we find a pair that
		 * intersects or we run out of extents.
		 */
		while (subex->fsbno + subex->len <= ex->fsbno) {
			if (list_is_last(&subex->list, &sublist->list))
				goto out;
			subex = list_next_entry(subex, list);
		}
		if (subex->fsbno >= ex->fsbno + ex->len) {
			lp = lp->next;
			continue;
		}

		/* trim subex to fit the extent we have */
		sub_fsb = subex->fsbno;
		sub_len = subex->len;
		if (subex->fsbno < ex->fsbno) {
			sub_len -= ex->fsbno - subex->fsbno;
			sub_fsb = ex->fsbno;
		}
		if (sub_len > ex->len)
			sub_len = ex->len;

		state = 0;
		if (sub_fsb == ex->fsbno)
			state |= LEFT_ALIGNED;
		if (sub_fsb + sub_len == ex->fsbno + ex->len)
			state |= RIGHT_ALIGNED;
		switch (state) {
		case LEFT_ALIGNED:
			/* Coincides with only the left. */
			ex->fsbno += sub_len;
			ex->len -= sub_len;
			break;
		case RIGHT_ALIGNED:
			/* Coincides with only the right. */
			ex->len -= sub_len;
			lp = lp->next;
			break;
		case LEFT_ALIGNED | RIGHT_ALIGNED:
			/* Total overlap, just delete ex. */
			lp = lp->next;
			list_del(&ex->list);
			kmem_free(ex);
			break;
		case 0:
			/*
			 * Deleting from the middle: add the new right extent
			 * and then shrink the left extent.
			 */
			newex = kmem_alloc(sizeof(struct xfs_repair_extent),
					KM_MAYFAIL);
			if (!newex) {
				error = -ENOMEM;
				goto out;
			}
			INIT_LIST_HEAD(&newex->list);
			newex->fsbno = sub_fsb + sub_len;
			newex->len = ex->fsbno + ex->len - newex->fsbno;
			list_add(&newex->list, &ex->list);
			ex->len = sub_fsb - ex->fsbno;
			lp = lp->next;
			break;
		default:
			ASSERT(0);
			break;
		}
	}

out:
	return error;
}
#undef LEFT_ALIGNED
#undef RIGHT_ALIGNED
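
/*
 * Worked example of the subtraction above, using half-open [start, end)
 * notation with made-up block numbers: if exlist holds a single extent
 * [10, 20) and sublist holds [12, 15), the switch takes the
 * "deleting from the middle" case and leaves exlist holding [10, 12)
 * and [15, 20).  If sublist instead held [10, 13), the LEFT_ALIGNED
 * case would trim the extent to [13, 20).
 */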