Darrick J. Wong | 86ffa47 | 2020-05-01 16:00:45 -0700 | [diff] [blame] | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Copyright (c) 2000-2006 Silicon Graphics, Inc. |
| 4 | * All Rights Reserved. |
| 5 | */ |
| 6 | #include "xfs.h" |
| 7 | #include "xfs_fs.h" |
| 8 | #include "xfs_shared.h" |
| 9 | #include "xfs_format.h" |
| 10 | #include "xfs_log_format.h" |
| 11 | #include "xfs_trans_resv.h" |
| 12 | #include "xfs_bit.h" |
| 13 | #include "xfs_mount.h" |
| 14 | #include "xfs_trans.h" |
| 15 | #include "xfs_buf_item.h" |
| 16 | #include "xfs_trans_priv.h" |
| 17 | #include "xfs_trace.h" |
| 18 | #include "xfs_log.h" |
| 19 | #include "xfs_log_priv.h" |
| 20 | #include "xfs_log_recover.h" |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 21 | #include "xfs_error.h" |
| 22 | #include "xfs_inode.h" |
| 23 | #include "xfs_dir2.h" |
| 24 | #include "xfs_quota.h" |
Darrick J. Wong | 86ffa47 | 2020-05-01 16:00:45 -0700 | [diff] [blame] | 25 | |
| 26 | /* |
Darrick J. Wong | 17d29bf | 2020-05-01 16:00:56 -0700 | [diff] [blame] | 27 | * This structure is used during recovery to record the buf log items which |
| 28 | * have been canceled and should not be replayed. |
| 29 | */ |
| 30 | struct xfs_buf_cancel { |
| 31 | xfs_daddr_t bc_blkno; |
| 32 | uint bc_len; |
| 33 | int bc_refcount; |
| 34 | struct list_head bc_list; |
| 35 | }; |
| 36 | |
| 37 | static struct xfs_buf_cancel * |
| 38 | xlog_find_buffer_cancelled( |
| 39 | struct xlog *log, |
| 40 | xfs_daddr_t blkno, |
| 41 | uint len) |
| 42 | { |
| 43 | struct list_head *bucket; |
| 44 | struct xfs_buf_cancel *bcp; |
| 45 | |
| 46 | if (!log->l_buf_cancel_table) |
| 47 | return NULL; |
| 48 | |
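| | /* Find the hash bucket for this block and search it for an exact (blkno, len) match. */ |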
| 49 | bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); |
| 50 | list_for_each_entry(bcp, bucket, bc_list) { |
| 51 | if (bcp->bc_blkno == blkno && bcp->bc_len == len) |
| 52 | return bcp; |
| 53 | } |
| 54 | |
| 55 | return NULL; |
| 56 | } |
| 57 | |
| 58 | static bool |
| 59 | xlog_add_buffer_cancelled( |
| 60 | struct xlog *log, |
| 61 | xfs_daddr_t blkno, |
| 62 | uint len) |
| 63 | { |
| 64 | struct xfs_buf_cancel *bcp; |
| 65 | |
| 66 | /* |
| 67 | * If we find an existing cancel record, this indicates that the buffer |
| 68 | * was cancelled multiple times. To ensure that during pass 2 we keep |
| 69 | * the record in the table until we reach its last occurrence in the |
| 70 | * log, a reference count is kept to tell how many times we expect to |
| 71 | * see this record during the second pass. |
| 72 | */ |
| 73 | bcp = xlog_find_buffer_cancelled(log, blkno, len); |
| 74 | if (bcp) { |
| 75 | bcp->bc_refcount++; |
| 76 | return false; |
| 77 | } |
| 78 | |
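| | /* No existing record: add a new one with a single reference. */ |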
| 79 | bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0); |
| 80 | bcp->bc_blkno = blkno; |
| 81 | bcp->bc_len = len; |
| 82 | bcp->bc_refcount = 1; |
| 83 | list_add_tail(&bcp->bc_list, XLOG_BUF_CANCEL_BUCKET(log, blkno)); |
| 84 | return true; |
| 85 | } |
| 86 | |
| 87 | /* |
| 88 | * Check if there is an entry for blkno, len in the buffer cancel record table. |
| 89 | */ |
| 90 | bool |
| 91 | xlog_is_buffer_cancelled( |
| 92 | struct xlog *log, |
| 93 | xfs_daddr_t blkno, |
| 94 | uint len) |
| 95 | { |
| 96 | return xlog_find_buffer_cancelled(log, blkno, len) != NULL; |
| 97 | } |
| 98 | |
| 99 | /* |
| 100 | * Check if there is an entry for blkno, len in the buffer cancel record table, |
| 101 | * and decrement the reference count on it if there is one. |
| 102 | * |
| 103 | * Remove the cancel record once the refcount hits zero, so that if the same |
| 104 | * buffer is re-used again after its last cancellation we actually replay the |
| 105 | * changes made at that point. |
| 106 | */ |
| 107 | static bool |
| 108 | xlog_put_buffer_cancelled( |
| 109 | struct xlog *log, |
| 110 | xfs_daddr_t blkno, |
| 111 | uint len) |
| 112 | { |
| 113 | struct xfs_buf_cancel *bcp; |
| 114 | |
| 115 | bcp = xlog_find_buffer_cancelled(log, blkno, len); |
| 116 | if (!bcp) { |
| 117 | ASSERT(0); |
| 118 | return false; |
| 119 | } |
| 120 | |
| 121 | if (--bcp->bc_refcount == 0) { |
| 122 | list_del(&bcp->bc_list); |
| 123 | kmem_free(bcp); |
| 124 | } |
| 125 | return true; |
| 126 | } |
| 127 | |
| 128 | /* log buffer item recovery */ |
| 129 | |
| 130 | /* |
Darrick J. Wong | 86ffa47 | 2020-05-01 16:00:45 -0700 | [diff] [blame] | 131 | * Sort buffer items for log recovery. Most buffer items should end up on the |
| 132 | * buffer list and are recovered first, with the following exceptions: |
| 133 | * |
| 134 | * 1. XFS_BLF_CANCEL buffers must be processed last because some log items |
| 135 | * might depend on the incore cancellation record, and replaying a cancelled |
| 136 | * buffer item can remove the incore record. |
| 137 | * |
| 138 | * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that |
| 139 | * we replay di_next_unlinked only after flushing the inode 'free' state |
| 140 | * to the inode buffer. |
| 141 | * |
| 142 | * See xlog_recover_reorder_trans for more details. |
| 143 | */ |
| 144 | STATIC enum xlog_recover_reorder |
| 145 | xlog_recover_buf_reorder( |
| 146 | struct xlog_recover_item *item) |
| 147 | { |
| 148 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
| 149 | |
| 150 | if (buf_f->blf_flags & XFS_BLF_CANCEL) |
| 151 | return XLOG_REORDER_CANCEL_LIST; |
| 152 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) |
| 153 | return XLOG_REORDER_INODE_BUFFER_LIST; |
| 154 | return XLOG_REORDER_BUFFER_LIST; |
| 155 | } |
| 156 | |
Darrick J. Wong | 8ea5682 | 2020-05-01 16:00:46 -0700 | [diff] [blame] | 157 | STATIC void |
| 158 | xlog_recover_buf_ra_pass2( |
| 159 | struct xlog *log, |
| 160 | struct xlog_recover_item *item) |
| 161 | { |
| 162 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
| 163 | |
| 164 | xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL); |
| 165 | } |
| 166 | |
Darrick J. Wong | 3304a4f | 2020-05-01 16:00:46 -0700 | [diff] [blame] | 167 | /* |
| 168 | * Build up the table of buf cancel records so that we don't replay cancelled |
| 169 | * data in the second pass. |
| 170 | */ |
| 171 | static int |
| 172 | xlog_recover_buf_commit_pass1( |
| 173 | struct xlog *log, |
| 174 | struct xlog_recover_item *item) |
| 175 | { |
| 176 | struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr; |
| 177 | |
| 178 | if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) { |
| 179 | xfs_err(log->l_mp, "bad buffer log item size (%d)", |
| 180 | item->ri_buf[0].i_len); |
| 181 | return -EFSCORRUPTED; |
| 182 | } |
| 183 | |
| 184 | if (!(bf->blf_flags & XFS_BLF_CANCEL)) |
| 185 | trace_xfs_log_recover_buf_not_cancel(log, bf); |
| 186 | else if (xlog_add_buffer_cancelled(log, bf->blf_blkno, bf->blf_len)) |
| 187 | trace_xfs_log_recover_buf_cancel_add(log, bf); |
| 188 | else |
| 189 | trace_xfs_log_recover_buf_cancel_ref_inc(log, bf); |
| 190 | return 0; |
| 191 | } |
| 192 | |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 193 | /* |
| 194 | * Validate that the recovered buffer is of the correct type and attach the |
| 195 | * appropriate buffer operations to it for writeback. Magic numbers are in a |
| 196 | * few places: |
| 197 | * the first 16 bits of the buffer (inode buffer, dquot buffer), |
| 198 | * the first 32 bits of the buffer (most blocks), |
| 199 | * inside a struct xfs_da_blkinfo at the start of the buffer. |
| 200 | */ |
| 201 | static void |
| 202 | xlog_recover_validate_buf_type( |
| 203 | struct xfs_mount *mp, |
| 204 | struct xfs_buf *bp, |
| 205 | struct xfs_buf_log_format *buf_f, |
| 206 | xfs_lsn_t current_lsn) |
| 207 | { |
| 208 | struct xfs_da_blkinfo *info = bp->b_addr; |
| 209 | uint32_t magic32; |
| 210 | uint16_t magic16; |
| 211 | uint16_t magicda; |
| 212 | char *warnmsg = NULL; |
| 213 | |
| 214 | /* |
| 215 | * We can only do post recovery validation on items on CRC enabled |
| 216 | * filesystems as we need to know when the buffer was written to be able |
| 217 | * to determine if we should have replayed the item. If we replay old |
| 218 | * metadata over a newer buffer, then it will enter a temporarily |
| 219 | * inconsistent state resulting in verification failures. Hence for now |
| 220 | * just avoid the verification stage for non-crc filesystems. |
| 221 | */ |
| 222 | if (!xfs_sb_version_hascrc(&mp->m_sb)) |
| 223 | return; |
| 224 | |
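| | /* Pull out the candidate magic numbers so the type switch below can check them. */ |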
| 225 | magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); |
| 226 | magic16 = be16_to_cpu(*(__be16*)bp->b_addr); |
| 227 | magicda = be16_to_cpu(info->magic); |
| 228 | switch (xfs_blft_from_flags(buf_f)) { |
| 229 | case XFS_BLFT_BTREE_BUF: |
| 230 | switch (magic32) { |
| 231 | case XFS_ABTB_CRC_MAGIC: |
| 232 | case XFS_ABTB_MAGIC: |
| 233 | bp->b_ops = &xfs_bnobt_buf_ops; |
| 234 | break; |
| 235 | case XFS_ABTC_CRC_MAGIC: |
| 236 | case XFS_ABTC_MAGIC: |
| 237 | bp->b_ops = &xfs_cntbt_buf_ops; |
| 238 | break; |
| 239 | case XFS_IBT_CRC_MAGIC: |
| 240 | case XFS_IBT_MAGIC: |
| 241 | bp->b_ops = &xfs_inobt_buf_ops; |
| 242 | break; |
| 243 | case XFS_FIBT_CRC_MAGIC: |
| 244 | case XFS_FIBT_MAGIC: |
| 245 | bp->b_ops = &xfs_finobt_buf_ops; |
| 246 | break; |
| 247 | case XFS_BMAP_CRC_MAGIC: |
| 248 | case XFS_BMAP_MAGIC: |
| 249 | bp->b_ops = &xfs_bmbt_buf_ops; |
| 250 | break; |
| 251 | case XFS_RMAP_CRC_MAGIC: |
| 252 | bp->b_ops = &xfs_rmapbt_buf_ops; |
| 253 | break; |
| 254 | case XFS_REFC_CRC_MAGIC: |
| 255 | bp->b_ops = &xfs_refcountbt_buf_ops; |
| 256 | break; |
| 257 | default: |
| 258 | warnmsg = "Bad btree block magic!"; |
| 259 | break; |
| 260 | } |
| 261 | break; |
| 262 | case XFS_BLFT_AGF_BUF: |
| 263 | if (magic32 != XFS_AGF_MAGIC) { |
| 264 | warnmsg = "Bad AGF block magic!"; |
| 265 | break; |
| 266 | } |
| 267 | bp->b_ops = &xfs_agf_buf_ops; |
| 268 | break; |
| 269 | case XFS_BLFT_AGFL_BUF: |
| 270 | if (magic32 != XFS_AGFL_MAGIC) { |
| 271 | warnmsg = "Bad AGFL block magic!"; |
| 272 | break; |
| 273 | } |
| 274 | bp->b_ops = &xfs_agfl_buf_ops; |
| 275 | break; |
| 276 | case XFS_BLFT_AGI_BUF: |
| 277 | if (magic32 != XFS_AGI_MAGIC) { |
| 278 | warnmsg = "Bad AGI block magic!"; |
| 279 | break; |
| 280 | } |
| 281 | bp->b_ops = &xfs_agi_buf_ops; |
| 282 | break; |
| 283 | case XFS_BLFT_UDQUOT_BUF: |
| 284 | case XFS_BLFT_PDQUOT_BUF: |
| 285 | case XFS_BLFT_GDQUOT_BUF: |
| 286 | #ifdef CONFIG_XFS_QUOTA |
| 287 | if (magic16 != XFS_DQUOT_MAGIC) { |
| 288 | warnmsg = "Bad DQUOT block magic!"; |
| 289 | break; |
| 290 | } |
| 291 | bp->b_ops = &xfs_dquot_buf_ops; |
| 292 | #else |
| 293 | xfs_alert(mp, |
| 294 | "Trying to recover dquots without QUOTA support built in!"); |
| 295 | ASSERT(0); |
| 296 | #endif |
| 297 | break; |
| 298 | case XFS_BLFT_DINO_BUF: |
| 299 | if (magic16 != XFS_DINODE_MAGIC) { |
| 300 | warnmsg = "Bad INODE block magic!"; |
| 301 | break; |
| 302 | } |
| 303 | bp->b_ops = &xfs_inode_buf_ops; |
| 304 | break; |
| 305 | case XFS_BLFT_SYMLINK_BUF: |
| 306 | if (magic32 != XFS_SYMLINK_MAGIC) { |
| 307 | warnmsg = "Bad symlink block magic!"; |
| 308 | break; |
| 309 | } |
| 310 | bp->b_ops = &xfs_symlink_buf_ops; |
| 311 | break; |
| 312 | case XFS_BLFT_DIR_BLOCK_BUF: |
| 313 | if (magic32 != XFS_DIR2_BLOCK_MAGIC && |
| 314 | magic32 != XFS_DIR3_BLOCK_MAGIC) { |
| 315 | warnmsg = "Bad dir block magic!"; |
| 316 | break; |
| 317 | } |
| 318 | bp->b_ops = &xfs_dir3_block_buf_ops; |
| 319 | break; |
| 320 | case XFS_BLFT_DIR_DATA_BUF: |
| 321 | if (magic32 != XFS_DIR2_DATA_MAGIC && |
| 322 | magic32 != XFS_DIR3_DATA_MAGIC) { |
| 323 | warnmsg = "Bad dir data magic!"; |
| 324 | break; |
| 325 | } |
| 326 | bp->b_ops = &xfs_dir3_data_buf_ops; |
| 327 | break; |
| 328 | case XFS_BLFT_DIR_FREE_BUF: |
| 329 | if (magic32 != XFS_DIR2_FREE_MAGIC && |
| 330 | magic32 != XFS_DIR3_FREE_MAGIC) { |
| 331 | warnmsg = "Bad dir3 free magic!"; |
| 332 | break; |
| 333 | } |
| 334 | bp->b_ops = &xfs_dir3_free_buf_ops; |
| 335 | break; |
| 336 | case XFS_BLFT_DIR_LEAF1_BUF: |
| 337 | if (magicda != XFS_DIR2_LEAF1_MAGIC && |
| 338 | magicda != XFS_DIR3_LEAF1_MAGIC) { |
| 339 | warnmsg = "Bad dir leaf1 magic!"; |
| 340 | break; |
| 341 | } |
| 342 | bp->b_ops = &xfs_dir3_leaf1_buf_ops; |
| 343 | break; |
| 344 | case XFS_BLFT_DIR_LEAFN_BUF: |
| 345 | if (magicda != XFS_DIR2_LEAFN_MAGIC && |
| 346 | magicda != XFS_DIR3_LEAFN_MAGIC) { |
| 347 | warnmsg = "Bad dir leafn magic!"; |
| 348 | break; |
| 349 | } |
| 350 | bp->b_ops = &xfs_dir3_leafn_buf_ops; |
| 351 | break; |
| 352 | case XFS_BLFT_DA_NODE_BUF: |
| 353 | if (magicda != XFS_DA_NODE_MAGIC && |
| 354 | magicda != XFS_DA3_NODE_MAGIC) { |
| 355 | warnmsg = "Bad da node magic!"; |
| 356 | break; |
| 357 | } |
| 358 | bp->b_ops = &xfs_da3_node_buf_ops; |
| 359 | break; |
| 360 | case XFS_BLFT_ATTR_LEAF_BUF: |
| 361 | if (magicda != XFS_ATTR_LEAF_MAGIC && |
| 362 | magicda != XFS_ATTR3_LEAF_MAGIC) { |
| 363 | warnmsg = "Bad attr leaf magic!"; |
| 364 | break; |
| 365 | } |
| 366 | bp->b_ops = &xfs_attr3_leaf_buf_ops; |
| 367 | break; |
| 368 | case XFS_BLFT_ATTR_RMT_BUF: |
| 369 | if (magic32 != XFS_ATTR3_RMT_MAGIC) { |
| 370 | warnmsg = "Bad attr remote magic!"; |
| 371 | break; |
| 372 | } |
| 373 | bp->b_ops = &xfs_attr3_rmt_buf_ops; |
| 374 | break; |
| 375 | case XFS_BLFT_SB_BUF: |
| 376 | if (magic32 != XFS_SB_MAGIC) { |
| 377 | warnmsg = "Bad SB block magic!"; |
| 378 | break; |
| 379 | } |
| 380 | bp->b_ops = &xfs_sb_buf_ops; |
| 381 | break; |
| 382 | #ifdef CONFIG_XFS_RT |
| 383 | case XFS_BLFT_RTBITMAP_BUF: |
| 384 | case XFS_BLFT_RTSUMMARY_BUF: |
| 385 | /* no magic numbers for verification of RT buffers */ |
| 386 | bp->b_ops = &xfs_rtbuf_ops; |
| 387 | break; |
| 388 | #endif /* CONFIG_XFS_RT */ |
| 389 | default: |
| 390 | xfs_warn(mp, "Unknown buffer type %d!", |
| 391 | xfs_blft_from_flags(buf_f)); |
| 392 | break; |
| 393 | } |
| 394 | |
| 395 | /* |
| 396 | * Nothing else to do in the case of a NULL current LSN as this means |
| 397 | * the buffer is more recent than the change in the log and will be |
| 398 | * skipped. |
| 399 | */ |
| 400 | if (current_lsn == NULLCOMMITLSN) |
| 401 | return; |
| 402 | |
| 403 | if (warnmsg) { |
| 404 | xfs_warn(mp, warnmsg); |
| 405 | ASSERT(0); |
| 406 | } |
| 407 | |
| 408 | /* |
| 409 | * We must update the metadata LSN of the buffer as it is written out to |
| 410 | * ensure that older transactions never replay over this one and corrupt |
| 411 | * the buffer. This can occur if log recovery is interrupted at some |
| 412 | * point after the current transaction completes, at which point a |
| 413 | * subsequent mount starts recovery from the beginning. |
| 414 | * |
| 415 | * Write verifiers update the metadata LSN from log items attached to |
| 416 | * the buffer. Therefore, initialize a bli purely to carry the LSN to |
| 417 | * the verifier. We'll clean it up in our ->iodone() callback. |
| 418 | */ |
| 419 | if (bp->b_ops) { |
| 420 | struct xfs_buf_log_item *bip; |
| 421 | |
Dave Chinner | 9fe5c77 | 2020-06-29 14:48:47 -0700 | [diff] [blame] | 422 | bp->b_flags |= _XBF_LOGRECOVERY; |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 423 | xfs_buf_item_init(bp, mp); |
| 424 | bip = bp->b_log_item; |
| 425 | bip->bli_item.li_lsn = current_lsn; |
| 426 | } |
| 427 | } |
| 428 | |
| 429 | /* |
| 430 | * Perform a 'normal' buffer recovery. Each logged region of the |
| 431 | * buffer should be copied over the corresponding region in the |
| 432 | * given buffer. The bitmap in the buf log format structure indicates |
| 433 | * where to place the logged data. |
| 434 | */ |
| 435 | STATIC void |
| 436 | xlog_recover_do_reg_buffer( |
| 437 | struct xfs_mount *mp, |
| 438 | struct xlog_recover_item *item, |
| 439 | struct xfs_buf *bp, |
| 440 | struct xfs_buf_log_format *buf_f, |
| 441 | xfs_lsn_t current_lsn) |
| 442 | { |
| 443 | int i; |
| 444 | int bit; |
| 445 | int nbits; |
| 446 | xfs_failaddr_t fa; |
| 447 | const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot); |
| 448 | |
| 449 | trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); |
| 450 | |
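| | /* Walk the dirty bitmap; each contiguous run of set bits describes one logged region to copy into the buffer. */ |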
| 451 | bit = 0; |
| 452 | i = 1; /* 0 is the buf format structure */ |
| 453 | while (1) { |
| 454 | bit = xfs_next_bit(buf_f->blf_data_map, |
| 455 | buf_f->blf_map_size, bit); |
| 456 | if (bit == -1) |
| 457 | break; |
| 458 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
| 459 | buf_f->blf_map_size, bit); |
| 460 | ASSERT(nbits > 0); |
| 461 | ASSERT(item->ri_buf[i].i_addr != NULL); |
| 462 | ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); |
| 463 | ASSERT(BBTOB(bp->b_length) >= |
| 464 | ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); |
| 465 | |
| 466 | /* |
| 467 | * The dirty regions logged in the buffer, even though |
| 468 | * contiguous, may span multiple chunks. This is because the |
| 469 | * dirty region may span a physical page boundary in a buffer |
| 470 | * and hence be split into two separate vectors for writing into |
| 471 | * the log. Hence we need to trim nbits back to the length of |
| 472 | * the current region being copied out of the log. |
| 473 | */ |
| 474 | if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) |
| 475 | nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; |
| 476 | |
| 477 | /* |
| 478 | * Do a sanity check if this is a dquot buffer. Just checking |
| 479 | * the first dquot in the buffer should do. XXX This is |
| 480 | * probably a good thing to do for other buf types also. |
| 481 | */ |
| 482 | fa = NULL; |
| 483 | if (buf_f->blf_flags & |
| 484 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
| 485 | if (item->ri_buf[i].i_addr == NULL) { |
| 486 | xfs_alert(mp, |
| 487 | "XFS: NULL dquot in %s.", __func__); |
| 488 | goto next; |
| 489 | } |
| 490 | if (item->ri_buf[i].i_len < size_disk_dquot) { |
| 491 | xfs_alert(mp, |
| 492 | "XFS: dquot too small (%d) in %s.", |
| 493 | item->ri_buf[i].i_len, __func__); |
| 494 | goto next; |
| 495 | } |
Darrick J. Wong | f9751c4 | 2020-07-15 17:41:24 -0700 | [diff] [blame] | 496 | fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1); |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 497 | if (fa) { |
| 498 | xfs_alert(mp, |
| 499 | "dquot corrupt at %pS trying to replay into block 0x%llx", |
| 500 | fa, bp->b_bn); |
| 501 | goto next; |
| 502 | } |
| 503 | } |
| 504 | |
| 505 | memcpy(xfs_buf_offset(bp, |
| 506 | (uint)bit << XFS_BLF_SHIFT), /* dest */ |
| 507 | item->ri_buf[i].i_addr, /* source */ |
| 508 | nbits<<XFS_BLF_SHIFT); /* length */ |
| 509 | next: |
| 510 | i++; |
| 511 | bit += nbits; |
| 512 | } |
| 513 | |
| 514 | /* Shouldn't be any more regions */ |
| 515 | ASSERT(i == item->ri_total); |
| 516 | |
| 517 | xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn); |
| 518 | } |
| 519 | |
| 520 | /* |
| 521 | * Perform a dquot buffer recovery. |
| 522 | * Simple algorithm: if we have found a QUOTAOFF log item of the same type |
| 523 | * (i.e. USR or GRP), then just toss this buffer away; don't recover it. |
| 524 | * Else, treat it as a regular buffer and do recovery. |
| 525 | * |
| 526 | * Return false if the buffer was tossed and true if we recovered the buffer to |
| 527 | * indicate to the caller if the buffer needs writing. |
| 528 | */ |
| 529 | STATIC bool |
| 530 | xlog_recover_do_dquot_buffer( |
| 531 | struct xfs_mount *mp, |
| 532 | struct xlog *log, |
| 533 | struct xlog_recover_item *item, |
| 534 | struct xfs_buf *bp, |
| 535 | struct xfs_buf_log_format *buf_f) |
| 536 | { |
| 537 | uint type; |
| 538 | |
| 539 | trace_xfs_log_recover_buf_dquot_buf(log, buf_f); |
| 540 | |
| 541 | /* |
| 542 | * Filesystems are required to send in quota flags at mount time. |
| 543 | */ |
| 544 | if (!mp->m_qflags) |
| 545 | return false; |
| 546 | |
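| | /* Map the buf log format flags to the dquot type(s) logged in this buffer. */ |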
| 547 | type = 0; |
| 548 | if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) |
Darrick J. Wong | 8cd4901 | 2020-07-15 17:42:36 -0700 | [diff] [blame^] | 549 | type |= XFS_DQTYPE_USER; |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 550 | if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) |
Darrick J. Wong | 8cd4901 | 2020-07-15 17:42:36 -0700 | [diff] [blame^] | 551 | type |= XFS_DQTYPE_PROJ; |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 552 | if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) |
Darrick J. Wong | 8cd4901 | 2020-07-15 17:42:36 -0700 | [diff] [blame^] | 553 | type |= XFS_DQTYPE_GROUP; |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 554 | /* |
| 555 | * This type of quotas was turned off, so ignore this buffer |
| 556 | */ |
| 557 | if (log->l_quotaoffs_flag & type) |
| 558 | return false; |
| 559 | |
| 560 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN); |
| 561 | return true; |
| 562 | } |
| 563 | |
| 564 | /* |
| 565 | * Perform recovery for a buffer full of inodes. In these buffers, the only |
| 566 | * data which should be recovered is that which corresponds to the |
| 567 | * di_next_unlinked pointers in the on disk inode structures. The rest of the |
| 568 | * data for the inodes is always logged through the inodes themselves rather |
| 569 | * than the inode buffer and is recovered in xlog_recover_inode_pass2(). |
| 570 | * |
| 571 | * The only time when buffers full of inodes are fully recovered is when the |
| 572 | * buffer is full of newly allocated inodes. In this case the buffer will |
| 573 | * not be marked as an inode buffer and so will be sent to |
| 574 | * xlog_recover_do_reg_buffer() below during recovery. |
| 575 | */ |
| 576 | STATIC int |
| 577 | xlog_recover_do_inode_buffer( |
| 578 | struct xfs_mount *mp, |
| 579 | struct xlog_recover_item *item, |
| 580 | struct xfs_buf *bp, |
| 581 | struct xfs_buf_log_format *buf_f) |
| 582 | { |
| 583 | int i; |
| 584 | int item_index = 0; |
| 585 | int bit = 0; |
| 586 | int nbits = 0; |
| 587 | int reg_buf_offset = 0; |
| 588 | int reg_buf_bytes = 0; |
| 589 | int next_unlinked_offset; |
| 590 | int inodes_per_buf; |
| 591 | xfs_agino_t *logged_nextp; |
| 592 | xfs_agino_t *buffer_nextp; |
| 593 | |
| 594 | trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); |
| 595 | |
| 596 | /* |
| 597 | * Post recovery validation only works properly on CRC enabled |
| 598 | * filesystems. |
| 599 | */ |
| 600 | if (xfs_sb_version_hascrc(&mp->m_sb)) |
| 601 | bp->b_ops = &xfs_inode_buf_ops; |
| 602 | |
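| | /* Walk each inode in the buffer, looking for the logged copy of its di_next_unlinked field. */ |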
| 603 | inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; |
| 604 | for (i = 0; i < inodes_per_buf; i++) { |
| 605 | next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + |
| 606 | offsetof(xfs_dinode_t, di_next_unlinked); |
| 607 | |
| 608 | while (next_unlinked_offset >= |
| 609 | (reg_buf_offset + reg_buf_bytes)) { |
| 610 | /* |
| 611 | * The next di_next_unlinked field is beyond |
| 612 | * the current logged region. Find the next |
| 613 | * logged region that contains or is beyond |
| 614 | * the current di_next_unlinked field. |
| 615 | */ |
| 616 | bit += nbits; |
| 617 | bit = xfs_next_bit(buf_f->blf_data_map, |
| 618 | buf_f->blf_map_size, bit); |
| 619 | |
| 620 | /* |
| 621 | * If there are no more logged regions in the |
| 622 | * buffer, then we're done. |
| 623 | */ |
| 624 | if (bit == -1) |
| 625 | return 0; |
| 626 | |
| 627 | nbits = xfs_contig_bits(buf_f->blf_data_map, |
| 628 | buf_f->blf_map_size, bit); |
| 629 | ASSERT(nbits > 0); |
| 630 | reg_buf_offset = bit << XFS_BLF_SHIFT; |
| 631 | reg_buf_bytes = nbits << XFS_BLF_SHIFT; |
| 632 | item_index++; |
| 633 | } |
| 634 | |
| 635 | /* |
| 636 | * If the current logged region starts after the current |
| 637 | * di_next_unlinked field, then move on to the next |
| 638 | * di_next_unlinked field. |
| 639 | */ |
| 640 | if (next_unlinked_offset < reg_buf_offset) |
| 641 | continue; |
| 642 | |
| 643 | ASSERT(item->ri_buf[item_index].i_addr != NULL); |
| 644 | ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); |
| 645 | ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length)); |
| 646 | |
| 647 | /* |
| 648 | * The current logged region contains a copy of the |
| 649 | * current di_next_unlinked field. Extract its value |
| 650 | * and copy it to the buffer copy. |
| 651 | */ |
| 652 | logged_nextp = item->ri_buf[item_index].i_addr + |
| 653 | next_unlinked_offset - reg_buf_offset; |
| 654 | if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) { |
| 655 | xfs_alert(mp, |
| 656 | "Bad inode buffer log record (ptr = "PTR_FMT", bp = "PTR_FMT"). " |
| 657 | "Trying to replay bad (0) inode di_next_unlinked field.", |
| 658 | item, bp); |
| 659 | return -EFSCORRUPTED; |
| 660 | } |
| 661 | |
| 662 | buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); |
| 663 | *buffer_nextp = *logged_nextp; |
| 664 | |
| 665 | /* |
| 666 | * If necessary, recalculate the CRC in the on-disk inode. We |
| 667 | * have to leave the inode in a consistent state for whoever |
| 668 | * reads it next.... |
| 669 | */ |
| 670 | xfs_dinode_calc_crc(mp, |
| 671 | xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); |
| 672 | |
| 673 | } |
| 674 | |
| 675 | return 0; |
| 676 | } |
| 677 | |
| 678 | /* |
| 679 | * V5 filesystems know the age of the buffer on disk being recovered. We can |
| 680 | * have newer objects on disk than we are replaying, and so for these cases we |
| 681 | * don't want to replay the current change as that will make the buffer contents |
| 682 | * temporarily invalid on disk. |
| 683 | * |
| 684 | * The magic number might not match the buffer type we are going to recover |
| 685 | * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence |
| 686 | * extract the LSN of the existing object in the buffer based on its current |
| 687 | * magic number. If we don't recognise the magic number in the buffer, then |
| 688 | * return an LSN of -1 so that the caller knows it was an unrecognised block and |
| 689 | * so can recover the buffer. |
| 690 | * |
| 691 | * Note: we cannot rely solely on magic number matches to determine that the |
| 692 | * buffer has a valid LSN - we also need to verify that it belongs to this |
| 693 | * filesystem, so we need to extract the object's LSN and compare it to that |
| 694 | * which we read from the superblock. If the UUIDs don't match, then we've got a |
| 695 | * stale metadata block from an old filesystem instance that we need to recover |
| 696 | * over the top of. |
| 697 | */ |
| 698 | static xfs_lsn_t |
| 699 | xlog_recover_get_buf_lsn( |
| 700 | struct xfs_mount *mp, |
| 701 | struct xfs_buf *bp) |
| 702 | { |
| 703 | uint32_t magic32; |
| 704 | uint16_t magic16; |
| 705 | uint16_t magicda; |
| 706 | void *blk = bp->b_addr; |
| 707 | uuid_t *uuid; |
| 708 | xfs_lsn_t lsn = -1; |
| 709 | |
| 710 | /* v4 filesystems always recover immediately */ |
| 711 | if (!xfs_sb_version_hascrc(&mp->m_sb)) |
| 712 | goto recover_immediately; |
| 713 | |
| 714 | magic32 = be32_to_cpu(*(__be32 *)blk); |
| 715 | switch (magic32) { |
| 716 | case XFS_ABTB_CRC_MAGIC: |
| 717 | case XFS_ABTC_CRC_MAGIC: |
| 718 | case XFS_ABTB_MAGIC: |
| 719 | case XFS_ABTC_MAGIC: |
| 720 | case XFS_RMAP_CRC_MAGIC: |
| 721 | case XFS_REFC_CRC_MAGIC: |
| 722 | case XFS_IBT_CRC_MAGIC: |
| 723 | case XFS_IBT_MAGIC: { |
| 724 | struct xfs_btree_block *btb = blk; |
| 725 | |
| 726 | lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); |
| 727 | uuid = &btb->bb_u.s.bb_uuid; |
| 728 | break; |
| 729 | } |
| 730 | case XFS_BMAP_CRC_MAGIC: |
| 731 | case XFS_BMAP_MAGIC: { |
| 732 | struct xfs_btree_block *btb = blk; |
| 733 | |
| 734 | lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); |
| 735 | uuid = &btb->bb_u.l.bb_uuid; |
| 736 | break; |
| 737 | } |
| 738 | case XFS_AGF_MAGIC: |
| 739 | lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); |
| 740 | uuid = &((struct xfs_agf *)blk)->agf_uuid; |
| 741 | break; |
| 742 | case XFS_AGFL_MAGIC: |
| 743 | lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); |
| 744 | uuid = &((struct xfs_agfl *)blk)->agfl_uuid; |
| 745 | break; |
| 746 | case XFS_AGI_MAGIC: |
| 747 | lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); |
| 748 | uuid = &((struct xfs_agi *)blk)->agi_uuid; |
| 749 | break; |
| 750 | case XFS_SYMLINK_MAGIC: |
| 751 | lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); |
| 752 | uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; |
| 753 | break; |
| 754 | case XFS_DIR3_BLOCK_MAGIC: |
| 755 | case XFS_DIR3_DATA_MAGIC: |
| 756 | case XFS_DIR3_FREE_MAGIC: |
| 757 | lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); |
| 758 | uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; |
| 759 | break; |
| 760 | case XFS_ATTR3_RMT_MAGIC: |
| 761 | /* |
| 762 | * Remote attr blocks are written synchronously, rather than |
| 763 | * being logged. That means they do not contain a valid LSN |
| 764 | * (i.e. transactionally ordered) in them, and hence any time we |
| 765 | * see a buffer to replay over the top of a remote attribute |
| 766 | * block we should simply do so. |
| 767 | */ |
| 768 | goto recover_immediately; |
| 769 | case XFS_SB_MAGIC: |
| 770 | /* |
| 771 | * superblock uuids are magic. We may or may not have a |
| 772 | * sb_meta_uuid on disk, but it will be set in the in-core |
| 773 | * superblock. We set the uuid pointer for verification |
| 774 | * according to the superblock feature mask to ensure we check |
| 775 | * the relevant UUID in the superblock. |
| 776 | */ |
| 777 | lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); |
| 778 | if (xfs_sb_version_hasmetauuid(&mp->m_sb)) |
| 779 | uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; |
| 780 | else |
| 781 | uuid = &((struct xfs_dsb *)blk)->sb_uuid; |
| 782 | break; |
| 783 | default: |
| 784 | break; |
| 785 | } |
| 786 | |
| 787 | if (lsn != (xfs_lsn_t)-1) { |
| 788 | if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid)) |
| 789 | goto recover_immediately; |
| 790 | return lsn; |
| 791 | } |
| 792 | |
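| | /* No 32-bit magic matched; check the xfs_da_blkinfo header for dir leaf and da node blocks. */ |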
| 793 | magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); |
| 794 | switch (magicda) { |
| 795 | case XFS_DIR3_LEAF1_MAGIC: |
| 796 | case XFS_DIR3_LEAFN_MAGIC: |
| 797 | case XFS_DA3_NODE_MAGIC: |
| 798 | lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); |
| 799 | uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; |
| 800 | break; |
| 801 | default: |
| 802 | break; |
| 803 | } |
| 804 | |
| 805 | if (lsn != (xfs_lsn_t)-1) { |
| 806 | if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) |
| 807 | goto recover_immediately; |
| 808 | return lsn; |
| 809 | } |
| 810 | |
| 811 | /* |
| 812 | * We do individual object checks on dquot and inode buffers as they |
| 813 | * have their own individual LSN records. Also, we could have a stale |
| 814 | * buffer here, so we have to at least recognise these buffer types. |
| 815 | * |
| 816 | * A noted complexity here is inode unlinked list processing - it logs |
| 817 | * the inode directly in the buffer, but we don't know which inodes have |
| 818 | * been modified, and there is no global buffer LSN. Hence we need to |
| 819 | * recover all inode buffer types immediately. This problem will be |
| 820 | * fixed by logical logging of the unlinked list modifications. |
| 821 | */ |
| 822 | magic16 = be16_to_cpu(*(__be16 *)blk); |
| 823 | switch (magic16) { |
| 824 | case XFS_DQUOT_MAGIC: |
| 825 | case XFS_DINODE_MAGIC: |
| 826 | goto recover_immediately; |
| 827 | default: |
| 828 | break; |
| 829 | } |
| 830 | |
| 831 | /* unknown buffer contents, recover immediately */ |
| 832 | |
| 833 | recover_immediately: |
| 834 | return (xfs_lsn_t)-1; |
| 835 | |
| 836 | } |
| 837 | |
| 838 | /* |
| 839 | * This routine replays a modification made to a buffer at runtime. |
| 840 | * There are actually two types of buffer, regular and inode, which |
| 841 | * are handled differently. Inode buffers are handled differently |
| 842 | * in that we only recover a specific set of data from them, namely |
| 843 | * the inode di_next_unlinked fields. This is because all other inode |
| 844 | * data is actually logged via inode records and any data we replay |
| 845 | * here which overlaps that may be stale. |
| 846 | * |
| 847 | * When meta-data buffers are freed at run time we log a buffer item |
| 848 | * with the XFS_BLF_CANCEL bit set to indicate that previous copies |
| 849 | * of the buffer in the log should not be replayed at recovery time. |
| 850 | * This is so that if the blocks covered by the buffer are reused for |
| 851 | * file data before we crash we don't end up replaying old, freed |
| 852 | * meta-data into a user's file. |
| 853 | * |
| 854 | * To handle the cancellation of buffer log items, we make two passes |
| 855 | * over the log during recovery. During the first we build a table of |
| 856 | * those buffers which have been cancelled, and during the second we |
| 857 | * only replay those buffers which do not have corresponding cancel |
| 858 | * records in the table. See xlog_recover_buf_pass[1,2] above |
| 859 | * for more details on the implementation of the table of cancel records. |
| 860 | */ |
| 861 | STATIC int |
| 862 | xlog_recover_buf_commit_pass2( |
| 863 | struct xlog *log, |
| 864 | struct list_head *buffer_list, |
| 865 | struct xlog_recover_item *item, |
| 866 | xfs_lsn_t current_lsn) |
| 867 | { |
| 868 | struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; |
| 869 | struct xfs_mount *mp = log->l_mp; |
| 870 | struct xfs_buf *bp; |
| 871 | int error; |
| 872 | uint buf_flags; |
| 873 | xfs_lsn_t lsn; |
| 874 | |
| 875 | /* |
| 876 | * In this pass we only want to recover all the buffers which have |
| 877 | * not been cancelled and are not cancellation buffers themselves. |
| 878 | */ |
| 879 | if (buf_f->blf_flags & XFS_BLF_CANCEL) { |
| 880 | if (xlog_put_buffer_cancelled(log, buf_f->blf_blkno, |
| 881 | buf_f->blf_len)) |
| 882 | goto cancelled; |
| 883 | } else { |
| 884 | |
| 885 | if (xlog_is_buffer_cancelled(log, buf_f->blf_blkno, |
| 886 | buf_f->blf_len)) |
| 887 | goto cancelled; |
| 888 | } |
| 889 | |
| 890 | trace_xfs_log_recover_buf_recover(log, buf_f); |
| 891 | |
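| | /* Inode buffers are read unmapped; recovery only accesses them through xfs_buf_offset(). */ |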
| 892 | buf_flags = 0; |
| 893 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) |
| 894 | buf_flags |= XBF_UNMAPPED; |
| 895 | |
| 896 | error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, |
| 897 | buf_flags, &bp, NULL); |
| 898 | if (error) |
| 899 | return error; |
| 900 | |
| 901 | /* |
| 902 | * Recover the buffer only if we get an LSN from it and it's less than |
| 903 | * the lsn of the transaction we are replaying. |
| 904 | * |
| 905 | * Note that we have to be extremely careful of readahead here. |
| 906 | * Readahead does not attach verifiers to the buffers, so if we don't |
| 907 | * actually do any replay after readahead because the LSN we found in |
| 908 | * the buffer is more recent than the current transaction, then we need |
| 909 | * to attach the verifier directly. Failure to do so means that future |
| 910 | * recovery actions (e.g. EFI and unlinked list recovery) will operate |
| 911 | * on buffers that don't have the verifier attached, which can lead to |
| 912 | * blocks on disk having the correct content but a stale |
| 913 | * CRC. |
| 914 | * |
| 915 | * It is safe to assume these clean buffers are currently up to date. |
| 916 | * If the buffer is dirtied by a later transaction being replayed, then |
| 917 | * the verifier will be reset to match whatever recover turns that |
| 918 | * buffer into. |
| 919 | */ |
| 920 | lsn = xlog_recover_get_buf_lsn(mp, bp); |
| 921 | if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { |
| 922 | trace_xfs_log_recover_buf_skip(log, buf_f); |
| 923 | xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN); |
| 924 | goto out_release; |
| 925 | } |
| 926 | |
| 927 | if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { |
| 928 | error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); |
| 929 | if (error) |
| 930 | goto out_release; |
| 931 | } else if (buf_f->blf_flags & |
| 932 | (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { |
| 933 | bool dirty; |
| 934 | |
| 935 | dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); |
| 936 | if (!dirty) |
| 937 | goto out_release; |
| 938 | } else { |
| 939 | xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); |
| 940 | } |
| 941 | |
| 942 | /* |
| 943 | * Perform delayed write on the buffer. Asynchronous writes will be |
| 944 | * slower when taking into account all the buffers to be flushed. |
| 945 | * |
| 946 | * Also make sure that only inode buffers with good sizes stay in |
| 947 | * the buffer cache. The kernel moves inodes in buffers of 1 block |
| 948 | * or inode_cluster_size bytes, whichever is bigger. The inode |
| 949 | * buffers in the log can be a different size if the log was generated |
| 950 | * by an older kernel using unclustered inode buffers or a newer kernel |
| 951 | * running with a different inode cluster size. Regardless, if the |
| 952 | * inode buffer size isn't max(blocksize, inode_cluster_size) |
| 953 | * for *our* value of inode_cluster_size, then we need to keep |
| 954 | * the buffer out of the buffer cache so that the buffer won't |
| 955 | * overlap with future reads of those inodes. |
| 956 | */ |
| 957 | if (XFS_DINODE_MAGIC == |
| 958 | be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && |
| 959 | (BBTOB(bp->b_length) != M_IGEO(log->l_mp)->inode_cluster_size)) { |
| 960 | xfs_buf_stale(bp); |
| 961 | error = xfs_bwrite(bp); |
| 962 | } else { |
| 963 | ASSERT(bp->b_mount == mp); |
Dave Chinner | 9fe5c77 | 2020-06-29 14:48:47 -0700 | [diff] [blame] | 964 | bp->b_flags |= _XBF_LOGRECOVERY; |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 965 | xfs_buf_delwri_queue(bp, buffer_list); |
| 966 | } |
| 967 | |
| 968 | out_release: |
| 969 | xfs_buf_relse(bp); |
| 970 | return error; |
| 971 | cancelled: |
| 972 | trace_xfs_log_recover_buf_cancel(log, buf_f); |
| 973 | return 0; |
| 974 | } |
| 975 | |
Darrick J. Wong | 86ffa47 | 2020-05-01 16:00:45 -0700 | [diff] [blame] | 976 | const struct xlog_recover_item_ops xlog_buf_item_ops = { |
| 977 | .item_type = XFS_LI_BUF, |
| 978 | .reorder = xlog_recover_buf_reorder, |
Darrick J. Wong | 8ea5682 | 2020-05-01 16:00:46 -0700 | [diff] [blame] | 979 | .ra_pass2 = xlog_recover_buf_ra_pass2, |
Darrick J. Wong | 3304a4f | 2020-05-01 16:00:46 -0700 | [diff] [blame] | 980 | .commit_pass1 = xlog_recover_buf_commit_pass1, |
Darrick J. Wong | 1094d3f | 2020-05-01 16:00:47 -0700 | [diff] [blame] | 981 | .commit_pass2 = xlog_recover_buf_commit_pass2, |
Darrick J. Wong | 86ffa47 | 2020-05-01 16:00:45 -0700 | [diff] [blame] | 982 | }; |