ext4: fast commit recovery path

This patch adds fast commit recovery path support for Ext4 file
system. We add several helper functions that are similar in spirit to
e2fsprogs journal recovery path handlers. Example of such functions
include - a simple block allocator, idempotent block bitmap update
function etc. Using these routines and the fast commit log in the fast
commit area, the recovery path (ext4_fc_replay()) performs fast commit
log recovery.

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@gmail.com>
Link: https://lore.kernel.org/r/20201015203802.3597742-8-harshadshirwadkar@gmail.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 888d9d21..426c0ab 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -166,7 +166,8 @@ void ext4_fc_start_update(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 restart:
@@ -205,7 +206,8 @@ void ext4_fc_stop_update(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 	if (atomic_dec_and_test(&ei->i_fc_updates))
@@ -220,11 +222,8 @@ void ext4_fc_del(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
-		return;
-
-
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 		return;
 
 restart:
@@ -266,6 +265,10 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	sbi->s_mount_state |= EXT4_FC_INELIGIBLE;
 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
@@ -279,6 +282,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 	atomic_inc(&sbi->s_fc_ineligible_updates);
@@ -291,6 +298,10 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason)
  */
 void ext4_fc_stop_ineligible(struct super_block *sb)
 {
+	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
+	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
+		return;
+
 	EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE;
 	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 }
@@ -321,7 +332,8 @@ static int ext4_fc_track_template(
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	int ret;
 
-	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
+	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
+	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 		return -EOPNOTSUPP;
 
 	if (ext4_fc_is_ineligible(inode->i_sb))
@@ -1188,13 +1200,880 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
 	trace_ext4_fc_stats(sb);
 }
 
+/* Ext4 Replay Path Routines */
+
+/* Get length of a particular tlv */
+static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
+{
+	return le16_to_cpu(tl->fc_len);
+}
+
+/* Get a pointer to "value" of a tlv */
+static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
+{
+	return (u8 *)tl + sizeof(*tl);
+}
+
+/* Helper struct for dentry replay routines */
+struct dentry_info_args {
+	int parent_ino, dname_len, ino, inode_len;
+	char *dname;
+};
+
+static inline void tl_to_darg(struct dentry_info_args *darg,
+				struct  ext4_fc_tl *tl)
+{
+	struct ext4_fc_dentry_info *fcd;
+
+	fcd = (struct ext4_fc_dentry_info *)ext4_fc_tag_val(tl);
+
+	darg->parent_ino = le32_to_cpu(fcd->fc_parent_ino);
+	darg->ino = le32_to_cpu(fcd->fc_ino);
+	darg->dname = fcd->fc_dname;
+	darg->dname_len = ext4_fc_tag_len(tl) -
+			sizeof(struct ext4_fc_dentry_info);
+}
+
+/* Unlink replay function */
+static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode, *old_parent;
+	struct qstr entry;
+	struct dentry_info_args darg;
+	int ret = 0;
+
+	tl_to_darg(&darg, tl);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	entry.name = darg.dname;
+	entry.len = darg.dname_len;
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode %d not found", darg.ino);
+		return 0;
+	}
+
+	old_parent = ext4_iget(sb, darg.parent_ino,
+				EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(old_parent)) {
+		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
+		iput(inode);
+		return 0;
+	}
+
+	ret = __ext4_unlink(old_parent, &entry, inode);
+	/* -ENOENT ok coz it might not exist anymore. */
+	if (ret == -ENOENT)
+		ret = 0;
+	iput(old_parent);
+	iput(inode);
+	return ret;
+}
+
+static int ext4_fc_replay_link_internal(struct super_block *sb,
+				struct dentry_info_args *darg,
+				struct inode *inode)
+{
+	struct inode *dir = NULL;
+	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
+	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
+	int ret = 0;
+
+	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
+	if (IS_ERR(dir)) {
+		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
+		dir = NULL;
+		goto out;
+	}
+
+	dentry_dir = d_obtain_alias(dir);
+	if (IS_ERR(dentry_dir)) {
+		jbd_debug(1, "Failed to obtain dentry");
+		dentry_dir = NULL;
+		goto out;
+	}
+
+	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
+	if (!dentry_inode) {
+		jbd_debug(1, "Inode dentry not created.");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = __ext4_link(dir, inode, dentry_inode);
+	/*
+	 * It's possible that link already existed since data blocks
+	 * for the dir in question got persisted before we crashed OR
+	 * we replayed this tag and crashed before the entire replay
+	 * could complete.
+	 */
+	if (ret && ret != -EEXIST) {
+		jbd_debug(1, "Failed to link\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	if (dentry_dir) {
+		d_drop(dentry_dir);
+		dput(dentry_dir);
+	} else if (dir) {
+		iput(dir);
+	}
+	if (dentry_inode) {
+		d_drop(dentry_inode);
+		dput(dentry_inode);
+	}
+
+	return ret;
+}
+
+/* Link replay function */
+static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode;
+	struct dentry_info_args darg;
+	int ret = 0;
+
+	tl_to_darg(&darg, tl);
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		return 0;
+	}
+
+	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+	iput(inode);
+	return ret;
+}
+
+/*
+ * Record all the modified inodes during replay. We use this later to setup
+ * block bitmaps correctly.
+ */
+static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
+{
+	struct ext4_fc_replay_state *state;
+	int i;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_modified_inodes_used; i++)
+		if (state->fc_modified_inodes[i] == ino)
+			return 0;
+	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
+		state->fc_modified_inodes_size +=
+			EXT4_FC_REPLAY_REALLOC_INCREMENT;
+		state->fc_modified_inodes = krealloc(
+					state->fc_modified_inodes, sizeof(int) *
+					state->fc_modified_inodes_size,
+					GFP_KERNEL);
+		if (!state->fc_modified_inodes)
+			return -ENOMEM;
+	}
+	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
+	return 0;
+}
+
+/*
+ * Inode replay function
+ */
+static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct ext4_fc_inode *fc_inode;
+	struct ext4_inode *raw_inode;
+	struct ext4_inode *raw_fc_inode;
+	struct inode *inode = NULL;
+	struct ext4_iloc iloc;
+	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
+	struct ext4_extent_header *eh;
+
+	fc_inode = (struct ext4_fc_inode *)ext4_fc_tag_val(tl);
+
+	ino = le32_to_cpu(fc_inode->fc_ino);
+	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
+
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+	if (!IS_ERR_OR_NULL(inode)) {
+		ext4_ext_clear_bb(inode);
+		iput(inode);
+	}
+
+	ext4_fc_record_modified_inode(sb, ino);
+
+	raw_fc_inode = (struct ext4_inode *)fc_inode->fc_raw_inode;
+	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
+	if (ret)
+		goto out;
+
+	inode_len = ext4_fc_tag_len(tl) - sizeof(struct ext4_fc_inode);
+	raw_inode = ext4_raw_inode(&iloc);
+
+	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
+	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
+		inode_len - offsetof(struct ext4_inode, i_generation));
+	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
+		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
+		if (eh->eh_magic != EXT4_EXT_MAGIC) {
+			memset(eh, 0, sizeof(*eh));
+			eh->eh_magic = EXT4_EXT_MAGIC;
+			eh->eh_max = cpu_to_le16(
+				(sizeof(raw_inode->i_block) -
+				 sizeof(struct ext4_extent_header))
+				 / sizeof(struct ext4_extent));
+		}
+	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
+		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
+			sizeof(raw_inode->i_block));
+	}
+
+	/* Immediately update the inode on disk. */
+	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+	if (ret)
+		goto out;
+	ret = sync_dirty_buffer(iloc.bh);
+	if (ret)
+		goto out;
+	ret = ext4_mark_inode_used(sb, ino);
+	if (ret)
+		goto out;
+
+	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
+	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		return -EFSCORRUPTED;
+	}
+
+	/*
+	 * Our allocator could have made different decisions than before
+	 * crashing. This should be fixed but until then, we calculate
+	 * the number of blocks the inode.
+	 */
+	ext4_ext_replay_set_iblocks(inode);
+
+	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
+	ext4_reset_inode_seed(inode);
+
+	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
+	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
+	sync_dirty_buffer(iloc.bh);
+	brelse(iloc.bh);
+out:
+	iput(inode);
+	if (!ret)
+		blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
+
+	return 0;
+}
+
+/*
+ * Dentry create replay function.
+ *
+ * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
+ * inode for which we are trying to create a dentry here, should already have
+ * been replayed before we start here.
+ */
+static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *dir = NULL;
+	struct dentry_info_args darg;
+
+	tl_to_darg(&darg, tl);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
+			darg.parent_ino, darg.dname_len);
+
+	/* This takes care of update group descriptor and other metadata */
+	ret = ext4_mark_inode_used(sb, darg.ino);
+	if (ret)
+		goto out;
+
+	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "inode %d not found.", darg.ino);
+		inode = NULL;
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		/*
+		 * If we are creating a directory, we need to make sure that the
+		 * dot and dot dot dirents are setup properly.
+		 */
+		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
+		if (IS_ERR_OR_NULL(dir)) {
+			jbd_debug(1, "Dir %d not found.", darg.ino);
+			goto out;
+		}
+		ret = ext4_init_new_dir(NULL, dir, inode);
+		iput(dir);
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+	}
+	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
+	if (ret)
+		goto out;
+	set_nlink(inode, 1);
+	ext4_mark_inode_dirty(NULL, inode);
+out:
+	if (inode)
+		iput(inode);
+	return ret;
+}
+
+/*
+ * Record physical disk regions which are in use as per fast commit area. Our
+ * simple replay phase allocator excludes these regions from allocation.
+ */
+static int ext4_fc_record_regions(struct super_block *sb, int ino,
+		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
+{
+	struct ext4_fc_replay_state *state;
+	struct ext4_fc_alloc_region *region;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	if (state->fc_regions_used == state->fc_regions_size) {
+		state->fc_regions_size +=
+			EXT4_FC_REPLAY_REALLOC_INCREMENT;
+		state->fc_regions = krealloc(
+					state->fc_regions,
+					state->fc_regions_size *
+					sizeof(struct ext4_fc_alloc_region),
+					GFP_KERNEL);
+		if (!state->fc_regions)
+			return -ENOMEM;
+	}
+	region = &state->fc_regions[state->fc_regions_used++];
+	region->ino = ino;
+	region->lblk = lblk;
+	region->pblk = pblk;
+	region->len = len;
+
+	return 0;
+}
+
+/* Replay add range tag */
+static int ext4_fc_replay_add_range(struct super_block *sb,
+				struct ext4_fc_tl *tl)
+{
+	struct ext4_fc_add_range *fc_add_ex;
+	struct ext4_extent newex, *ex;
+	struct inode *inode;
+	ext4_lblk_t start, cur;
+	int remaining, len;
+	ext4_fsblk_t start_pblk;
+	struct ext4_map_blocks map;
+	struct ext4_ext_path *path = NULL;
+	int ret;
+
+	fc_add_ex = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+	ex = (struct ext4_extent *)&fc_add_ex->fc_ex;
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
+		le32_to_cpu(fc_add_ex->fc_ino), le32_to_cpu(ex->ee_block),
+		ext4_ext_get_actual_len(ex));
+
+	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino),
+				EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode not found.");
+		return 0;
+	}
+
+	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+	start = le32_to_cpu(ex->ee_block);
+	start_pblk = ext4_ext_pblock(ex);
+	len = ext4_ext_get_actual_len(ex);
+
+	cur = start;
+	remaining = len;
+	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
+		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
+		  inode->i_ino);
+
+	while (remaining > 0) {
+		map.m_lblk = cur;
+		map.m_len = remaining;
+		map.m_pblk = 0;
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+		if (ret < 0) {
+			iput(inode);
+			return 0;
+		}
+
+		if (ret == 0) {
+			/* Range is not mapped */
+			path = ext4_find_extent(inode, cur, NULL, 0);
+			if (!path)
+				continue;
+			memset(&newex, 0, sizeof(newex));
+			newex.ee_block = cpu_to_le32(cur);
+			ext4_ext_store_pblock(
+				&newex, start_pblk + cur - start);
+			newex.ee_len = cpu_to_le16(map.m_len);
+			if (ext4_ext_is_unwritten(ex))
+				ext4_ext_mark_unwritten(&newex);
+			down_write(&EXT4_I(inode)->i_data_sem);
+			ret = ext4_ext_insert_extent(
+				NULL, inode, &path, &newex, 0);
+			up_write((&EXT4_I(inode)->i_data_sem));
+			ext4_ext_drop_refs(path);
+			kfree(path);
+			if (ret) {
+				iput(inode);
+				return 0;
+			}
+			goto next;
+		}
+
+		if (start_pblk + cur - start != map.m_pblk) {
+			/*
+			 * Logical to physical mapping changed. This can happen
+			 * if this range was removed and then reallocated to
+			 * map to new physical blocks during a fast commit.
+			 */
+			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+					ext4_ext_is_unwritten(ex),
+					start_pblk + cur - start);
+			if (ret) {
+				iput(inode);
+				return 0;
+			}
+			/*
+			 * Mark the old blocks as free since they aren't used
+			 * anymore. We maintain an array of all the modified
+			 * inodes. In case these blocks are still used at either
+			 * a different logical range in the same inode or in
+			 * some different inode, we will mark them as allocated
+			 * at the end of the FC replay using our array of
+			 * modified inodes.
+			 */
+			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+			goto next;
+		}
+
+		/* Range is mapped and needs a state change */
+		jbd_debug(1, "Converting from %d to %d %lld",
+				map.m_flags & EXT4_MAP_UNWRITTEN,
+			ext4_ext_is_unwritten(ex), map.m_pblk);
+		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
+					ext4_ext_is_unwritten(ex), map.m_pblk);
+		if (ret) {
+			iput(inode);
+			return 0;
+		}
+		/*
+		 * We may have split the extent tree while toggling the state.
+		 * Try to shrink the extent tree now.
+		 */
+		ext4_ext_replay_shrink_inode(inode, start + len);
+next:
+		cur += map.m_len;
+		remaining -= map.m_len;
+	}
+	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
+					sb->s_blocksize_bits);
+	iput(inode);
+	return 0;
+}
+
+/* Replay DEL_RANGE tag */
+static int
+ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
+{
+	struct inode *inode;
+	struct ext4_fc_del_range *lrange;
+	struct ext4_map_blocks map;
+	ext4_lblk_t cur, remaining;
+	int ret;
+
+	lrange = (struct ext4_fc_del_range *)ext4_fc_tag_val(tl);
+	cur = le32_to_cpu(lrange->fc_lblk);
+	remaining = le32_to_cpu(lrange->fc_len);
+
+	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
+		le32_to_cpu(lrange->fc_ino), cur, remaining);
+
+	inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL);
+	if (IS_ERR_OR_NULL(inode)) {
+		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino));
+		return 0;
+	}
+
+	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
+
+	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
+			inode->i_ino, le32_to_cpu(lrange->fc_lblk),
+			le32_to_cpu(lrange->fc_len));
+	while (remaining > 0) {
+		map.m_lblk = cur;
+		map.m_len = remaining;
+
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret < 0) {
+			iput(inode);
+			return 0;
+		}
+		if (ret > 0) {
+			remaining -= ret;
+			cur += ret;
+			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
+		} else {
+			remaining -= map.m_len;
+			cur += map.m_len;
+		}
+	}
+
+	ret = ext4_punch_hole(inode,
+		le32_to_cpu(lrange->fc_lblk) << sb->s_blocksize_bits,
+		le32_to_cpu(lrange->fc_len) <<  sb->s_blocksize_bits);
+	if (ret)
+		jbd_debug(1, "ext4_punch_hole returned %d", ret);
+	ext4_ext_replay_shrink_inode(inode,
+		i_size_read(inode) >> sb->s_blocksize_bits);
+	ext4_mark_inode_dirty(NULL, inode);
+	iput(inode);
+
+	return 0;
+}
+
+static inline const char *tag2str(u16 tag)
+{
+	switch (tag) {
+	case EXT4_FC_TAG_LINK:
+		return "TAG_ADD_ENTRY";
+	case EXT4_FC_TAG_UNLINK:
+		return "TAG_DEL_ENTRY";
+	case EXT4_FC_TAG_ADD_RANGE:
+		return "TAG_ADD_RANGE";
+	case EXT4_FC_TAG_CREAT:
+		return "TAG_CREAT_DENTRY";
+	case EXT4_FC_TAG_DEL_RANGE:
+		return "TAG_DEL_RANGE";
+	case EXT4_FC_TAG_INODE:
+		return "TAG_INODE";
+	case EXT4_FC_TAG_PAD:
+		return "TAG_PAD";
+	case EXT4_FC_TAG_TAIL:
+		return "TAG_TAIL";
+	case EXT4_FC_TAG_HEAD:
+		return "TAG_HEAD";
+	default:
+		return "TAG_ERROR";
+	}
+}
+
+static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
+{
+	struct ext4_fc_replay_state *state;
+	struct inode *inode;
+	struct ext4_ext_path *path = NULL;
+	struct ext4_map_blocks map;
+	int i, ret, j;
+	ext4_lblk_t cur, end;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_modified_inodes_used; i++) {
+		inode = ext4_iget(sb, state->fc_modified_inodes[i],
+			EXT4_IGET_NORMAL);
+		if (IS_ERR_OR_NULL(inode)) {
+			jbd_debug(1, "Inode %d not found.",
+				state->fc_modified_inodes[i]);
+			continue;
+		}
+		cur = 0;
+		end = EXT_MAX_BLOCKS;
+		while (cur < end) {
+			map.m_lblk = cur;
+			map.m_len = end - cur;
+
+			ret = ext4_map_blocks(NULL, inode, &map, 0);
+			if (ret < 0)
+				break;
+
+			if (ret > 0) {
+				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
+				if (!IS_ERR_OR_NULL(path)) {
+					for (j = 0; j < path->p_depth; j++)
+						ext4_mb_mark_bb(inode->i_sb,
+							path[j].p_block, 1, 1);
+					ext4_ext_drop_refs(path);
+					kfree(path);
+				}
+				cur += ret;
+				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
+							map.m_len, 1);
+			} else {
+				cur = cur + (map.m_len ? map.m_len : 1);
+			}
+		}
+		iput(inode);
+	}
+}
+
+/*
+ * Check if block is in excluded regions for block allocation. The simple
+ * allocator that runs during replay phase is calls this function to see
+ * if it is okay to use a block.
+ */
+bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
+{
+	int i;
+	struct ext4_fc_replay_state *state;
+
+	state = &EXT4_SB(sb)->s_fc_replay_state;
+	for (i = 0; i < state->fc_regions_valid; i++) {
+		if (state->fc_regions[i].ino == 0 ||
+			state->fc_regions[i].len == 0)
+			continue;
+		if (blk >= state->fc_regions[i].pblk &&
+		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
+			return true;
+	}
+	return false;
+}
+
+/* Cleanup function called after replay */
+void ext4_fc_replay_cleanup(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
+	kfree(sbi->s_fc_replay_state.fc_regions);
+	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
+}
+
+/*
+ * Recovery Scan phase handler
+ *
+ * This function is called during the scan phase and is responsible
+ * for doing following things:
+ * - Make sure the fast commit area has valid tags for replay
+ * - Count number of tags that need to be replayed by the replay handler
+ * - Verify CRC
+ * - Create a list of excluded blocks for allocation during replay phase
+ *
+ * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
+ * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
+ * to indicate that scan has finished and JBD2 can now start replay phase.
+ * It returns a negative error to indicate that there was an error. At the end
+ * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
+ * to indicate the number of tags that need to replayed during the replay phase.
+ */
+static int ext4_fc_replay_scan(journal_t *journal,
+				struct buffer_head *bh, int off,
+				tid_t expected_tid)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_replay_state *state;
+	int ret = JBD2_FC_REPLAY_CONTINUE;
+	struct ext4_fc_add_range *ext;
+	struct ext4_fc_tl *tl;
+	struct ext4_fc_tail *tail;
+	__u8 *start, *end;
+	struct ext4_fc_head *head;
+	struct ext4_extent *ex;
+
+	state = &sbi->s_fc_replay_state;
+
+	start = (u8 *)bh->b_data;
+	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+	if (state->fc_replay_expected_off == 0) {
+		state->fc_cur_tag = 0;
+		state->fc_replay_num_tags = 0;
+		state->fc_crc = 0;
+		state->fc_regions = NULL;
+		state->fc_regions_valid = state->fc_regions_used =
+			state->fc_regions_size = 0;
+		/* Check if we can stop early */
+		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
+			!= EXT4_FC_TAG_HEAD)
+			return 0;
+	}
+
+	if (off != state->fc_replay_expected_off) {
+		ret = -EFSCORRUPTED;
+		goto out_err;
+	}
+
+	state->fc_replay_expected_off++;
+	fc_for_each_tl(start, end, tl) {
+		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
+			  tag2str(le16_to_cpu(tl->fc_tag)), bh->b_blocknr);
+		switch (le16_to_cpu(tl->fc_tag)) {
+		case EXT4_FC_TAG_ADD_RANGE:
+			ext = (struct ext4_fc_add_range *)ext4_fc_tag_val(tl);
+			ex = (struct ext4_extent *)&ext->fc_ex;
+			ret = ext4_fc_record_regions(sb,
+				le32_to_cpu(ext->fc_ino),
+				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
+				ext4_ext_get_actual_len(ex));
+			if (ret < 0)
+				break;
+			ret = JBD2_FC_REPLAY_CONTINUE;
+			fallthrough;
+		case EXT4_FC_TAG_DEL_RANGE:
+		case EXT4_FC_TAG_LINK:
+		case EXT4_FC_TAG_UNLINK:
+		case EXT4_FC_TAG_CREAT:
+		case EXT4_FC_TAG_INODE:
+		case EXT4_FC_TAG_PAD:
+			state->fc_cur_tag++;
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+					sizeof(*tl) + ext4_fc_tag_len(tl));
+			break;
+		case EXT4_FC_TAG_TAIL:
+			state->fc_cur_tag++;
+			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+						sizeof(*tl) +
+						offsetof(struct ext4_fc_tail,
+						fc_crc));
+			if (le32_to_cpu(tail->fc_tid) == expected_tid &&
+				le32_to_cpu(tail->fc_crc) == state->fc_crc) {
+				state->fc_replay_num_tags = state->fc_cur_tag;
+				state->fc_regions_valid =
+					state->fc_regions_used;
+			} else {
+				ret = state->fc_replay_num_tags ?
+					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
+			}
+			state->fc_crc = 0;
+			break;
+		case EXT4_FC_TAG_HEAD:
+			head = (struct ext4_fc_head *)ext4_fc_tag_val(tl);
+			if (le32_to_cpu(head->fc_features) &
+				~EXT4_FC_SUPPORTED_FEATURES) {
+				ret = -EOPNOTSUPP;
+				break;
+			}
+			if (le32_to_cpu(head->fc_tid) != expected_tid) {
+				ret = JBD2_FC_REPLAY_STOP;
+				break;
+			}
+			state->fc_cur_tag++;
+			state->fc_crc = ext4_chksum(sbi, state->fc_crc, tl,
+					sizeof(*tl) + ext4_fc_tag_len(tl));
+			break;
+		default:
+			ret = state->fc_replay_num_tags ?
+				JBD2_FC_REPLAY_STOP : -ECANCELED;
+		}
+		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
+			break;
+	}
+
+out_err:
+	trace_ext4_fc_replay_scan(sb, ret, off);
+	return ret;
+}
+
 /*
  * Main recovery path entry point.
+ * The meaning of return codes is similar as above.
  */
 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
 				enum passtype pass, int off, tid_t expected_tid)
 {
-	return 0;
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_tl *tl;
+	__u8 *start, *end;
+	int ret = JBD2_FC_REPLAY_CONTINUE;
+	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
+	struct ext4_fc_tail *tail;
+
+	if (pass == PASS_SCAN) {
+		state->fc_current_pass = PASS_SCAN;
+		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
+	}
+
+	if (state->fc_current_pass != pass) {
+		state->fc_current_pass = pass;
+		sbi->s_mount_state |= EXT4_FC_REPLAY;
+	}
+	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
+		jbd_debug(1, "Replay stops\n");
+		ext4_fc_set_bitmaps_and_counters(sb);
+		return 0;
+	}
+
+#ifdef CONFIG_EXT4_DEBUG
+	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
+		pr_warn("Dropping fc block %d because max_replay set\n", off);
+		return JBD2_FC_REPLAY_STOP;
+	}
+#endif
+
+	start = (u8 *)bh->b_data;
+	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
+
+	fc_for_each_tl(start, end, tl) {
+		if (state->fc_replay_num_tags == 0) {
+			ret = JBD2_FC_REPLAY_STOP;
+			ext4_fc_set_bitmaps_and_counters(sb);
+			break;
+		}
+		jbd_debug(3, "Replay phase, tag:%s\n",
+				tag2str(le16_to_cpu(tl->fc_tag)));
+		state->fc_replay_num_tags--;
+		switch (le16_to_cpu(tl->fc_tag)) {
+		case EXT4_FC_TAG_LINK:
+			ret = ext4_fc_replay_link(sb, tl);
+			break;
+		case EXT4_FC_TAG_UNLINK:
+			ret = ext4_fc_replay_unlink(sb, tl);
+			break;
+		case EXT4_FC_TAG_ADD_RANGE:
+			ret = ext4_fc_replay_add_range(sb, tl);
+			break;
+		case EXT4_FC_TAG_CREAT:
+			ret = ext4_fc_replay_create(sb, tl);
+			break;
+		case EXT4_FC_TAG_DEL_RANGE:
+			ret = ext4_fc_replay_del_range(sb, tl);
+			break;
+		case EXT4_FC_TAG_INODE:
+			ret = ext4_fc_replay_inode(sb, tl);
+			break;
+		case EXT4_FC_TAG_PAD:
+			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
+				ext4_fc_tag_len(tl), 0);
+			break;
+		case EXT4_FC_TAG_TAIL:
+			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
+				ext4_fc_tag_len(tl), 0);
+			tail = (struct ext4_fc_tail *)ext4_fc_tag_val(tl);
+			WARN_ON(le32_to_cpu(tail->fc_tid) != expected_tid);
+			break;
+		case EXT4_FC_TAG_HEAD:
+			break;
+		default:
+			trace_ext4_fc_replay(sb, le16_to_cpu(tl->fc_tag), 0,
+				ext4_fc_tag_len(tl), 0);
+			ret = -ECANCELED;
+			break;
+		}
+		if (ret < 0)
+			break;
+		ret = JBD2_FC_REPLAY_CONTINUE;
+	}
+	return ret;
 }
 
 void ext4_fc_init(struct super_block *sb, journal_t *journal)