btrfs: disable strict file flushes for renames and truncates Truncates and renames are often used to replace old versions of a file with new versions. Applications often expect this to be an atomic replacement, even if they haven't done anything to make sure the new version is fully on disk. Btrfs has strict flushing in place to make sure that renaming over an old file with a new file will fully flush out the new file before allowing the transaction commit with the rename to complete. This ordering means the commit code needs to be able to lock file pages, and there are a few paths in the filesystem where we will try to end a transaction with the page lock held. It's rare, but these things can deadlock. This patch removes the ordered flushes and switches to a best effort filemap_flush like ext4 uses. It's not perfect, but it should fix the deadlocks. Signed-off-by: Chris Mason <clm@fb.com>

commit: 8d875f95da43c6a8f18f77869f2ef26e9594fecc [log] [tgz]
author: Chris Mason <clm@fb.com> Tue Aug 12 10:47:42 2014 -0700
committer: Chris Mason <clm@fb.com> Fri Aug 15 07:43:42 2014 -0700
tree: 601416f676c0e2291bdbed359092eb284f1c32dc
parent: 27b9a8122ff71a8cadfbffb9c4f0694300464f3b [diff] [blame]
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8ea7610..7309832 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c

@@ -7951,27 +7951,6 @@
 	BUG_ON(ret);
 
 	/*
-	 * setattr is responsible for setting the ordered_data_close flag,
-	 * but that is only tested during the last file release.  That
-	 * could happen well after the next commit, leaving a great big
-	 * window where new writes may get lost if someone chooses to write
-	 * to this file after truncating to zero
-	 *
-	 * The inode doesn't have any dirty data here, and so if we commit
-	 * this is a noop.  If someone immediately starts writing to the inode
-	 * it is very likely we'll catch some of their writes in this
-	 * transaction, and the commit will find this file on the ordered
-	 * data list with good things to send down.
-	 *
-	 * This is a best effort solution, there is still a window where
-	 * using truncate to replace the contents of the file will
-	 * end up with a zero length file after a crash.
-	 */
-	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
-					   &BTRFS_I(inode)->runtime_flags))
-		btrfs_add_ordered_operation(trans, root, inode);
-
-	/*
 	 * So if we truncate and then write and fsync we normally would just
 	 * write the extents that changed, which is a problem if we need to
 	 * first truncate that entire inode.  So set this flag so we write out
@@ -8118,7 +8097,6 @@
 	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
-	INIT_LIST_HEAD(&ei->ordered_operations);
 	RB_CLEAR_NODE(&ei->rb_node);
 
 	return inode;
@@ -8158,17 +8136,6 @@
 	if (!root)
 		goto free;
 
-	/*
-	 * Make sure we're properly removed from the ordered operation
-	 * lists.
-	 */
-	smp_mb();
-	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		spin_lock(&root->fs_info->ordered_root_lock);
-		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_root_lock);
-	}
-
 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 		     &BTRFS_I(inode)->runtime_flags)) {
 		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8350,12 +8317,10 @@
 	ret = 0;
 
 	/*
-	 * we're using rename to replace one file with another.
-	 * and the replacement file is large.  Start IO on it now so
-	 * we don't add too much work to the end of the transaction
+	 * we're using rename to replace one file with another.  Start IO on it
+	 * now so  we don't add too much work to the end of the transaction
 	 */
-	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
-	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
 		filemap_flush(old_inode->i_mapping);
 
 	/* close the racy window with snapshot create/destroy ioctl */
@@ -8403,12 +8368,6 @@
 		 */
 		btrfs_pin_log_trans(root);
 	}
-	/*
-	 * make sure the inode gets flushed if it is replacing
-	 * something.
-	 */
-	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
-		btrfs_add_ordered_operation(trans, root, old_inode);
 
 	inode_inc_iversion(old_dir);
 	inode_inc_iversion(new_dir);
commit	8d875f95da43c6a8f18f77869f2ef26e9594fecc	[log] [tgz]
author	Chris Mason <clm@fb.com>	Tue Aug 12 10:47:42 2014 -0700
committer	Chris Mason <clm@fb.com>	Fri Aug 15 07:43:42 2014 -0700
tree	601416f676c0e2291bdbed359092eb284f1c32dc
parent	27b9a8122ff71a8cadfbffb9c4f0694300464f3b [diff] [blame]