Btrfs: Add mount option to turn off data cow

A number of workloads do not require copy on write data or checksumming.
Mount with -o nodatasum to disable checksums, or with -o nodatacow to
disable both copy on write and checksumming.

In nodatacow mode, copy on write is still performed when a given extent
is referenced by a snapshot.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fd8233e..585f279 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -70,7 +70,69 @@
 	memset(p, 0, sizeof(*p));
 }
 
-static int __btrfs_cow_block(struct btrfs_trans_handle *trans,
+/*
+ * Copy tree block 'buf' into a newly allocated block whose header owner is
+ * new_root_objectid, and call btrfs_inc_ref() on 'buf' through a temporary
+ * copy of 'root' that carries the new objectid.
+ *
+ * Returns 0 and stores the new block in *cow_ret on success.  On failure an
+ * error code is returned and *cow_ret is left untouched.
+ */
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid)
+{
+	struct extent_buffer *cow;
+	u32 nritems;
+	int ret = 0;
+	int level;
+	struct btrfs_key first_key;
+	struct btrfs_root new_root;
+
+	memcpy(&new_root, root, sizeof(new_root));
+	new_root.root_key.objectid = new_root_objectid;
+
+	WARN_ON(root->ref_cows && trans->transid !=
+		root->fs_info->running_transaction->transid);
+	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+
+	level = btrfs_header_level(buf);
+	nritems = btrfs_header_nritems(buf);
+	if (nritems) {
+		if (level == 0)
+			btrfs_item_key_to_cpu(buf, &first_key, 0);
+		else
+			btrfs_node_key_to_cpu(buf, &first_key, 0);
+	} else {
+		first_key.objectid = 0;
+	}
+	cow = __btrfs_alloc_free_block(trans, &new_root, buf->len,
+				       new_root_objectid,
+				       trans->transid, first_key.objectid,
+				       level, buf->start, 0);
+	if (IS_ERR(cow))
+		return PTR_ERR(cow);
+
+	copy_extent_buffer(cow, buf, 0, 0, cow->len);
+	btrfs_set_header_bytenr(cow, cow->start);
+	btrfs_set_header_generation(cow, trans->transid);
+	btrfs_set_header_owner(cow, new_root_objectid);
+
+	WARN_ON(btrfs_header_generation(buf) > trans->transid);
+	ret = btrfs_inc_ref(trans, &new_root, buf);
+	if (ret) {
+		/* drop our reference on the copy so the buffer is not leaked */
+		free_extent_buffer(cow);
+		return ret;
+	}
+
+	btrfs_mark_buffer_dirty(cow);
+	*cow_ret = cow;
+	return 0;
+}
+
+int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct extent_buffer *buf,
 			     struct extent_buffer *parent, int parent_slot,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e255ca..b51b021 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -431,6 +431,7 @@
 #define BTRFS_STRING_ITEM_KEY	253
 
 #define BTRFS_MOUNT_NODATASUM		0x1
+#define BTRFS_MOUNT_NODATACOW		0x2
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -915,6 +916,9 @@
 	btrfs_item_offset_nr(leaf, slot)))
 
 /* extent-tree.c */
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+				  struct btrfs_path *count_path,
+				  u64 first_extent);
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_map_tree *copy);
@@ -974,6 +978,10 @@
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
 		    struct extent_buffer **cow_ret);
+int btrfs_copy_root(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root,
+		      struct extent_buffer *buf,
+		      struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0041483..1412d55 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -679,6 +679,123 @@
 	return 0;
 }
 
+/*
+ * Count the roots that reference first_extent and the tree blocks of
+ * count_path, from count_path->nodes[0] upwards until root->node is
+ * reached.  For each extent the BTRFS_EXTENT_REF_KEY items in the extent
+ * tree are scanned; the count is bumped for the first root objectid seen
+ * and for every later reference whose root differs from it.
+ *
+ * Returns 0 when the path cannot be allocated, the tree search fails or no
+ * back references are found for an extent, and stops early with a value
+ * above 1 once a second root is found.  run_delalloc_nocow() only writes
+ * in place when the result is exactly 1.
+ */
+u32 btrfs_count_snapshots_in_path(struct btrfs_root *root,
+				  struct btrfs_path *count_path,
+				  u64 first_extent)
+{
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_path *path;
+	u64 bytenr;
+	u64 found_objectid;
+	u64 root_objectid = 0;
+	u32 total_count = 0;
+	u32 cur_count;
+	u32 refs;
+	u32 nritems;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct extent_buffer *l;
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_ref *ref_item;
+	int level = -1;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return 0;
+again:
+	if (level == -1)
+		bytenr = first_extent;
+	else
+		bytenr = count_path->nodes[level]->start;
+
+	cur_count = 0;
+	key.objectid = bytenr;
+	key.offset = 0;
+
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+	if (ret < 0) {
+		/* never report a single owner when the search itself failed */
+		total_count = 0;
+		goto out;
+	}
+	BUG_ON(ret == 0);
+
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+
+	if (found_key.objectid != bytenr ||
+	    found_key.type != BTRFS_EXTENT_ITEM_KEY) {
+		goto out;
+	}
+
+	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(l, item);
+	while (1) {
+		/* btrfs_next_leaf() may switch leaves, reload l on every pass */
+		l = path->nodes[0];
+		nritems = btrfs_header_nritems(l);
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret == 0)
+				continue;
+			break;
+		}
+		btrfs_item_key_to_cpu(l, &found_key, path->slots[0]);
+		if (found_key.objectid != bytenr)
+			break;
+		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
+			path->slots[0]++;
+			continue;
+		}
+
+		cur_count++;
+		ref_item = btrfs_item_ptr(l, path->slots[0],
+					  struct btrfs_extent_ref);
+		found_objectid = btrfs_ref_root(l, ref_item);
+
+		if (found_objectid != root_objectid)
+			total_count++;
+
+		if (total_count > 1)
+			goto out;
+
+		if (root_objectid == 0)
+			root_objectid = found_objectid;
+
+		path->slots[0]++;
+	}
+	if (cur_count == 0) {
+		total_count = 0;
+		goto out;
+	}
+	if (total_count > 1)
+		goto out;
+	if (level >= 0 && root->node == count_path->nodes[level])
+		goto out;
+	level++;
+	btrfs_release_path(root, path);
+	goto again;
+
+out:
+	btrfs_free_path(path);
+	return total_count;
+
+}
+
 int btrfs_inc_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, u64 owner_objectid)
 {
@@ -1127,9 +1244,6 @@
 	if (!path)
 		return -ENOMEM;
 
-	if (ref_generation && owner_objectid == 0 && root_objectid == 3) {
-//printk("drop backref root %Lu gen %Lu byte %Lu\n", root_objectid, ref_generation, bytenr );
-	}
 	ret = lookup_extent_backref(trans, extent_root, path,
 				    bytenr, root_objectid,
 				    ref_generation,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 11885cb..91f3fc4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -72,21 +72,22 @@
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
-static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+static int cow_file_range(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_key ins;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
-	int ret;
 	u64 blocksize = root->sectorsize;
+	struct btrfs_key ins;
+	int ret;
 
-	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, inode);
 	BUG_ON(!trans);
+	btrfs_set_trans_block_group(trans, inode);
+
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = max(blocksize,  num_bytes);
 	ret = btrfs_drop_extents(trans, root, inode,
 				 start, start + num_bytes, start, &alloc_hint);
 
@@ -106,6 +107,122 @@
 				       ins.offset);
 out:
 	btrfs_end_transaction(trans, root);
+	return ret;
+}
+
+/*
+ * nodatacow version of delalloc processing for [start, end].  Each file
+ * extent covering the range is looked up; bytes backed by a regular extent
+ * for which btrfs_count_snapshots_in_path() returns exactly 1 are skipped,
+ * everything else goes through cow_file_range().
+ *
+ * Returns 0 on success, or the error from the extent lookup or from
+ * cow_file_range().
+ */
+static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
+{
+	u64 extent_start;
+	u64 extent_end;
+	u64 bytenr;
+	u64 cow_end;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	int found_type;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *item;
+	int ret;
+	struct btrfs_key found_key;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+again:
+	ret = btrfs_lookup_file_extent(NULL, root, path,
+				       inode->i_ino, start, 0);
+	if (ret < 0)
+		goto out;
+
+	cow_end = end;
+	if (ret != 0) {
+		if (path->slots[0] == 0)
+			goto not_found;
+		path->slots[0]--;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_file_extent_item);
+
+	/* are we inside the extent that was found? */
+	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	found_type = btrfs_key_type(&found_key);
+	if (found_key.objectid != inode->i_ino ||
+	    found_type != BTRFS_EXTENT_DATA_KEY) {
+		goto not_found;
+	}
+
+	found_type = btrfs_file_extent_type(leaf, item);
+	extent_start = found_key.offset;
+	if (found_type == BTRFS_FILE_EXTENT_REG) {
+		extent_end = extent_start +
+		       btrfs_file_extent_num_bytes(leaf, item);
+
+		if (start < extent_start || start >= extent_end)
+			goto not_found;
+
+		cow_end = min(end, extent_end - 1);
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
+		if (bytenr == 0)
+			goto not_found;
+
+		/*
+		 * NOTE(review): the extent tree lookup below is keyed on
+		 * disk_bytenr plus the file extent offset -- confirm that
+		 * this matches how the extent items are keyed.
+		 */
+		bytenr += btrfs_file_extent_offset(leaf, item);
+		if (btrfs_count_snapshots_in_path(root, path, bytenr) != 1) {
+			goto not_found;
+		}
+
+		start = extent_end;
+	} else {
+		/* inline (or any other non-regular) extent: always cow */
+		goto not_found;
+	}
+loop:
+	if (start > end) {
+		ret = 0;
+		goto out;
+	}
+	btrfs_release_path(root, path);
+	goto again;
+
+not_found:
+	ret = cow_file_range(inode, start, cow_end);
+	if (ret)
+		goto out;
+	start = cow_end + 1;
+	goto loop;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * Process the delalloc range [start, end] under fs_mutex, using the
+ * nodatacow path when the NODATACOW mount option is set and the regular
+ * cow path otherwise.
+ */
+static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	mutex_lock(&root->fs_info->fs_mutex);
+	if (btrfs_test_opt(root, NODATACOW))
+		ret = run_delalloc_nocow(inode, start, end);
+	else
+		ret = cow_file_range(inode, start, end);
 	mutex_unlock(&root->fs_info->fs_mutex);
 	return ret;
 }
@@ -1907,9 +2025,6 @@
 
 	btrfs_cow_one_page(inode, page, PAGE_CACHE_SIZE);
 
-	set_page_extent_mapped(page);
-	set_page_dirty(page);
-
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
 		mark_inode_dirty(inode);
@@ -2078,13 +2193,18 @@
 	key.objectid = objectid;
 	key.offset = 1;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+
 	extent_buffer_get(root->node);
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	free_extent_buffer(tmp);
-	btrfs_set_root_bytenr(&new_root_item, root->node->start);
-	btrfs_set_root_level(&new_root_item, btrfs_header_level(root->node));
+
+	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
+
+	btrfs_set_root_bytenr(&new_root_item, tmp->start);
+	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
+	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
 
@@ -2106,10 +2226,6 @@
 
 	if (ret)
 		goto fail;
-
-	ret = btrfs_inc_root_ref(trans, root, objectid);
-	if (ret)
-		goto fail;
 fail:
 	nr = trans->blocks_used;
 	err = btrfs_commit_transaction(trans, root);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ad4f280..2116728 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -61,12 +61,13 @@
 }
 
 enum {
-	Opt_subvol, Opt_nodatasum, Opt_err,
+	Opt_subvol, Opt_nodatasum, Opt_nodatacow, Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_subvol, "subvol=%s"},
 	{Opt_nodatasum, "nodatasum"},
+	{Opt_nodatacow, "nodatacow"},
 	{Opt_err, NULL}
 };
 
@@ -78,12 +79,24 @@
 	struct btrfs_fs_info *info = NULL;
 	substring_t args[MAX_OPT_ARGS];
+	char *orig;
 
-	if (root)
-		info = root->fs_info;
-
 	if (!options)
 		return 1;
 
+	/*
+	 * strsep changes the string, duplicate it because parse_options
+	 * gets called twice
+	 */
+	options = kstrdup(options, GFP_NOFS);
+	if (!options)
+		return -ENOMEM;
+
+	/* strsep advances 'options' to NULL, keep the start for kfree */
+	orig = options;
+
+	if (root)
+		info = root->fs_info;
+
 	while ((p = strsep (&options, ",")) != NULL) {
 		int token;
 		if (!*p)
@@ -92,17 +105,28 @@
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_subvol:
-			if (subvol_name)
+			if (subvol_name) {
 				*subvol_name = match_strdup(&args[0]);
+			}
 			break;
 		case Opt_nodatasum:
-			if (root)
+			if (info) {
+				printk("btrfs: setting nodatasum\n");
 				btrfs_set_opt(info->mount_opt, NODATASUM);
+			}
+			break;
+		case Opt_nodatacow:
+			if (info) {
+				printk("btrfs: setting nodatacow\n");
+				btrfs_set_opt(info->mount_opt, NODATACOW);
+				btrfs_set_opt(info->mount_opt, NODATASUM);
+			}
 			break;
 		default:
-			return 0;
+			break;
 		}
 	}
+	kfree(orig);
 	return 1;
 }