Btrfs: Do snapshot deletion in smaller chunks.

Before, snapshot deletion was a single atomic unit.  This caused considerable
lock contention and required an unbounded amount of space.  Now,
the drop_progress field in the root item is used to indicate how far along
snapshot deletion is, and to resume where it left off.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19a30b7..aa824e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -659,9 +659,16 @@
 	struct buffer_head *b;
 	struct buffer_head *cow_buf;
 	struct btrfs_node *c;
+	struct btrfs_root_item *root_item = &root->root_item;
 	int slot;
 	int ret;
 	int level;
+	u8 lowest_level = 0;
+
+	if (btrfs_root_refs(root_item) == 0 && root->ref_cows) {
+		lowest_level = root_item->drop_level;
+		WARN_ON(ins_len || cow);
+	}
 
 	WARN_ON(p->nodes[0] != NULL);
 	WARN_ON(!mutex_is_locked(&root->fs_info->fs_mutex));
@@ -718,6 +725,9 @@
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(&c->header) == 1);
 			}
+			/* this is only true while dropping a snapshot */
+			if (level == lowest_level)
+				break;
 			b = read_tree_block(root, btrfs_node_blockptr(c, slot));
 		} else {
 			struct btrfs_leaf *l = (struct btrfs_leaf *)c;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0287bd5..73c2e75 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -333,10 +333,10 @@
 	u64 objectid;
 	u64 last_trans;
 	u32 blocksize;
-	int ref_cows;
 	u32 type;
 	u64 highest_inode;
 	u64 last_inode_alloc;
+	int ref_cows;
 };
 
 /* the lower bits in the key flags defines the item type */
@@ -1073,7 +1073,7 @@
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct btrfs_leaf *leaf);
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct buffer_head *snap);
+			*root);
 /* root-item.c */
 int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_key *key);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5ace2c3..9455974 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1561,12 +1561,21 @@
 	int i;
 	int slot;
 	int ret;
+	struct btrfs_root_item *root_item = &root->root_item;
+
 	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
 		slot = path->slots[i];
 		if (slot < btrfs_header_nritems(
 		    btrfs_buffer_header(path->nodes[i])) - 1) {
+			struct btrfs_node *node;
+			node = btrfs_buffer_node(path->nodes[i]);
 			path->slots[i]++;
 			*level = i;
+			WARN_ON(*level == 0);
+			memcpy(&root_item->drop_progress,
+			       &node->ptrs[path->slots[i]].key,
+			       sizeof(root_item->drop_progress));
+			root_item->drop_level = i;
 			return 0;
 		} else {
 			ret = btrfs_free_extent(trans, root,
@@ -1587,7 +1596,7 @@
  * decremented.
  */
 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
-			*root, struct buffer_head *snap)
+			*root)
 {
 	int ret = 0;
 	int wret;
@@ -1595,14 +1604,33 @@
 	struct btrfs_path *path;
 	int i;
 	int orig_level;
+	int num_walks = 0;
+	struct btrfs_root_item *root_item = &root->root_item;
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	level = btrfs_header_level(btrfs_buffer_header(snap));
+	level = btrfs_header_level(btrfs_buffer_header(root->node));
 	orig_level = level;
-	path->nodes[level] = snap;
-	path->slots[level] = 0;
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		path->nodes[level] = root->node;
+		path->slots[level] = 0;
+	} else {
+		struct btrfs_key key;
+		struct btrfs_disk_key *found_key;
+		struct btrfs_node *node;
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		if (ret < 0) {
+			ret = wret;
+			goto out;
+		}
+		level = root_item->drop_level;
+		node = btrfs_buffer_node(path->nodes[level]);
+		found_key = &node->ptrs[path->slots[level]].key;
+		WARN_ON(memcmp(found_key, &root_item->drop_progress,
+			       sizeof(*found_key)));
+	}
 	while(1) {
 		wret = walk_down_tree(trans, root, path, &level);
 		if (wret > 0)
@@ -1615,12 +1643,21 @@
 			break;
 		if (wret < 0)
 			ret = wret;
+		num_walks++;
+		if (num_walks > 10) {
+			struct btrfs_key key;
+			btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+			ret = -EAGAIN;
+			get_bh(root->node);
+			break;
+		}
 	}
 	for (i = 0; i <= orig_level; i++) {
 		if (path->nodes[i]) {
 			btrfs_block_release(root, path->nodes[i]);
 		}
 	}
+out:
 	btrfs_free_path(path);
 	return ret;
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0f494fe..4986264 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -213,10 +213,7 @@
 
 struct dirty_root {
 	struct list_head list;
-	struct btrfs_key snap_key;
-	struct buffer_head *commit_root;
 	struct btrfs_root *root;
-	int free_on_drop;
 };
 
 int btrfs_add_dead_root(struct btrfs_root *root, struct list_head *dead_list)
@@ -226,10 +223,7 @@
 	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 	if (!dirty)
 		return -ENOMEM;
-	memcpy(&dirty->snap_key, &root->root_key, sizeof(root->root_key));
-	dirty->commit_root = root->node;
 	dirty->root = root;
-	dirty->free_on_drop = 1;
 	list_add(&dirty->list, dead_list);
 	return 0;
 }
@@ -241,7 +235,6 @@
 	struct dirty_root *dirty;
 	struct btrfs_root *gang[8];
 	struct btrfs_root *root;
-	struct btrfs_root_item tmp_item;
 	int i;
 	int ret;
 	int err = 0;
@@ -267,13 +260,16 @@
 			}
 			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
 			BUG_ON(!dirty);
-			memcpy(&dirty->snap_key, &root->root_key,
-			       sizeof(root->root_key));
-			dirty->commit_root = root->commit_root;
+			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
+			BUG_ON(!dirty->root);
+
+			memset(&root->root_item.drop_progress, 0,
+			       sizeof(struct btrfs_disk_key));
+			root->root_item.drop_level = 0;
+
+			memcpy(dirty->root, root, sizeof(*root));
+			dirty->root->node = root->commit_root;
 			root->commit_root = NULL;
-			dirty->root = root;
-			dirty->free_on_drop = 0;
-			memcpy(&tmp_item, &root->root_item, sizeof(tmp_item));
 
 			root->root_key.offset = root->fs_info->generation;
 			btrfs_set_root_blocknr(&root->root_item,
@@ -283,17 +279,21 @@
 						&root->root_item);
 			if (err)
 				break;
-			refs = btrfs_root_refs(&tmp_item);
-			btrfs_set_root_refs(&tmp_item, refs - 1);
+
+			refs = btrfs_root_refs(&dirty->root->root_item);
+			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
 			err = btrfs_update_root(trans, root->fs_info->tree_root,
-						&dirty->snap_key,
-						&tmp_item);
+						&dirty->root->root_key,
+						&dirty->root->root_item);
 
 			BUG_ON(err);
-			if (refs == 1)
+			if (refs == 1) {
 				list_add(&dirty->list, list);
-			else
+			} else {
+				WARN_ON(1);
+				kfree(dirty->root);
 				kfree(dirty);
+			}
 		}
 	}
 	return err;
@@ -305,23 +305,36 @@
 	struct dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
+	int err;
+
 	while(!list_empty(list)) {
 		mutex_lock(&tree_root->fs_info->fs_mutex);
 		dirty = list_entry(list->next, struct dirty_root, list);
 		list_del_init(&dirty->list);
 
-		trans = btrfs_start_transaction(tree_root, 1);
-		ret = btrfs_drop_snapshot(trans, dirty->root,
-					  dirty->commit_root);
+		while(1) {
+			trans = btrfs_start_transaction(tree_root, 1);
+			ret = btrfs_drop_snapshot(trans, dirty->root);
+			if (ret != -EAGAIN) {
+				break;
+			}
+			err = btrfs_update_root(trans,
+					tree_root,
+					&dirty->root->root_key,
+					&dirty->root->root_item);
+			if (err)
+				ret = err;
+			ret = btrfs_end_transaction(trans, tree_root);
+			BUG_ON(ret);
+		}
 		BUG_ON(ret);
-		ret = btrfs_del_root(trans, tree_root, &dirty->snap_key);
+		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
 		if (ret)
 			break;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		if (dirty->free_on_drop)
-			kfree(dirty->root);
+		kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);
 		btrfs_btree_balance_dirty(tree_root);