Btrfs: Add a write-ahead tree log to optimize synchronous operations

File syncs and directory syncs are optimized by copying the items they
modify into a special (copy-on-write) log tree, so a single fsync no
longer forces a commit of the entire transaction.  There is one log
tree per subvolume, and the btrfs super block points to a tree of log
tree roots.

After a crash, items are replayed out of the log tree and back into the
subvolume during the next mount.  See tree-log.c for all the details.
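
As a rough sketch of the fsync path this enables (simplified from the
file.c changes in this patch; error handling is omitted and the exact
calls are illustrative rather than definitive):

	/* in btrfs_sync_file(), after the dirty data has been written */
	trans = btrfs_start_transaction(root, 1);

	/* copy the items this file dirtied into its subvolume's log tree */
	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
	if (ret) {
		/* the change can't go through the log; fall back
		 * to a full transaction commit
		 */
		ret = btrfs_commit_transaction(trans, root);
	} else {
		/* write out just the log blocks and the log tree roots */
		btrfs_sync_log(trans, root);
		ret = btrfs_end_transaction(trans, root);
	}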

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e63b3b4..646b914 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -496,6 +496,23 @@
 	return ret == 0;
 }
 
+/* simple helper to find an existing extent item for a given start and len */
+int btrfs_lookup_extent(struct btrfs_root *root, struct btrfs_path *path,
+			u64 start, u64 len)
+{
+	int ret;
+	struct btrfs_key key;
+
+	maybe_lock_mutex(root);
+	key.objectid = start;
+	key.offset = len;
+	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
+				0, 0);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 static int noinline lookup_extent_backref(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path, u64 bytenr,
@@ -1409,7 +1426,7 @@
 }
 
 
-static int update_pinned_extents(struct btrfs_root *root,
+int btrfs_update_pinned_extents(struct btrfs_root *root,
 				u64 bytenr, u64 num, int pin)
 {
 	u64 len;
@@ -1492,7 +1509,7 @@
 					    EXTENT_DIRTY);
 		if (ret)
 			break;
-		update_pinned_extents(root, start, end + 1 - start, 0);
+		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
 		set_extent_dirty(free_space_cache, start, end, GFP_NOFS);
 		if (need_resched()) {
@@ -1538,14 +1555,11 @@
 		clear_extent_bits(&info->extent_ins, start, end, EXTENT_LOCKED,
 				  GFP_NOFS);
 
-		eb = btrfs_find_tree_block(extent_root, ins.objectid,
-					   ins.offset);
+		eb = btrfs_find_create_tree_block(extent_root, ins.objectid,
+						  ins.offset);
 
-		if (!btrfs_buffer_uptodate(eb, trans->transid)) {
-			mutex_unlock(&extent_root->fs_info->alloc_mutex);
+		if (!btrfs_buffer_uptodate(eb, trans->transid))
 			btrfs_read_buffer(eb, trans->transid);
-			mutex_lock(&extent_root->fs_info->alloc_mutex);
-		}
 
 		btrfs_tree_lock(eb);
 		level = btrfs_header_level(eb);
@@ -1585,13 +1599,20 @@
 		struct extent_buffer *buf;
 		buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 		if (buf) {
+			/* we can reuse a block if it hasn't been written
+			 * and it is from this transaction.  We can't
+			 * reuse anything from the tree log root because
+			 * it has tiny sub-transactions.
+			 */
 			if (btrfs_buffer_uptodate(buf, 0) &&
 			    btrfs_try_tree_lock(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
 				u64 header_transid =
 					btrfs_header_generation(buf);
-				if (header_transid == transid &&
+				if (btrfs_header_owner(buf) !=
+				    BTRFS_TREE_LOG_OBJECTID &&
+				    header_transid == transid &&
 				    !btrfs_header_flag(buf,
 					       BTRFS_HEADER_FLAG_WRITTEN)) {
 					clean_tree_block(NULL, root, buf);
@@ -1603,7 +1624,7 @@
 			}
 			free_extent_buffer(buf);
 		}
-		update_pinned_extents(root, bytenr, num_bytes, 1);
+		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 	} else {
 		set_extent_bits(&root->fs_info->pending_del,
 				bytenr, bytenr + num_bytes - 1,
@@ -1801,7 +1822,7 @@
 				  GFP_NOFS);
 		if (!test_range_bit(&extent_root->fs_info->extent_ins,
 				    start, end, EXTENT_LOCKED, 0)) {
-			update_pinned_extents(extent_root, start,
-					      end + 1 - start, 1);
+			btrfs_update_pinned_extents(extent_root, start,
+						    end + 1 - start, 1);
 			ret = __free_extent(trans, extent_root,
 					     start, end + 1 - start,
@@ -1919,6 +1940,16 @@
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->last_data_alloc;
 	}
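+	/*
+	 * blocks for the tree log get their own allocation hint so that
+	 * log writes stay clustered together on disk
+	 */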
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		last_ptr = &root->fs_info->last_log_alloc;
+		if (*last_ptr == 0 && root->fs_info->last_alloc) {
+			*last_ptr = root->fs_info->last_alloc + empty_cluster;
+		}
+	}
 
 	if (last_ptr) {
 		if (*last_ptr)
@@ -2268,6 +2299,36 @@
 	maybe_unlock_mutex(root);
 	return ret;
 }
+
+/*
+ * this is used by the tree logging recovery code.  It records that
+ * an extent has been allocated and makes sure to clear the free
+ * space cache bits as well
+ */
+int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+
+	maybe_lock_mutex(root);
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	cache_block_group(root, block_group);
+
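+	/* this extent is in use again; drop it from the free space cache */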
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
 /*
  * finds a free extent and does all the dirty work required for allocation
  * returns the key for the extent through ins, and a tree buffer for
@@ -2350,9 +2411,8 @@
 	return buf;
 }
 
-static int noinline drop_leaf_ref_no_cache(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct extent_buffer *leaf)
+int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root, struct extent_buffer *leaf)
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
@@ -2402,9 +2462,9 @@
 	return 0;
 }
 
-static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
-					 struct btrfs_leaf_ref *ref)
+static int noinline cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_leaf_ref *ref)
 {
 	int i;
 	int ret;
@@ -2512,7 +2572,7 @@
 		    btrfs_header_nritems(cur))
 			break;
 		if (*level == 0) {
-			ret = drop_leaf_ref_no_cache(trans, root, cur);
+			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
@@ -2552,7 +2612,7 @@
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
 			ref = btrfs_lookup_leaf_ref(root, bytenr);
 			if (ref) {
-				ret = drop_leaf_ref(trans, root, ref);
+				ret = cache_drop_leaf_ref(trans, root, ref);
 				BUG_ON(ret);
 				btrfs_remove_leaf_ref(root, ref);
 				btrfs_free_leaf_ref(root, ref);
@@ -3628,6 +3688,9 @@
 	extent_root = root->fs_info->extent_root;
 	block_group_cache = &root->fs_info->block_group_cache;
 
+	/* record that this transaction added a new block group */
+	root->fs_info->last_trans_new_blockgroup = trans->transid;
+
 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
 	BUG_ON(!cache);
 	cache->key.objectid = chunk_offset;