Btrfs: Fix extent_buffer and extent_state leaks

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9427b79..50ef351 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -87,6 +87,7 @@
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	cow->alloc_addr = (unsigned long)__builtin_return_address(0);
 	if (buf->len != root->sectorsize || cow->len != root->sectorsize)
 		WARN_ON(1);
 
@@ -132,6 +133,7 @@
 		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
+	int ret;
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -148,8 +150,10 @@
 	}
 
 	search_start = extent_buffer_blocknr(buf) & ~((u64)65535);
-	return __btrfs_cow_block(trans, root, buf, parent,
+	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
+	(*cow_ret)->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return ret;
 }
 
 static int close_blocks(u64 blocknr, u64 other)
@@ -1013,8 +1017,10 @@
 				if (sret)
 					return sret;
 				b = p->nodes[level];
-				if (!b)
+				if (!b) {
+					btrfs_release_path(NULL, p);
 					goto again;
+				}
 				slot = p->slots[level];
 				BUG_ON(btrfs_header_nritems(b) == 1);
 			}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index aed0861..5262b28 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -303,8 +303,8 @@
 	struct radix_tree_root pinned_radix;
 	struct radix_tree_root block_group_radix;
 	struct radix_tree_root block_group_data_radix;
-	struct radix_tree_root extent_map_radix;
 	struct radix_tree_root extent_ins_radix;
+	struct extent_map_tree free_space_cache;
 	u64 generation;
 	u64 last_trans_committed;
 	struct btrfs_transaction *running_transaction;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8242933..09f4e69 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -46,18 +46,25 @@
 					    u64 blocknr)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	struct extent_buffer *eb;
+	eb = find_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				   blocknr * root->sectorsize,
 				   root->sectorsize, GFP_NOFS);
+	if (eb)
+		eb->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return eb;
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						 u64 blocknr)
 {
 	struct inode *btree_inode = root->fs_info->btree_inode;
-	return alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
+	struct extent_buffer *eb;
+	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->extent_tree,
 				   blocknr * root->sectorsize,
 				   root->sectorsize, GFP_NOFS);
+	eb->alloc_addr = (unsigned long)__builtin_return_address(0);
+	return eb;
 }
 
 struct extent_map *btree_get_extent(struct inode *inode, struct page *page,
@@ -226,6 +233,7 @@
 		return NULL;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->extent_tree,
 				 buf, 1);
+	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	return buf;
 }
 
@@ -426,7 +434,6 @@
 	}
 	init_bit_radix(&fs_info->pinned_radix);
 	init_bit_radix(&fs_info->pending_del_radix);
-	init_bit_radix(&fs_info->extent_map_radix);
 	init_bit_radix(&fs_info->extent_ins_radix);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
 	INIT_RADIX_TREE(&fs_info->block_group_radix, GFP_KERNEL);
@@ -449,6 +456,8 @@
 	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
+	extent_map_tree_init(&fs_info->free_space_cache,
+			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 	fs_info->closing = 0;
 
@@ -594,8 +603,10 @@
 
 	if (fs_info->extent_root->node)
 		free_extent_buffer(fs_info->extent_root->node);
+
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
+
 	free_extent_buffer(fs_info->sb_buffer);
 	truncate_inode_pages(fs_info->btree_inode->i_mapping, 0);
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 089c41c..74cfbee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,21 +34,19 @@
 	int ret;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct radix_tree_root *extent_radix;
+	struct extent_map_tree *free_space_cache;
 	int slot;
-	u64 i;
 	u64 last = 0;
 	u64 hole_size;
 	u64 first_free;
 	int found = 0;
 
 	root = root->fs_info->extent_root;
-	extent_radix = &root->fs_info->extent_map_radix;
+	free_space_cache = &root->fs_info->free_space_cache;
 
 	if (block_group->cached)
 		return 0;
-	if (block_group->data)
-		return 0;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -98,9 +96,11 @@
 				last = first_free;
 				found = 1;
 			}
-			hole_size = key.objectid - last;
-			for (i = 0; i < hole_size; i++) {
-				set_radix_bit(extent_radix, last + i);
+			if (key.objectid > last) {
+				hole_size = key.objectid - last;
+				set_extent_dirty(free_space_cache, last,
+						 last + hole_size - 1,
+						 GFP_NOFS);
 			}
 			last = key.objectid + key.offset;
 		}
@@ -114,9 +114,8 @@
 	    block_group->key.offset > last) {
 		hole_size = block_group->key.objectid +
 			block_group->key.offset - last;
-		for (i = 0; i < hole_size; i++) {
-			set_radix_bit(extent_radix, last + i);
-		}
+		set_extent_dirty(free_space_cache, last,
+				 last + hole_size - 1, GFP_NOFS);
 	}
 	block_group->cached = 1;
 err:
@@ -150,47 +149,33 @@
 	return NULL;
 }
 
-static u64 leaf_range(struct btrfs_root *root)
-{
-	u64 size = BTRFS_LEAF_DATA_SIZE(root);
-	do_div(size, sizeof(struct btrfs_extent_item) +
-		sizeof(struct btrfs_item));
-	return size;
-}
-
 static u64 find_search_start(struct btrfs_root *root,
 			     struct btrfs_block_group_cache **cache_ret,
-			     u64 search_start, int num)
+			     u64 search_start, int num, int data)
 {
-	unsigned long gang[8];
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	u64 last = max(search_start, cache->key.objectid);
+	u64 start = 0;
+	u64 end = 0;
 
-	if (cache->data)
-		goto out;
 again:
 	ret = cache_block_group(root, cache);
 	if (ret)
 		goto out;
 	while(1) {
-		ret = find_first_radix_bit(&root->fs_info->extent_map_radix,
-					   gang, last, ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
+					    last, &start, &end, EXTENT_DIRTY);
+		if (ret)
 			goto out;
-		last = gang[ret-1] + 1;
-		if (num > 1) {
-			if (ret != ARRAY_SIZE(gang)) {
-				goto new_group;
-			}
-			if (gang[ret-1] - gang[0] > leaf_range(root)) {
-				continue;
-			}
-		}
-		if (gang[0] >= cache->key.objectid + cache->key.offset) {
+
+		start = max(last, start);
+		last = end + 1;
+		if (end + 1 - start < num)
+			continue;
+		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
-		}
-		return gang[0];
+		return start;
 	}
 out:
 	return max(cache->last_alloc, search_start);
@@ -202,7 +187,7 @@
 		return max((*cache_ret)->last_alloc, search_start);
 	}
 	cache = btrfs_find_block_group(root, cache,
-				       last + cache->key.offset - 1, 0, 0);
+				       last + cache->key.offset - 1, data, 0);
 	*cache_ret = cache;
 	goto again;
 }
@@ -625,7 +610,6 @@
 	u64 total = num;
 	u64 old_val;
 	u64 block_in_group;
-	u64 i;
 	int ret;
 
 	while(total) {
@@ -644,12 +628,6 @@
 		if (alloc) {
 			if (blocknr > cache->last_alloc)
 				cache->last_alloc = blocknr;
-			if (!cache->data) {
-				for (i = 0; i < num; i++) {
-					clear_radix_bit(&info->extent_map_radix,
-						        blocknr + i);
-				}
-			}
 			if (cache->data != data &&
 			    old_val < (cache->key.offset >> 1)) {
 				cache->data = data;
@@ -677,11 +655,10 @@
 			old_val -= num;
 			if (blocknr < cache->first_free)
 				cache->first_free = blocknr;
-			if (!cache->data && mark_free) {
-				for (i = 0; i < num; i++) {
-					set_radix_bit(&info->extent_map_radix,
-						      blocknr + i);
-				}
+			if (mark_free) {
+				set_extent_dirty(&info->free_space_cache,
+						 blocknr, blocknr + num - 1,
+						 GFP_NOFS);
 			}
 			if (old_val < (cache->key.offset >> 1) &&
 			    old_val + num >= (cache->key.offset >> 1)) {
@@ -732,7 +709,9 @@
 	int ret;
 	int i;
 	struct radix_tree_root *pinned_radix = &root->fs_info->pinned_radix;
-	struct radix_tree_root *extent_radix = &root->fs_info->extent_map_radix;
+	struct extent_map_tree *free_space_cache;
+
+	free_space_cache = &root->fs_info->free_space_cache;
 
 	while(1) {
 		ret = find_first_radix_bit(unpin_radix, gang, 0,
@@ -751,8 +730,11 @@
 				block_group->pinned--;
 				if (gang[i] < block_group->last_alloc)
 					block_group->last_alloc = gang[i];
-				if (!block_group->data)
-					set_radix_bit(extent_radix, gang[i]);
+				if (!block_group->data) {
+					set_extent_dirty(free_space_cache,
+							 gang[i], gang[i],
+							 GFP_NOFS);
+				}
 			}
 		}
 	}
@@ -995,6 +977,7 @@
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
+	u64 cached_search_start = 0;
 
 	WARN_ON(num_blocks < 1);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -1017,11 +1000,9 @@
 	path = btrfs_alloc_path();
 
 check_failed:
-	if (!block_group->data)
-		search_start = find_search_start(root, &block_group,
-						 search_start, total_needed);
-	else if (!full_scan)
-		search_start = max(block_group->last_alloc, search_start);
+	search_start = find_search_start(root, &block_group,
+					 search_start, total_needed, data);
+	cached_search_start = search_start;
 
 	btrfs_init_path(path);
 	ins->objectid = search_start;
@@ -1097,6 +1078,7 @@
 
 		start_found = 1;
 		last_block = key.objectid + key.offset;
+
 		if (!full_scan && last_block >= block_group->key.objectid +
 		    block_group->key.offset) {
 			btrfs_release_path(root, path);
@@ -1138,6 +1120,9 @@
 	}
 	ins->offset = num_blocks;
 	btrfs_free_path(path);
+	if (0 && ins->objectid != cached_search_start) {
+printk("\tcached was %Lu found %Lu\n", cached_search_start, ins->objectid);
+	}
 	return 0;
 
 new_group:
@@ -1209,6 +1194,10 @@
 	btrfs_set_root_used(&root->root_item, root_blocks_used +
 				   num_blocks);
 
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+
 	if (root == extent_root) {
 		BUG_ON(num_blocks != 1);
 		set_radix_bit(&root->fs_info->extent_ins_radix, ins->objectid);
@@ -1227,6 +1216,7 @@
 	BUG_ON(ret);
 	finish_current_insert(trans, extent_root);
 	pending_ret = del_pending_extents(trans, extent_root);
+
 	if (ret) {
 		return ret;
 	}
@@ -1265,6 +1255,7 @@
 		return ERR_PTR(-ENOMEM);
 	}
 	btrfs_set_buffer_uptodate(buf);
+	buf->alloc_addr = (unsigned long)__builtin_return_address(0);
 	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	/*
@@ -1492,6 +1483,7 @@
 	orig_level = level;
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
 		path->nodes[level] = root->node;
+		extent_buffer_get(root->node);
 		path->slots[level] = 0;
 	} else {
 		struct btrfs_key key;
@@ -1524,7 +1516,6 @@
 		if (wret < 0)
 			ret = wret;
 		ret = -EAGAIN;
-		extent_buffer_get(root->node);
 		break;
 	}
 	for (i = 0; i <= orig_level; i++) {
@@ -1562,8 +1553,8 @@
 {
 	int ret;
 	int ret2;
-	unsigned long gang[16];
-	int i;
+	u64 start;
+	u64 end;
 
 	ret = free_block_group_radix(&info->block_group_radix);
 	ret2 = free_block_group_radix(&info->block_group_data_radix);
@@ -1573,13 +1564,12 @@
 		return ret2;
 
 	while(1) {
-		ret = find_first_radix_bit(&info->extent_map_radix,
-					   gang, 0, ARRAY_SIZE(gang));
-		if (!ret)
+		ret = find_first_extent_bit(&info->free_space_cache, 0,
+					    &start, &end, EXTENT_DIRTY);
+		if (ret)
 			break;
-		for (i = 0; i < ret; i++) {
-			clear_radix_bit(&info->extent_map_radix, gang[i]);
-		}
+		clear_extent_dirty(&info->free_space_cache, start,
+				   end, GFP_NOFS);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f150188..5b7dbca 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -19,8 +19,13 @@
 static struct kmem_cache *extent_map_cache;
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
+
 static LIST_HEAD(extent_buffers);
+static LIST_HEAD(buffers);
+static LIST_HEAD(states);
+
 static spinlock_t extent_buffers_lock;
+static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
 static int nr_extent_buffers;
 #define MAX_EXTENT_BUFFER_CACHE 128
 
@@ -48,6 +53,7 @@
 void __exit extent_map_exit(void)
 {
 	struct extent_buffer *eb;
+	struct extent_state *state;
 
 	while (!list_empty(&extent_buffers)) {
 		eb = list_entry(extent_buffers.next,
@@ -55,6 +61,22 @@
 		list_del(&eb->list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, list);
+		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs));
+		list_del(&state->list);
+		kmem_cache_free(extent_state_cache, state);
+
+	}
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next,
+				struct extent_buffer, leak_list);
+		printk("buffer leak start %Lu len %lu return %lX\n", eb->start, eb->len, eb->alloc_addr);
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
+
+
 	if (extent_map_cache)
 		kmem_cache_destroy(extent_map_cache);
 	if (extent_state_cache)
@@ -101,12 +123,19 @@
 struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
+	unsigned long flags;
+
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state || IS_ERR(state))
 		return state;
 	state->state = 0;
 	state->in_tree = 0;
 	state->private = 0;
+
+	spin_lock_irqsave(&state_lock, flags);
+	list_add(&state->list, &states);
+	spin_unlock_irqrestore(&state_lock, flags);
+
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
 	return state;
@@ -115,10 +144,14 @@
 
 void free_extent_state(struct extent_state *state)
 {
+	unsigned long flags;
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
 		WARN_ON(state->in_tree);
+		spin_lock_irqsave(&state_lock, flags);
+		list_del(&state->list);
+		spin_unlock_irqrestore(&state_lock, flags);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -361,10 +394,6 @@
 	state->state |= bits;
 	state->start = start;
 	state->end = end;
-	if ((end & 4095) == 0) {
-		printk("insert state %Lu %Lu strange end\n", start, end);
-		WARN_ON(1);
-	}
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -399,11 +428,7 @@
 	prealloc->end = split - 1;
 	prealloc->state = orig->state;
 	orig->start = split;
-	if ((prealloc->end & 4095) == 0) {
-		printk("insert state %Lu %Lu strange end\n", prealloc->start,
-		       prealloc->end);
-		WARN_ON(1);
-	}
+
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -957,6 +982,7 @@
 			*start_ret = state->start;
 			*end_ret = state->end;
 			ret = 0;
+			break;
 		}
 		node = rb_next(node);
 		if (!node)
@@ -1877,6 +1903,7 @@
 static struct extent_buffer *__alloc_extent_buffer(gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
+
 	spin_lock(&extent_buffers_lock);
 	if (!list_empty(&extent_buffers)) {
 		eb = list_entry(extent_buffers.next, struct extent_buffer,
@@ -1886,15 +1913,26 @@
 		nr_extent_buffers--;
 	}
 	spin_unlock(&extent_buffers_lock);
+
 	if (eb) {
 		memset(eb, 0, sizeof(*eb));
-		return eb;
+	} else {
+		eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	}
-	return kmem_cache_zalloc(extent_buffer_cache, mask);
+	spin_lock(&extent_buffers_lock);
+	list_add(&eb->leak_list, &buffers);
+	spin_unlock(&extent_buffers_lock);
+
+	return eb;
 }
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
+
+	spin_lock(&extent_buffers_lock);
+	list_del_init(&eb->leak_list);
+	spin_unlock(&extent_buffers_lock);
+
 	if (nr_extent_buffers >= MAX_EXTENT_BUFFER_CACHE) {
 		kmem_cache_free(extent_buffer_cache, eb);
 	} else {
@@ -1933,6 +1971,7 @@
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
+	eb->alloc_addr = __builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -1947,6 +1986,7 @@
 			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
+		set_page_extent_mapped(p);
 		if (i == 0)
 			eb->first_page = p;
 		if (!PageUptodate(p))
@@ -1978,6 +2018,7 @@
 	if (!eb || IS_ERR(eb))
 		return NULL;
 
+	eb->alloc_addr = __builtin_return_address(0);
 	eb->start = start;
 	eb->len = len;
 	atomic_set(&eb->refs, 1);
@@ -1992,6 +2033,7 @@
 			eb->start &= ~((u64)PAGE_CACHE_SIZE - 1);
 			goto fail;
 		}
+		set_page_extent_mapped(p);
 		if (i == 0)
 			eb->first_page = p;
 	}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 3b3abf3..d100f7c 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -68,7 +68,9 @@
 	atomic_t refs;
 	int flags;
 	struct list_head list;
+	struct list_head leak_list;
 	struct page *first_page;
+	unsigned long alloc_addr;
 };
 
 typedef struct extent_map *(get_extent_t)(struct inode *inode,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 750f35a..372b61f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -352,7 +352,7 @@
 		return 0;
 
 	trans = btrfs_start_transaction(root, 1);
-	while (1) {
+	while (0) {
 		root->defrag_running = 1;
 		ret = btrfs_defrag_leaves(trans, root, cacheonly);
 		nr = trans->blocks_used;
@@ -394,7 +394,7 @@
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
 			last = root->root_key.objectid + 1;
-			// btrfs_defrag_root(root, 1);
+			btrfs_defrag_root(root, 1);
 		}
 	}
 	// btrfs_defrag_root(info->extent_root, 1);
@@ -462,6 +462,7 @@
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
+		free_extent_buffer(dirty->root->node);
 		kfree(dirty->root);
 		kfree(dirty);
 		mutex_unlock(&tree_root->fs_info->fs_mutex);