Blame - fs/btrfs/tree-log.c - SHIFTPHONES/kernel/common

blob: 08469ec05850602d16235ccaa66e470d025ba9ac [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	26	#include "tree-log.h"
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	27
				/*
				 * Values for the inode_only argument of the inode logging code:
				 *
				 * LOG_INODE_ALL    - log everything
				 * LOG_INODE_EXISTS - log just enough to recreate the inode
				 *                    during log replay
				 */
				#define LOG_INODE_ALL 0
				#define LOG_INODE_EXISTS 1

				/*
				 * Stages for the tree walking.
				 *
				 * LOG_WALK_PIN_ONLY      - only pin down the blocks we find
				 * LOG_WALK_REPLAY_INODES - make sure that all the inodes we find
				 *                          in the log are created in the subvolume
				 * LOG_WALK_REPLAY_ALL    - deal with directories, links, extents
				 *                          and all the other fun semantics
				 */
				#define LOG_WALK_PIN_ONLY 0
				#define LOG_WALK_REPLAY_INODES 1
				#define LOG_WALK_REPLAY_ALL 2

				/*
				 * Forward declaration.  inode_only takes one of the LOG_INODE_*
				 * values above.  The root and inode are passed by pointer, like
				 * every other helper in this file.
				 */
				static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
							     struct btrfs_root *root, struct inode *inode,
							     int inode_only);
				53
				54	/*
				55	* tree logging is a special write ahead log used to make sure that
				56	* fsyncs and O_SYNCs can happen without doing full tree commits.
				57	*
				58	* Full tree commits are expensive because they require commonly
				59	* modified blocks to be recowed, creating many dirty pages in the
				60	* extent tree and 4x-6x higher write load than ext3.
				61	*
				62	* Instead of doing a tree commit on every fsync, we use the
				63	* key ranges and transaction ids to find items for a given file or directory
				64	* that have changed in this transaction. Those items are copied into
				65	* a special tree (one per subvolume root), that tree is written to disk
				66	* and then the fsync is considered complete.
				67	*
				68	* After a crash, items are copied out of the log-tree back into the
				69	* subvolume tree. Any file data extents found are recorded in the extent
				70	* allocation tree, and the log-tree freed.
				71	*
				72	* The log tree is read three times: once to pin down all the extents it is
				73	* using in ram, once to create all the inodes logged in the tree
				74	* and once to do all the other items.
				75	*/
				76
				77	/*
				78	* btrfs_add_log_tree adds a new per-subvolume log tree into the
				79	* tree of log tree roots. This must be called with a tree log transaction
				80	* running (see start_log_trans).
				81	*/
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	82	static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	83	struct btrfs_root *root)
				84	{
				85	struct btrfs_key key;
				86	struct btrfs_root_item root_item;
				87	struct btrfs_inode_item *inode_item;
				88	struct extent_buffer *leaf;
				89	struct btrfs_root *new_root = root;
				90	int ret;
				91	u64 objectid = root->root_key.objectid;
				92
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	93	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	94	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	95	trans->transid, 0, 0, 0);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	96	if (IS_ERR(leaf)) {
				97	ret = PTR_ERR(leaf);
				98	return ret;
				99	}
				100
				101	btrfs_set_header_nritems(leaf, 0);
				102	btrfs_set_header_level(leaf, 0);
				103	btrfs_set_header_bytenr(leaf, leaf->start);
				104	btrfs_set_header_generation(leaf, trans->transid);
				105	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				106
				107	write_extent_buffer(leaf, root->fs_info->fsid,
				108	(unsigned long)btrfs_header_fsid(leaf),
				109	BTRFS_FSID_SIZE);
				110	btrfs_mark_buffer_dirty(leaf);
				111
				112	inode_item = &root_item.inode;
				113	memset(inode_item, 0, sizeof(*inode_item));
				114	inode_item->generation = cpu_to_le64(1);
				115	inode_item->size = cpu_to_le64(3);
				116	inode_item->nlink = cpu_to_le32(1);
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	117	inode_item->nbytes = cpu_to_le64(root->leafsize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	118	inode_item->mode = cpu_to_le32(S_IFDIR \| 0755);
				119
				120	btrfs_set_root_bytenr(&root_item, leaf->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	121	btrfs_set_root_generation(&root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	122	btrfs_set_root_level(&root_item, 0);
				123	btrfs_set_root_refs(&root_item, 0);
				124	btrfs_set_root_used(&root_item, 0);
				125
				126	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
				127	root_item.drop_level = 0;
				128
				129	btrfs_tree_unlock(leaf);
				130	free_extent_buffer(leaf);
				131	leaf = NULL;
				132
				133	btrfs_set_root_dirid(&root_item, 0);
				134
				135	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				136	key.offset = objectid;
				137	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				138	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
				139	&root_item);
				140	if (ret)
				141	goto fail;
				142
				143	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
				144	&key);
				145	BUG_ON(!new_root);
				146
				147	WARN_ON(root->log_root);
				148	root->log_root = new_root;
				149
				150	/*
				151	* log trees do not get reference counted because they go away
				152	* before a real commit is actually done. They do store pointers
				153	* to file data extents, and those reference counts still get
				154	* updated (along with back refs to the log tree).
				155	*/
				156	new_root->ref_cows = 0;
				157	new_root->last_trans = trans->transid;
				158	fail:
				159	return ret;
				160	}
				161
				162	/*
				163	* start a sub transaction and setup the log tree
				164	* this increments the log tree writer count to make the people
				165	* syncing the tree wait for us to finish
				166	*/
				167	static int start_log_trans(struct btrfs_trans_handle *trans,
				168	struct btrfs_root *root)
				169	{
				170	int ret;
				171	mutex_lock(&root->fs_info->tree_log_mutex);
				172	if (!root->fs_info->log_root_tree) {
				173	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				174	BUG_ON(ret);
				175	}
				176	if (!root->log_root) {
				177	ret = btrfs_add_log_tree(trans, root);
				178	BUG_ON(ret);
				179	}
				180	atomic_inc(&root->fs_info->tree_log_writers);
				181	root->fs_info->tree_log_batch++;
				182	mutex_unlock(&root->fs_info->tree_log_mutex);
				183	return 0;
				184	}
				185
				186	/*
				187	* returns 0 if there was a log transaction running and we were able
				188	* to join, or returns -ENOENT if there were not transactions
				189	* in progress
				190	*/
				191	static int join_running_log_trans(struct btrfs_root *root)
				192	{
				193	int ret = -ENOENT;
				194
				195	smp_mb();
				196	if (!root->log_root)
				197	return -ENOENT;
				198
				199	mutex_lock(&root->fs_info->tree_log_mutex);
				200	if (root->log_root) {
				201	ret = 0;
				202	atomic_inc(&root->fs_info->tree_log_writers);
				203	root->fs_info->tree_log_batch++;
				204	}
				205	mutex_unlock(&root->fs_info->tree_log_mutex);
				206	return ret;
				207	}
				208
				209	/*
				210	* indicate we're done making changes to the log tree
				211	* and wake up anyone waiting to do a sync
				212	*/
				213	static int end_log_trans(struct btrfs_root *root)
				214	{
				215	atomic_dec(&root->fs_info->tree_log_writers);
				216	smp_mb();
				217	if (waitqueue_active(&root->fs_info->tree_log_wait))
				218	wake_up(&root->fs_info->tree_log_wait);
				219	return 0;
				220	}
				221
				222
				223	/*
				224	* the walk control struct is used to pass state down the chain when
				225	* processing the log tree. The stage field tells us which part
				226	* of the log tree processing we are currently doing. The others
				227	* are state fields used for that specific part
				228	*/
				229	struct walk_control {
				230	/* should we free the extent on disk when done? This is used
				231	* at transaction commit time while freeing a log tree
				232	*/
				233	int free;
				234
				235	/* should we write out the extent buffer? This is used
				236	* while flushing the log tree to disk during a sync
				237	*/
				238	int write;
				239
				240	/* should we wait for the extent buffer io to finish? Also used
				241	* while flushing the log tree to disk for a sync
				242	*/
				243	int wait;
				244
				245	/* pin only walk, we record which extents on disk belong to the
				246	* log trees
				247	*/
				248	int pin;
				249
				250	/* what stage of the replay code we're currently in */
				251	int stage;
				252
				253	/* the root we are currently replaying */
				254	struct btrfs_root *replay_dest;
				255
				256	/* the trans handle for the current replay */
				257	struct btrfs_trans_handle *trans;
				258
				259	/* the function that gets used to process blocks we find in the
				260	* tree. Note the extent_buffer might not be up to date when it is
				261	* passed in, and it must be checked or read if you need the data
				262	* inside it
				263	*/
				264	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				265	struct walk_control *wc, u64 gen);
				266	};
				267
				268	/*
				269	* process_func used to pin down extents, write them or wait on them
				270	*/
				271	static int process_one_buffer(struct btrfs_root *log,
				272	struct extent_buffer *eb,
				273	struct walk_control *wc, u64 gen)
				274	{
				275	if (wc->pin) {
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	276	mutex_lock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	277	btrfs_update_pinned_extents(log->fs_info->extent_root,
				278	eb->start, eb->len, 1);
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	279	mutex_unlock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	280	}
				281
				282	if (btrfs_buffer_uptodate(eb, gen)) {
				283	if (wc->write)
				284	btrfs_write_tree_block(eb);
				285	if (wc->wait)
				286	btrfs_wait_tree_block_writeback(eb);
				287	}
				288	return 0;
				289	}
				290
				291	/*
				292	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				293	* to the src data we are copying out.
				294	*
				295	* root is the tree we are copying into, and path is a scratch
				296	* path for use in this function (it should be released on entry and
				297	* will be released on exit).
				298	*
				299	* If the key is already in the destination tree the existing item is
				300	* overwritten. If the existing item isn't big enough, it is extended.
				301	* If it is too large, it is truncated.
				302	*
				303	* If the key isn't in the destination yet, a new item is inserted.
				304	*/
				305	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				306	struct btrfs_root *root,
				307	struct btrfs_path *path,
				308	struct extent_buffer *eb, int slot,
				309	struct btrfs_key *key)
				310	{
				311	int ret;
				312	u32 item_size;
				313	u64 saved_i_size = 0;
				314	int save_old_i_size = 0;
				315	unsigned long src_ptr;
				316	unsigned long dst_ptr;
				317	int overwrite_root = 0;
				318
				319	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				320	overwrite_root = 1;
				321
				322	item_size = btrfs_item_size_nr(eb, slot);
				323	src_ptr = btrfs_item_ptr_offset(eb, slot);
				324
				325	/* look for the key in the destination tree */
				326	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				327	if (ret == 0) {
				328	char *src_copy;
				329	char *dst_copy;
				330	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				331	path->slots[0]);
				332	if (dst_size != item_size)
				333	goto insert;
				334
				335	if (item_size == 0) {
				336	btrfs_release_path(root, path);
				337	return 0;
				338	}
				339	dst_copy = kmalloc(item_size, GFP_NOFS);
				340	src_copy = kmalloc(item_size, GFP_NOFS);
				341
				342	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				343
				344	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				345	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				346	item_size);
				347	ret = memcmp(dst_copy, src_copy, item_size);
				348
				349	kfree(dst_copy);
				350	kfree(src_copy);
				351	/*
				352	* they have the same contents, just return, this saves
				353	* us from cowing blocks in the destination tree and doing
				354	* extra writes that may not have been done by a previous
				355	* sync
				356	*/
				357	if (ret == 0) {
				358	btrfs_release_path(root, path);
				359	return 0;
				360	}
				361
				362	}
				363	insert:
				364	btrfs_release_path(root, path);
				365	/* try to insert the key into the destination tree */
				366	ret = btrfs_insert_empty_item(trans, root, path,
				367	key, item_size);
				368
				369	/* make sure any existing item is the correct size */
				370	if (ret == -EEXIST) {
				371	u32 found_size;
				372	found_size = btrfs_item_size_nr(path->nodes[0],
				373	path->slots[0]);
				374	if (found_size > item_size) {
				375	btrfs_truncate_item(trans, root, path, item_size, 1);
				376	} else if (found_size < item_size) {
				377	ret = btrfs_del_item(trans, root,
				378	path);
				379	BUG_ON(ret);
				380
				381	btrfs_release_path(root, path);
				382	ret = btrfs_insert_empty_item(trans,
				383	root, path, key, item_size);
				384	BUG_ON(ret);
				385	}
				386	} else if (ret) {
				387	BUG();
				388	}
				389	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				390	path->slots[0]);
				391
				392	/* don't overwrite an existing inode if the generation number
				393	* was logged as zero. This is done when the tree logging code
				394	* is just logging an inode to make sure it exists after recovery.
				395	*
				396	* Also, don't overwrite i_size on directories during replay.
				397	* log replay inserts and removes directory items based on the
				398	* state of the tree found in the subvolume, and i_size is modified
				399	* as it goes
				400	*/
				401	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				402	struct btrfs_inode_item *src_item;
				403	struct btrfs_inode_item *dst_item;
				404
				405	src_item = (struct btrfs_inode_item *)src_ptr;
				406	dst_item = (struct btrfs_inode_item *)dst_ptr;
				407
				408	if (btrfs_inode_generation(eb, src_item) == 0)
				409	goto no_copy;
				410
				411	if (overwrite_root &&
				412	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				413	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				414	save_old_i_size = 1;
				415	saved_i_size = btrfs_inode_size(path->nodes[0],
				416	dst_item);
				417	}
				418	}
				419
				420	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				421	src_ptr, item_size);
				422
				423	if (save_old_i_size) {
				424	struct btrfs_inode_item *dst_item;
				425	dst_item = (struct btrfs_inode_item *)dst_ptr;
				426	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				427	}
				428
				429	/* make sure the generation is filled in */
				430	if (key->type == BTRFS_INODE_ITEM_KEY) {
				431	struct btrfs_inode_item *dst_item;
				432	dst_item = (struct btrfs_inode_item *)dst_ptr;
				433	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				434	btrfs_set_inode_generation(path->nodes[0], dst_item,
				435	trans->transid);
				436	}
				437	}
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	438
				439	if (overwrite_root &&
				440	key->type == BTRFS_EXTENT_DATA_KEY) {
				441	int extent_type;
				442	struct btrfs_file_extent_item *fi;
				443
				444	fi = (struct btrfs_file_extent_item *)dst_ptr;
				445	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	446	if (extent_type == BTRFS_FILE_EXTENT_REG \|\|
				447	extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	448	struct btrfs_key ins;
				449	ins.objectid = btrfs_file_extent_disk_bytenr(
				450	path->nodes[0], fi);
				451	ins.offset = btrfs_file_extent_disk_num_bytes(
				452	path->nodes[0], fi);
				453	ins.type = BTRFS_EXTENT_ITEM_KEY;
				454
				455	/*
				456	* is this extent already allocated in the extent
				457	* allocation tree? If so, just add a reference
				458	*/
				459	ret = btrfs_lookup_extent(root, ins.objectid,
				460	ins.offset);
				461	if (ret == 0) {
				462	ret = btrfs_inc_extent_ref(trans, root,
				463	ins.objectid, ins.offset,
				464	path->nodes[0]->start,
				465	root->root_key.objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	466	trans->transid, key->objectid);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	467	} else {
				468	/*
				469	* insert the extent pointer in the extent
				470	* allocation tree
				471	*/
				472	ret = btrfs_alloc_logged_extent(trans, root,
				473	path->nodes[0]->start,
				474	root->root_key.objectid,
				475	trans->transid, key->objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	476	&ins);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	477	BUG_ON(ret);
				478	}
				479	}
				480	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	481	no_copy:
				482	btrfs_mark_buffer_dirty(path->nodes[0]);
				483	btrfs_release_path(root, path);
				484	return 0;
				485	}
				486
				487	/*
				488	* simple helper to read an inode off the disk from a given root
				489	* This can only be called for subvolume roots and not for the log
				490	*/
				491	static noinline struct inode read_one_inode(struct btrfs_root root,
				492	u64 objectid)
				493	{
				494	struct inode *inode;
				495	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				496	if (inode->i_state & I_NEW) {
				497	BTRFS_I(inode)->root = root;
				498	BTRFS_I(inode)->location.objectid = objectid;
				499	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				500	BTRFS_I(inode)->location.offset = 0;
				501	btrfs_read_locked_inode(inode);
				502	unlock_new_inode(inode);
				503
				504	}
				505	if (is_bad_inode(inode)) {
				506	iput(inode);
				507	inode = NULL;
				508	}
				509	return inode;
				510	}
				511
				512	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				513	* subvolume 'root'. path is released on entry and should be released
				514	* on exit.
				515	*
				516	* extents in the log tree have not been allocated out of the extent
				517	* tree yet. So, this completes the allocation, taking a reference
				518	* as required if the extent already exists or creating a new extent
				519	* if it isn't in the extent allocation tree yet.
				520	*
				521	* The extent is inserted into the file, dropping any existing extents
				522	* from the file that overlap the new one.
				523	*/
				524	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				525	struct btrfs_root *root,
				526	struct btrfs_path *path,
				527	struct extent_buffer *eb, int slot,
				528	struct btrfs_key *key)
				529	{
				530	int found_type;
				531	u64 mask = root->sectorsize - 1;
				532	u64 extent_end;
				533	u64 alloc_hint;
				534	u64 start = key->offset;
				535	struct btrfs_file_extent_item *item;
				536	struct inode *inode = NULL;
				537	unsigned long size;
				538	int ret = 0;
				539
				540	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				541	found_type = btrfs_file_extent_type(eb, item);
				542
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	543	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				544	found_type == BTRFS_FILE_EXTENT_PREALLOC)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	545	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				546	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Chris Mason	c8b9781	2008-10-29 14:49:59 -0400	[diff] [blame]	547	size = btrfs_file_extent_inline_len(eb, item);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	548	extent_end = (start + size + mask) & ~mask;
				549	} else {
				550	ret = 0;
				551	goto out;
				552	}
				553
				554	inode = read_one_inode(root, key->objectid);
				555	if (!inode) {
				556	ret = -EIO;
				557	goto out;
				558	}
				559
				560	/*
				561	* first check to see if we already have this extent in the
				562	* file. This must be done before the btrfs_drop_extents run
				563	* so we don't try to drop this extent.
				564	*/
				565	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				566	start, 0);
				567
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	568	if (ret == 0 &&
				569	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				570	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	571	struct btrfs_file_extent_item cmp1;
				572	struct btrfs_file_extent_item cmp2;
				573	struct btrfs_file_extent_item *existing;
				574	struct extent_buffer *leaf;
				575
				576	leaf = path->nodes[0];
				577	existing = btrfs_item_ptr(leaf, path->slots[0],
				578	struct btrfs_file_extent_item);
				579
				580	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				581	sizeof(cmp1));
				582	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				583	sizeof(cmp2));
				584
				585	/*
				586	* we already have a pointer to this exact extent,
				587	* we don't have to do anything
				588	*/
				589	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				590	btrfs_release_path(root, path);
				591	goto out;
				592	}
				593	}
				594	btrfs_release_path(root, path);
				595
				596	/* drop any overlapping extents */
				597	ret = btrfs_drop_extents(trans, root, inode,
				598	start, extent_end, start, &alloc_hint);
				599	BUG_ON(ret);
				600
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	601	/* insert the extent */
				602	ret = overwrite_item(trans, root, path, eb, slot, key);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	603	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	604
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	605	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
				606	inode_add_bytes(inode, extent_end - start);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	607	btrfs_update_inode(trans, root, inode);
				608	out:
				609	if (inode)
				610	iput(inode);
				611	return ret;
				612	}
				613
				614	/*
				615	* when cleaning up conflicts between the directory names in the
				616	* subvolume, directory names in the log and directory names in the
				617	* inode back references, we may have to unlink inodes from directories.
				618	*
				619	* This is a helper function to do the unlink of a specific directory
				620	* item
				621	*/
				622	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				623	struct btrfs_root *root,
				624	struct btrfs_path *path,
				625	struct inode *dir,
				626	struct btrfs_dir_item *di)
				627	{
				628	struct inode *inode;
				629	char *name;
				630	int name_len;
				631	struct extent_buffer *leaf;
				632	struct btrfs_key location;
				633	int ret;
				634
				635	leaf = path->nodes[0];
				636
				637	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				638	name_len = btrfs_dir_name_len(leaf, di);
				639	name = kmalloc(name_len, GFP_NOFS);
				640	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				641	btrfs_release_path(root, path);
				642
				643	inode = read_one_inode(root, location.objectid);
				644	BUG_ON(!inode);
				645
				646	btrfs_inc_nlink(inode);
				647	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
				648	kfree(name);
				649
				650	iput(inode);
				651	return ret;
				652	}
				653
				654	/*
				655	* helper function to see if a given name and sequence number found
				656	* in an inode back reference are already in a directory and correctly
				657	* point to this inode
				658	*/
				659	static noinline int inode_in_dir(struct btrfs_root *root,
				660	struct btrfs_path *path,
				661	u64 dirid, u64 objectid, u64 index,
				662	const char *name, int name_len)
				663	{
				664	struct btrfs_dir_item *di;
				665	struct btrfs_key location;
				666	int match = 0;
				667
				668	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				669	index, name, name_len, 0);
				670	if (di && !IS_ERR(di)) {
				671	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				672	if (location.objectid != objectid)
				673	goto out;
				674	} else
				675	goto out;
				676	btrfs_release_path(root, path);
				677
				678	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				679	if (di && !IS_ERR(di)) {
				680	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				681	if (location.objectid != objectid)
				682	goto out;
				683	} else
				684	goto out;
				685	match = 1;
				686	out:
				687	btrfs_release_path(root, path);
				688	return match;
				689	}
				690
				691	/*
				692	* helper function to check a log tree for a named back reference in
				693	* an inode. This is used to decide if a back reference that is
				694	* found in the subvolume conflicts with what we find in the log.
				695	*
				696	* inode backreferences may have multiple refs in a single item,
				697	* during replay we process one reference at a time, and we don't
				698	* want to delete valid links to a file from the subvolume if that
				699	* link is also in the log.
				700	*/
				701	static noinline int backref_in_log(struct btrfs_root *log,
				702	struct btrfs_key *key,
				703	char *name, int namelen)
				704	{
				705	struct btrfs_path *path;
				706	struct btrfs_inode_ref *ref;
				707	unsigned long ptr;
				708	unsigned long ptr_end;
				709	unsigned long name_ptr;
				710	int found_name_len;
				711	int item_size;
				712	int ret;
				713	int match = 0;
				714
				715	path = btrfs_alloc_path();
				716	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				717	if (ret != 0)
				718	goto out;
				719
				720	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				721	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				722	ptr_end = ptr + item_size;
				723	while (ptr < ptr_end) {
				724	ref = (struct btrfs_inode_ref *)ptr;
				725	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				726	if (found_name_len == namelen) {
				727	name_ptr = (unsigned long)(ref + 1);
				728	ret = memcmp_extent_buffer(path->nodes[0], name,
				729	name_ptr, namelen);
				730	if (ret == 0) {
				731	match = 1;
				732	goto out;
				733	}
				734	}
				735	ptr = (unsigned long)(ref + 1) + found_name_len;
				736	}
				737	out:
				738	btrfs_free_path(path);
				739	return match;
				740	}
				741
				742
				743	/*
				744	* replay one inode back reference item found in the log tree.
				745	* eb, slot and key refer to the buffer and key found in the log tree.
				746	* root is the destination we are replaying into, and path is for temp
				747	* use by this function. (it should be released on return).
				748	*/
				749	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				750	struct btrfs_root *root,
				751	struct btrfs_root *log,
				752	struct btrfs_path *path,
				753	struct extent_buffer *eb, int slot,
				754	struct btrfs_key *key)
				755	{
				756	struct inode *dir;
				757	int ret;
				758	struct btrfs_key location;
				759	struct btrfs_inode_ref *ref;
				760	struct btrfs_dir_item *di;
				761	struct inode *inode;
				762	char *name;
				763	int namelen;
				764	unsigned long ref_ptr;
				765	unsigned long ref_end;
				766
				767	location.objectid = key->objectid;
				768	location.type = BTRFS_INODE_ITEM_KEY;
				769	location.offset = 0;
				770
				771	/*
				772	* it is possible that we didn't log all the parent directories
				773	* for a given inode. If we don't find the dir, just don't
				774	* copy the back ref in. The link count fixup code will take
				775	* care of the rest
				776	*/
				777	dir = read_one_inode(root, key->offset);
				778	if (!dir)
				779	return -ENOENT;
				780
				781	inode = read_one_inode(root, key->objectid);
				782	BUG_ON(!dir);
				783
				784	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				785	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				786
				787	again:
				788	ref = (struct btrfs_inode_ref *)ref_ptr;
				789
				790	namelen = btrfs_inode_ref_name_len(eb, ref);
				791	name = kmalloc(namelen, GFP_NOFS);
				792	BUG_ON(!name);
				793
				794	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				795
				796	/* if we already have a perfect match, we're done */
				797	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				798	btrfs_inode_ref_index(eb, ref),
				799	name, namelen)) {
				800	goto out;
				801	}
				802
				803	/*
				804	* look for a conflicting back reference in the metadata.
				805	* if we find one we have to unlink that name of the file
				806	* before we add our new link. Later on, we overwrite any
				807	* existing back reference, and we don't want to create
				808	* dangling pointers in the directory.
				809	*/
				810	conflict_again:
				811	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				812	if (ret == 0) {
				813	char *victim_name;
				814	int victim_name_len;
				815	struct btrfs_inode_ref *victim_ref;
				816	unsigned long ptr;
				817	unsigned long ptr_end;
				818	struct extent_buffer *leaf = path->nodes[0];
				819
				820	/* are we trying to overwrite a back ref for the root directory
				821	* if so, just jump out, we're done
				822	*/
				823	if (key->objectid == key->offset)
				824	goto out_nowrite;
				825
				826	/* check all the names in this back reference to see
				827	* if they are in the log. if so, we allow them to stay
				828	* otherwise they must be unlinked as a conflict
				829	*/
				830	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				831	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				832	while(ptr < ptr_end) {
				833	victim_ref = (struct btrfs_inode_ref *)ptr;
				834	victim_name_len = btrfs_inode_ref_name_len(leaf,
				835	victim_ref);
				836	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				837	BUG_ON(!victim_name);
				838
				839	read_extent_buffer(leaf, victim_name,
				840	(unsigned long)(victim_ref + 1),
				841	victim_name_len);
				842
				843	if (!backref_in_log(log, key, victim_name,
				844	victim_name_len)) {
				845	btrfs_inc_nlink(inode);
				846	btrfs_release_path(root, path);
				847	ret = btrfs_unlink_inode(trans, root, dir,
				848	inode, victim_name,
				849	victim_name_len);
				850	kfree(victim_name);
				851	btrfs_release_path(root, path);
				852	goto conflict_again;
				853	}
				854	kfree(victim_name);
				855	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				856	}
				857	BUG_ON(ret);
				858	}
				859	btrfs_release_path(root, path);
				860
				861	/* look for a conflicting sequence number */
				862	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				863	btrfs_inode_ref_index(eb, ref),
				864	name, namelen, 0);
				865	if (di && !IS_ERR(di)) {
				866	ret = drop_one_dir_item(trans, root, path, dir, di);
				867	BUG_ON(ret);
				868	}
				869	btrfs_release_path(root, path);
				870
				871
				872	/* look for a conflicting name */
				873	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				874	name, namelen, 0);
				875	if (di && !IS_ERR(di)) {
				876	ret = drop_one_dir_item(trans, root, path, dir, di);
				877	BUG_ON(ret);
				878	}
				879	btrfs_release_path(root, path);
				880
				881	/* insert our name */
				882	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				883	btrfs_inode_ref_index(eb, ref));
				884	BUG_ON(ret);
				885
				886	btrfs_update_inode(trans, root, inode);
				887
				888	out:
				889	ref_ptr = (unsigned long)(ref + 1) + namelen;
				890	kfree(name);
				891	if (ref_ptr < ref_end)
				892	goto again;
				893
				894	/* finally write the back reference in the inode */
				895	ret = overwrite_item(trans, root, path, eb, slot, key);
				896	BUG_ON(ret);
				897
				898	out_nowrite:
				899	btrfs_release_path(root, path);
				900	iput(dir);
				901	iput(inode);
				902	return 0;
				903	}
				904
				905	/*
				906	* replay one csum item from the log tree into the subvolume 'root'
				907	* eb, slot and key all refer to the log tree
				908	* path is for temp use by this function and should be released on return
				909	*
				910	* This copies the checksums out of the log tree and inserts them into
				911	* the subvolume. Any existing checksums for this range in the file
				912	* are overwritten, and new items are added where required.
				913	*
				914	* We keep this simple by reusing the btrfs_ordered_sum code from
				915	* the data=ordered mode. This basically means making a copy
				916	* of all the checksums in ram, which we have to do anyway for kmap
				917	* rules.
				918	*
				919	* The copy is then sent down to btrfs_csum_file_blocks, which
				920	* does all the hard work of finding existing items in the file
				921	* or adding new ones.
				922	*/
				923	static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
				924	struct btrfs_root *root,
				925	struct btrfs_path *path,
				926	struct extent_buffer *eb, int slot,
				927	struct btrfs_key *key)
				928	{
				929	int ret;
				930	u32 item_size = btrfs_item_size_nr(eb, slot);
				931	u64 cur_offset;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	932	u16 csum_size =
				933	btrfs_super_csum_size(&root->fs_info->super_copy);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	934	unsigned long file_bytes;
				935	struct btrfs_ordered_sum *sums;
				936	struct btrfs_sector_sum *sector_sum;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	937	unsigned long ptr;
				938
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	939	file_bytes = (item_size / csum_size) * root->sectorsize;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	940	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
				941	if (!sums) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	942	return -ENOMEM;
				943	}
				944
				945	INIT_LIST_HEAD(&sums->list);
				946	sums->len = file_bytes;
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	947	sums->bytenr = key->offset;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	948
				949	/*
				950	* copy all the sums into the ordered sum struct
				951	*/
				952	sector_sum = sums->sums;
				953	cur_offset = key->offset;
				954	ptr = btrfs_item_ptr_offset(eb, slot);
				955	while(item_size > 0) {
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	956	sector_sum->bytenr = cur_offset;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	957	read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	958	sector_sum++;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	959	item_size -= csum_size;
				960	ptr += csum_size;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	961	cur_offset += root->sectorsize;
				962	}
				963
				964	/* let btrfs_csum_file_blocks add them into the file */
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	965	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	966	BUG_ON(ret);
				967	kfree(sums);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	968	return 0;
				969	}
				970	/*
				971	* There are a few corners where the link count of the file can't
				972	* be properly maintained during replay. So, instead of adding
				973	* lots of complexity to the log code, we just scan the backrefs
				974	* for any file that has been through replay.
				975	*
				976	* The scan will update the link count on the inode to reflect the
				977	* number of back refs found. If it goes down to zero, the iput
				978	* will free the inode.
				979	*/
				980	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				981	struct btrfs_root *root,
				982	struct inode *inode)
				983	{
				984	struct btrfs_path *path;
				985	int ret;
				986	struct btrfs_key key;
				987	u64 nlink = 0;
				988	unsigned long ptr;
				989	unsigned long ptr_end;
				990	int name_len;
				991
				992	key.objectid = inode->i_ino;
				993	key.type = BTRFS_INODE_REF_KEY;
				994	key.offset = (u64)-1;
				995
				996	path = btrfs_alloc_path();
				997
				998	while(1) {
				999	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1000	if (ret < 0)
				1001	break;
				1002	if (ret > 0) {
				1003	if (path->slots[0] == 0)
				1004	break;
				1005	path->slots[0]--;
				1006	}
				1007	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1008	path->slots[0]);
				1009	if (key.objectid != inode->i_ino \|\|
				1010	key.type != BTRFS_INODE_REF_KEY)
				1011	break;
				1012	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1013	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1014	path->slots[0]);
				1015	while(ptr < ptr_end) {
				1016	struct btrfs_inode_ref *ref;
				1017
				1018	ref = (struct btrfs_inode_ref *)ptr;
				1019	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1020	ref);
				1021	ptr = (unsigned long)(ref + 1) + name_len;
				1022	nlink++;
				1023	}
				1024
				1025	if (key.offset == 0)
				1026	break;
				1027	key.offset--;
				1028	btrfs_release_path(root, path);
				1029	}
				1030	btrfs_free_path(path);
				1031	if (nlink != inode->i_nlink) {
				1032	inode->i_nlink = nlink;
				1033	btrfs_update_inode(trans, root, inode);
				1034	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	1035	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1036
				1037	return 0;
				1038	}
				1039
				1040	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1041	struct btrfs_root *root,
				1042	struct btrfs_path *path)
				1043	{
				1044	int ret;
				1045	struct btrfs_key key;
				1046	struct inode *inode;
				1047
				1048	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1049	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1050	key.offset = (u64)-1;
				1051	while(1) {
				1052	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1053	if (ret < 0)
				1054	break;
				1055
				1056	if (ret == 1) {
				1057	if (path->slots[0] == 0)
				1058	break;
				1059	path->slots[0]--;
				1060	}
				1061
				1062	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1063	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1064	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1065	break;
				1066
				1067	ret = btrfs_del_item(trans, root, path);
				1068	BUG_ON(ret);
				1069
				1070	btrfs_release_path(root, path);
				1071	inode = read_one_inode(root, key.offset);
				1072	BUG_ON(!inode);
				1073
				1074	ret = fixup_inode_link_count(trans, root, inode);
				1075	BUG_ON(ret);
				1076
				1077	iput(inode);
				1078
				1079	if (key.offset == 0)
				1080	break;
				1081	key.offset--;
				1082	}
				1083	btrfs_release_path(root, path);
				1084	return 0;
				1085	}
				1086
				1087
				1088	/*
				1089	* record a given inode in the fixup dir so we can check its link
				1090	* count when replay is done. The link count is incremented here
				1091	* so the inode won't go away until we check it
				1092	*/
				1093	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1094	struct btrfs_root *root,
				1095	struct btrfs_path *path,
				1096	u64 objectid)
				1097	{
				1098	struct btrfs_key key;
				1099	int ret = 0;
				1100	struct inode *inode;
				1101
				1102	inode = read_one_inode(root, objectid);
				1103	BUG_ON(!inode);
				1104
				1105	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1106	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1107	key.offset = objectid;
				1108
				1109	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1110
				1111	btrfs_release_path(root, path);
				1112	if (ret == 0) {
				1113	btrfs_inc_nlink(inode);
				1114	btrfs_update_inode(trans, root, inode);
				1115	} else if (ret == -EEXIST) {
				1116	ret = 0;
				1117	} else {
				1118	BUG();
				1119	}
				1120	iput(inode);
				1121
				1122	return ret;
				1123	}
				1124
				1125	/*
				1126	* when replaying the log for a directory, we only insert names
				1127	* for inodes that actually exist. This means an fsync on a directory
				1128	* does not implicitly fsync all the new files in it
				1129	*/
				1130	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1131	struct btrfs_root *root,
				1132	struct btrfs_path *path,
				1133	u64 dirid, u64 index,
				1134	char *name, int name_len, u8 type,
				1135	struct btrfs_key *location)
				1136	{
				1137	struct inode *inode;
				1138	struct inode *dir;
				1139	int ret;
				1140
				1141	inode = read_one_inode(root, location->objectid);
				1142	if (!inode)
				1143	return -ENOENT;
				1144
				1145	dir = read_one_inode(root, dirid);
				1146	if (!dir) {
				1147	iput(inode);
				1148	return -EIO;
				1149	}
				1150	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1151
				1152	/* FIXME, put inode into FIXUP list */
				1153
				1154	iput(inode);
				1155	iput(dir);
				1156	return ret;
				1157	}
				1158
				1159	/*
				1160	* take a single entry in a log directory item and replay it into
				1161	* the subvolume.
				1162	*
				1163	* if a conflicting item exists in the subdirectory already,
				1164	* the inode it points to is unlinked and put into the link count
				1165	* fix up tree.
				1166	*
				1167	* If a name from the log points to a file or directory that does
				1168	* not exist in the FS, it is skipped. fsyncs on directories
				1169	* do not force down inodes inside that directory, just changes to the
				1170	* names or unlinks in a directory.
				1171	*/
				1172	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1173	struct btrfs_root *root,
				1174	struct btrfs_path *path,
				1175	struct extent_buffer *eb,
				1176	struct btrfs_dir_item *di,
				1177	struct btrfs_key *key)
				1178	{
				1179	char *name;
				1180	int name_len;
				1181	struct btrfs_dir_item *dst_di;
				1182	struct btrfs_key found_key;
				1183	struct btrfs_key log_key;
				1184	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1185	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1186	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1187	int ret;
				1188
				1189	dir = read_one_inode(root, key->objectid);
				1190	BUG_ON(!dir);
				1191
				1192	name_len = btrfs_dir_name_len(eb, di);
				1193	name = kmalloc(name_len, GFP_NOFS);
				1194	log_type = btrfs_dir_type(eb, di);
				1195	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1196	name_len);
				1197
				1198	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1199	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1200	if (exists == 0)
				1201	exists = 1;
				1202	else
				1203	exists = 0;
				1204	btrfs_release_path(root, path);
				1205
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1206	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1207	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1208	name, name_len, 1);
				1209	}
				1210	else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1211	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1212	key->objectid,
				1213	key->offset, name,
				1214	name_len, 1);
				1215	} else {
				1216	BUG();
				1217	}
				1218	if (!dst_di \|\| IS_ERR(dst_di)) {
				1219	/* we need a sequence number to insert, so we only
				1220	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1221	*/
				1222	if (key->type != BTRFS_DIR_INDEX_KEY)
				1223	goto out;
				1224	goto insert;
				1225	}
				1226
				1227	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1228	/* the existing item matches the logged item */
				1229	if (found_key.objectid == log_key.objectid &&
				1230	found_key.type == log_key.type &&
				1231	found_key.offset == log_key.offset &&
				1232	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1233	goto out;
				1234	}
				1235
				1236	/*
				1237	* don't drop the conflicting directory entry if the inode
				1238	* for the new entry doesn't exist
				1239	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1240	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1241	goto out;
				1242
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1243	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1244	BUG_ON(ret);
				1245
				1246	if (key->type == BTRFS_DIR_INDEX_KEY)
				1247	goto insert;
				1248	out:
				1249	btrfs_release_path(root, path);
				1250	kfree(name);
				1251	iput(dir);
				1252	return 0;
				1253
				1254	insert:
				1255	btrfs_release_path(root, path);
				1256	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1257	name, name_len, log_type, &log_key);
				1258
				1259	if (ret && ret != -ENOENT)
				1260	BUG();
				1261	goto out;
				1262	}
				1263
				1264	/*
				1265	* find all the names in a directory item and reconcile them into
				1266	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1267	* one name in a directory item, but the same code gets used for
				1268	* both directory index types
				1269	*/
				1270	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1271	struct btrfs_root *root,
				1272	struct btrfs_path *path,
				1273	struct extent_buffer *eb, int slot,
				1274	struct btrfs_key *key)
				1275	{
				1276	int ret;
				1277	u32 item_size = btrfs_item_size_nr(eb, slot);
				1278	struct btrfs_dir_item *di;
				1279	int name_len;
				1280	unsigned long ptr;
				1281	unsigned long ptr_end;
				1282
				1283	ptr = btrfs_item_ptr_offset(eb, slot);
				1284	ptr_end = ptr + item_size;
				1285	while(ptr < ptr_end) {
				1286	di = (struct btrfs_dir_item *)ptr;
				1287	name_len = btrfs_dir_name_len(eb, di);
				1288	ret = replay_one_name(trans, root, path, eb, di, key);
				1289	BUG_ON(ret);
				1290	ptr = (unsigned long)(di + 1);
				1291	ptr += name_len;
				1292	}
				1293	return 0;
				1294	}
				1295
				1296	/*
				1297	* directory replay has two parts. There are the standard directory
				1298	* items in the log copied from the subvolume, and range items
				1299	* created in the log while the subvolume was logged.
				1300	*
				1301	* The range items tell us which parts of the key space the log
				1302	* is authoritative for. During replay, if a key in the subvolume
				1303	* directory is in a logged range item, but not actually in the log
				1304	* that means it was deleted from the directory before the fsync
				1305	* and should be removed.
				1306	*/
				1307	static noinline int find_dir_range(struct btrfs_root *root,
				1308	struct btrfs_path *path,
				1309	u64 dirid, int key_type,
				1310	u64 start_ret, u64 end_ret)
				1311	{
				1312	struct btrfs_key key;
				1313	u64 found_end;
				1314	struct btrfs_dir_log_item *item;
				1315	int ret;
				1316	int nritems;
				1317
				1318	if (*start_ret == (u64)-1)
				1319	return 1;
				1320
				1321	key.objectid = dirid;
				1322	key.type = key_type;
				1323	key.offset = *start_ret;
				1324
				1325	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1326	if (ret < 0)
				1327	goto out;
				1328	if (ret > 0) {
				1329	if (path->slots[0] == 0)
				1330	goto out;
				1331	path->slots[0]--;
				1332	}
				1333	if (ret != 0)
				1334	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1335
				1336	if (key.type != key_type \|\| key.objectid != dirid) {
				1337	ret = 1;
				1338	goto next;
				1339	}
				1340	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1341	struct btrfs_dir_log_item);
				1342	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1343
				1344	if (start_ret >= key.offset && start_ret <= found_end) {
				1345	ret = 0;
				1346	*start_ret = key.offset;
				1347	*end_ret = found_end;
				1348	goto out;
				1349	}
				1350	ret = 1;
				1351	next:
				1352	/* check the next slot in the tree to see if it is a valid item */
				1353	nritems = btrfs_header_nritems(path->nodes[0]);
				1354	if (path->slots[0] >= nritems) {
				1355	ret = btrfs_next_leaf(root, path);
				1356	if (ret)
				1357	goto out;
				1358	} else {
				1359	path->slots[0]++;
				1360	}
				1361
				1362	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1363
				1364	if (key.type != key_type \|\| key.objectid != dirid) {
				1365	ret = 1;
				1366	goto out;
				1367	}
				1368	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1369	struct btrfs_dir_log_item);
				1370	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1371	*start_ret = key.offset;
				1372	*end_ret = found_end;
				1373	ret = 0;
				1374	out:
				1375	btrfs_release_path(root, path);
				1376	return ret;
				1377	}
				1378
				1379	/*
				1380	* this looks for a given directory item in the log. If the directory
				1381	* item is not in the log, the item is removed and the inode it points
				1382	* to is unlinked
				1383	*/
				1384	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1385	struct btrfs_root *root,
				1386	struct btrfs_root *log,
				1387	struct btrfs_path *path,
				1388	struct btrfs_path *log_path,
				1389	struct inode *dir,
				1390	struct btrfs_key *dir_key)
				1391	{
				1392	int ret;
				1393	struct extent_buffer *eb;
				1394	int slot;
				1395	u32 item_size;
				1396	struct btrfs_dir_item *di;
				1397	struct btrfs_dir_item *log_di;
				1398	int name_len;
				1399	unsigned long ptr;
				1400	unsigned long ptr_end;
				1401	char *name;
				1402	struct inode *inode;
				1403	struct btrfs_key location;
				1404
				1405	again:
				1406	eb = path->nodes[0];
				1407	slot = path->slots[0];
				1408	item_size = btrfs_item_size_nr(eb, slot);
				1409	ptr = btrfs_item_ptr_offset(eb, slot);
				1410	ptr_end = ptr + item_size;
				1411	while(ptr < ptr_end) {
				1412	di = (struct btrfs_dir_item *)ptr;
				1413	name_len = btrfs_dir_name_len(eb, di);
				1414	name = kmalloc(name_len, GFP_NOFS);
				1415	if (!name) {
				1416	ret = -ENOMEM;
				1417	goto out;
				1418	}
				1419	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1420	name_len);
				1421	log_di = NULL;
				1422	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1423	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1424	dir_key->objectid,
				1425	name, name_len, 0);
				1426	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1427	log_di = btrfs_lookup_dir_index_item(trans, log,
				1428	log_path,
				1429	dir_key->objectid,
				1430	dir_key->offset,
				1431	name, name_len, 0);
				1432	}
				1433	if (!log_di \|\| IS_ERR(log_di)) {
				1434	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1435	btrfs_release_path(root, path);
				1436	btrfs_release_path(log, log_path);
				1437	inode = read_one_inode(root, location.objectid);
				1438	BUG_ON(!inode);
				1439
				1440	ret = link_to_fixup_dir(trans, root,
				1441	path, location.objectid);
				1442	BUG_ON(ret);
				1443	btrfs_inc_nlink(inode);
				1444	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1445	name, name_len);
				1446	BUG_ON(ret);
				1447	kfree(name);
				1448	iput(inode);
				1449
				1450	/* there might still be more names under this key
				1451	* check and repeat if required
				1452	*/
				1453	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1454	0, 0);
				1455	if (ret == 0)
				1456	goto again;
				1457	ret = 0;
				1458	goto out;
				1459	}
				1460	btrfs_release_path(log, log_path);
				1461	kfree(name);
				1462
				1463	ptr = (unsigned long)(di + 1);
				1464	ptr += name_len;
				1465	}
				1466	ret = 0;
				1467	out:
				1468	btrfs_release_path(root, path);
				1469	btrfs_release_path(log, log_path);
				1470	return ret;
				1471	}
				1472
				1473	/*
				1474	* deletion replay happens before we copy any new directory items
				1475	* out of the log or out of backreferences from inodes. It
				1476	* scans the log to find ranges of keys that log is authoritative for,
				1477	* and then scans the directory to find items in those ranges that are
				1478	* not present in the log.
				1479	*
				1480	* Anything we don't find in the log is unlinked and removed from the
				1481	* directory.
				1482	*/
				1483	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1484	struct btrfs_root *root,
				1485	struct btrfs_root *log,
				1486	struct btrfs_path *path,
				1487	u64 dirid)
				1488	{
				1489	u64 range_start;
				1490	u64 range_end;
				1491	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1492	int ret = 0;
				1493	struct btrfs_key dir_key;
				1494	struct btrfs_key found_key;
				1495	struct btrfs_path *log_path;
				1496	struct inode *dir;
				1497
				1498	dir_key.objectid = dirid;
				1499	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1500	log_path = btrfs_alloc_path();
				1501	if (!log_path)
				1502	return -ENOMEM;
				1503
				1504	dir = read_one_inode(root, dirid);
				1505	/* it isn't an error if the inode isn't there, that can happen
				1506	* because we replay the deletes before we copy in the inode item
				1507	* from the log
				1508	*/
				1509	if (!dir) {
				1510	btrfs_free_path(log_path);
				1511	return 0;
				1512	}
				1513	again:
				1514	range_start = 0;
				1515	range_end = 0;
				1516	while(1) {
				1517	ret = find_dir_range(log, path, dirid, key_type,
				1518	&range_start, &range_end);
				1519	if (ret != 0)
				1520	break;
				1521
				1522	dir_key.offset = range_start;
				1523	while(1) {
				1524	int nritems;
				1525	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1526	0, 0);
				1527	if (ret < 0)
				1528	goto out;
				1529
				1530	nritems = btrfs_header_nritems(path->nodes[0]);
				1531	if (path->slots[0] >= nritems) {
				1532	ret = btrfs_next_leaf(root, path);
				1533	if (ret)
				1534	break;
				1535	}
				1536	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1537	path->slots[0]);
				1538	if (found_key.objectid != dirid \|\|
				1539	found_key.type != dir_key.type)
				1540	goto next_type;
				1541
				1542	if (found_key.offset > range_end)
				1543	break;
				1544
				1545	ret = check_item_in_log(trans, root, log, path,
				1546	log_path, dir, &found_key);
				1547	BUG_ON(ret);
				1548	if (found_key.offset == (u64)-1)
				1549	break;
				1550	dir_key.offset = found_key.offset + 1;
				1551	}
				1552	btrfs_release_path(root, path);
				1553	if (range_end == (u64)-1)
				1554	break;
				1555	range_start = range_end + 1;
				1556	}
				1557
				1558	next_type:
				1559	ret = 0;
				1560	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1561	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1562	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1563	btrfs_release_path(root, path);
				1564	goto again;
				1565	}
				1566	out:
				1567	btrfs_release_path(root, path);
				1568	btrfs_free_path(log_path);
				1569	iput(dir);
				1570	return ret;
				1571	}
				1572
				1573	/*
				1574	* the process_func used to replay items from the log tree. This
				1575	* gets called in two different stages. The first stage just looks
				1576	* for inodes and makes sure they are all copied into the subvolume.
				1577	*
				1578	* The second stage copies all the other item types from the log into
				1579	* the subvolume. The two stage approach is slower, but gets rid of
				1580	* lots of complexity around inodes referencing other inodes that exist
				1581	* only in the log (references come from either directory items or inode
				1582	* back refs).
				1583	*/
				1584	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1585	struct walk_control *wc, u64 gen)
				1586	{
				1587	int nritems;
				1588	struct btrfs_path *path;
				1589	struct btrfs_root *root = wc->replay_dest;
				1590	struct btrfs_key key;
				1591	u32 item_size;
				1592	int level;
				1593	int i;
				1594	int ret;
				1595
				1596	btrfs_read_buffer(eb, gen);
				1597
				1598	level = btrfs_header_level(eb);
				1599
				1600	if (level != 0)
				1601	return 0;
				1602
				1603	path = btrfs_alloc_path();
				1604	BUG_ON(!path);
				1605
				1606	nritems = btrfs_header_nritems(eb);
				1607	for (i = 0; i < nritems; i++) {
				1608	btrfs_item_key_to_cpu(eb, &key, i);
				1609	item_size = btrfs_item_size_nr(eb, i);
				1610
				1611	/* inode keys are done during the first stage */
				1612	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1613	wc->stage == LOG_WALK_REPLAY_INODES) {
				1614	struct inode *inode;
				1615	struct btrfs_inode_item *inode_item;
				1616	u32 mode;
				1617
				1618	inode_item = btrfs_item_ptr(eb, i,
				1619	struct btrfs_inode_item);
				1620	mode = btrfs_inode_mode(eb, inode_item);
				1621	if (S_ISDIR(mode)) {
				1622	ret = replay_dir_deletes(wc->trans,
				1623	root, log, path, key.objectid);
				1624	BUG_ON(ret);
				1625	}
				1626	ret = overwrite_item(wc->trans, root, path,
				1627	eb, i, &key);
				1628	BUG_ON(ret);
				1629
				1630	/* for regular files, truncate away
				1631	* extents past the new EOF
				1632	*/
				1633	if (S_ISREG(mode)) {
				1634	inode = read_one_inode(root,
				1635	key.objectid);
				1636	BUG_ON(!inode);
				1637
				1638	ret = btrfs_truncate_inode_items(wc->trans,
				1639	root, inode, inode->i_size,
				1640	BTRFS_EXTENT_DATA_KEY);
				1641	BUG_ON(ret);
				1642	iput(inode);
				1643	}
				1644	ret = link_to_fixup_dir(wc->trans, root,
				1645	path, key.objectid);
				1646	BUG_ON(ret);
				1647	}
				1648	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1649	continue;
				1650
				1651	/* these keys are simply copied */
				1652	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1653	ret = overwrite_item(wc->trans, root, path,
				1654	eb, i, &key);
				1655	BUG_ON(ret);
				1656	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1657	ret = add_inode_ref(wc->trans, root, log, path,
				1658	eb, i, &key);
				1659	BUG_ON(ret && ret != -ENOENT);
				1660	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1661	ret = replay_one_extent(wc->trans, root, path,
				1662	eb, i, &key);
				1663	BUG_ON(ret);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	1664	} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1665	ret = replay_one_csum(wc->trans, root, path,
				1666	eb, i, &key);
				1667	BUG_ON(ret);
				1668	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1669	key.type == BTRFS_DIR_INDEX_KEY) {
				1670	ret = replay_one_dir_item(wc->trans, root, path,
				1671	eb, i, &key);
				1672	BUG_ON(ret);
				1673	}
				1674	}
				1675	btrfs_free_path(path);
				1676	return 0;
				1677	}
				1678
				1679	static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
				1680	struct btrfs_root *root,
				1681	struct btrfs_path path, int level,
				1682	struct walk_control *wc)
				1683	{
				1684	u64 root_owner;
				1685	u64 root_gen;
				1686	u64 bytenr;
				1687	u64 ptr_gen;
				1688	struct extent_buffer *next;
				1689	struct extent_buffer *cur;
				1690	struct extent_buffer *parent;
				1691	u32 blocksize;
				1692	int ret = 0;
				1693
				1694	WARN_ON(*level < 0);
				1695	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1696
				1697	while(*level > 0) {
				1698	WARN_ON(*level < 0);
				1699	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1700	cur = path->nodes[*level];
				1701
				1702	if (btrfs_header_level(cur) != *level)
				1703	WARN_ON(1);
				1704
				1705	if (path->slots[*level] >=
				1706	btrfs_header_nritems(cur))
				1707	break;
				1708
				1709	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1710	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1711	blocksize = btrfs_level_size(root, *level - 1);
				1712
				1713	parent = path->nodes[*level];
				1714	root_owner = btrfs_header_owner(parent);
				1715	root_gen = btrfs_header_generation(parent);
				1716
				1717	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1718
				1719	wc->process_func(root, next, wc, ptr_gen);
				1720
				1721	if (*level == 1) {
				1722	path->slots[*level]++;
				1723	if (wc->free) {
				1724	btrfs_read_buffer(next, ptr_gen);
				1725
				1726	btrfs_tree_lock(next);
				1727	clean_tree_block(trans, root, next);
				1728	btrfs_wait_tree_block_writeback(next);
				1729	btrfs_tree_unlock(next);
				1730
				1731	ret = btrfs_drop_leaf_ref(trans, root, next);
				1732	BUG_ON(ret);
				1733
				1734	WARN_ON(root_owner !=
				1735	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1736	ret = btrfs_free_reserved_extent(root,
				1737	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1738	BUG_ON(ret);
				1739	}
				1740	free_extent_buffer(next);
				1741	continue;
				1742	}
				1743	btrfs_read_buffer(next, ptr_gen);
				1744
				1745	WARN_ON(*level <= 0);
				1746	if (path->nodes[*level-1])
				1747	free_extent_buffer(path->nodes[*level-1]);
				1748	path->nodes[*level-1] = next;
				1749	*level = btrfs_header_level(next);
				1750	path->slots[*level] = 0;
				1751	cond_resched();
				1752	}
				1753	WARN_ON(*level < 0);
				1754	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1755
				1756	if (path->nodes[*level] == root->node) {
				1757	parent = path->nodes[*level];
				1758	} else {
				1759	parent = path->nodes[*level + 1];
				1760	}
				1761	bytenr = path->nodes[*level]->start;
				1762
				1763	blocksize = btrfs_level_size(root, *level);
				1764	root_owner = btrfs_header_owner(parent);
				1765	root_gen = btrfs_header_generation(parent);
				1766
				1767	wc->process_func(root, path->nodes[*level], wc,
				1768	btrfs_header_generation(path->nodes[*level]));
				1769
				1770	if (wc->free) {
				1771	next = path->nodes[*level];
				1772	btrfs_tree_lock(next);
				1773	clean_tree_block(trans, root, next);
				1774	btrfs_wait_tree_block_writeback(next);
				1775	btrfs_tree_unlock(next);
				1776
				1777	if (*level == 0) {
				1778	ret = btrfs_drop_leaf_ref(trans, root, next);
				1779	BUG_ON(ret);
				1780	}
				1781	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1782	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1783	BUG_ON(ret);
				1784	}
				1785	free_extent_buffer(path->nodes[*level]);
				1786	path->nodes[*level] = NULL;
				1787	*level += 1;
				1788
				1789	cond_resched();
				1790	return 0;
				1791	}
				1792
				1793	static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
				1794	struct btrfs_root *root,
				1795	struct btrfs_path path, int level,
				1796	struct walk_control *wc)
				1797	{
				1798	u64 root_owner;
				1799	u64 root_gen;
				1800	int i;
				1801	int slot;
				1802	int ret;
				1803
				1804	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				1805	slot = path->slots[i];
				1806	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1807	struct extent_buffer *node;
				1808	node = path->nodes[i];
				1809	path->slots[i]++;
				1810	*level = i;
				1811	WARN_ON(*level == 0);
				1812	return 0;
				1813	} else {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	1814	struct extent_buffer *parent;
				1815	if (path->nodes[*level] == root->node)
				1816	parent = path->nodes[*level];
				1817	else
				1818	parent = path->nodes[*level + 1];
				1819
				1820	root_owner = btrfs_header_owner(parent);
				1821	root_gen = btrfs_header_generation(parent);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1822	wc->process_func(root, path->nodes[*level], wc,
				1823	btrfs_header_generation(path->nodes[*level]));
				1824	if (wc->free) {
				1825	struct extent_buffer *next;
				1826
				1827	next = path->nodes[*level];
				1828
				1829	btrfs_tree_lock(next);
				1830	clean_tree_block(trans, root, next);
				1831	btrfs_wait_tree_block_writeback(next);
				1832	btrfs_tree_unlock(next);
				1833
				1834	if (*level == 0) {
				1835	ret = btrfs_drop_leaf_ref(trans, root,
				1836	next);
				1837	BUG_ON(ret);
				1838	}
				1839
				1840	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1841	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1842	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1843	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1844	BUG_ON(ret);
				1845	}
				1846	free_extent_buffer(path->nodes[*level]);
				1847	path->nodes[*level] = NULL;
				1848	*level = i + 1;
				1849	}
				1850	}
				1851	return 1;
				1852	}
				1853
				1854	/*
				1855	* drop the reference count on the tree rooted at 'snap'. This traverses
				1856	* the tree freeing any blocks that have a ref count of zero after being
				1857	* decremented.
				1858	*/
				1859	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1860	struct btrfs_root log, struct walk_control wc)
				1861	{
				1862	int ret = 0;
				1863	int wret;
				1864	int level;
				1865	struct btrfs_path *path;
				1866	int i;
				1867	int orig_level;
				1868
				1869	path = btrfs_alloc_path();
				1870	BUG_ON(!path);
				1871
				1872	level = btrfs_header_level(log->node);
				1873	orig_level = level;
				1874	path->nodes[level] = log->node;
				1875	extent_buffer_get(log->node);
				1876	path->slots[level] = 0;
				1877
				1878	while(1) {
				1879	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1880	if (wret > 0)
				1881	break;
				1882	if (wret < 0)
				1883	ret = wret;
				1884
				1885	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1886	if (wret > 0)
				1887	break;
				1888	if (wret < 0)
				1889	ret = wret;
				1890	}
				1891
				1892	/* was the root node processed? if not, catch it here */
				1893	if (path->nodes[orig_level]) {
				1894	wc->process_func(log, path->nodes[orig_level], wc,
				1895	btrfs_header_generation(path->nodes[orig_level]));
				1896	if (wc->free) {
				1897	struct extent_buffer *next;
				1898
				1899	next = path->nodes[orig_level];
				1900
				1901	btrfs_tree_lock(next);
				1902	clean_tree_block(trans, log, next);
				1903	btrfs_wait_tree_block_writeback(next);
				1904	btrfs_tree_unlock(next);
				1905
				1906	if (orig_level == 0) {
				1907	ret = btrfs_drop_leaf_ref(trans, log,
				1908	next);
				1909	BUG_ON(ret);
				1910	}
				1911	WARN_ON(log->root_key.objectid !=
				1912	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1913	ret = btrfs_free_reserved_extent(log, next->start,
				1914	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1915	BUG_ON(ret);
				1916	}
				1917	}
				1918
				1919	for (i = 0; i <= orig_level; i++) {
				1920	if (path->nodes[i]) {
				1921	free_extent_buffer(path->nodes[i]);
				1922	path->nodes[i] = NULL;
				1923	}
				1924	}
				1925	btrfs_free_path(path);
				1926	if (wc->free)
				1927	free_extent_buffer(log->node);
				1928	return ret;
				1929	}
				1930
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	1931	static int wait_log_commit(struct btrfs_root *log)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1932	{
				1933	DEFINE_WAIT(wait);
				1934	u64 transid = log->fs_info->tree_log_transid;
				1935
				1936	do {
				1937	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1938	TASK_UNINTERRUPTIBLE);
				1939	mutex_unlock(&log->fs_info->tree_log_mutex);
				1940	if (atomic_read(&log->fs_info->tree_log_commit))
				1941	schedule();
				1942	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1943	mutex_lock(&log->fs_info->tree_log_mutex);
				1944	} while(transid == log->fs_info->tree_log_transid &&
				1945	atomic_read(&log->fs_info->tree_log_commit));
				1946	return 0;
				1947	}
				1948
				1949	/*
				1950	* btrfs_sync_log does sends a given tree log down to the disk and
				1951	* updates the super blocks to record it. When this call is done,
				1952	* you know that any inodes previously logged are safely on disk
				1953	*/
				1954	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1955	struct btrfs_root *root)
				1956	{
				1957	int ret;
				1958	unsigned long batch;
				1959	struct btrfs_root *log = root->log_root;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1960
				1961	mutex_lock(&log->fs_info->tree_log_mutex);
				1962	if (atomic_read(&log->fs_info->tree_log_commit)) {
				1963	wait_log_commit(log);
				1964	goto out;
				1965	}
				1966	atomic_set(&log->fs_info->tree_log_commit, 1);
				1967
				1968	while(1) {
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	1969	batch = log->fs_info->tree_log_batch;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1970	mutex_unlock(&log->fs_info->tree_log_mutex);
				1971	schedule_timeout_uninterruptible(1);
				1972	mutex_lock(&log->fs_info->tree_log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1973
				1974	while(atomic_read(&log->fs_info->tree_log_writers)) {
				1975	DEFINE_WAIT(wait);
				1976	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1977	TASK_UNINTERRUPTIBLE);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1978	mutex_unlock(&log->fs_info->tree_log_mutex);
				1979	if (atomic_read(&log->fs_info->tree_log_writers))
				1980	schedule();
				1981	mutex_lock(&log->fs_info->tree_log_mutex);
				1982	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1983	}
				1984	if (batch == log->fs_info->tree_log_batch)
				1985	break;
				1986	}
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1987
				1988	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1989	BUG_ON(ret);
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1990	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
				1991	&root->fs_info->log_root_tree->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1992	BUG_ON(ret);
				1993
				1994	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
				1995	log->fs_info->log_root_tree->node->start);
				1996	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
				1997	btrfs_header_level(log->fs_info->log_root_tree->node));
				1998
				1999	write_ctree_super(trans, log->fs_info->tree_root);
				2000	log->fs_info->tree_log_transid++;
				2001	log->fs_info->tree_log_batch = 0;
				2002	atomic_set(&log->fs_info->tree_log_commit, 0);
				2003	smp_mb();
				2004	if (waitqueue_active(&log->fs_info->tree_log_wait))
				2005	wake_up(&log->fs_info->tree_log_wait);
				2006	out:
				2007	mutex_unlock(&log->fs_info->tree_log_mutex);
				2008	return 0;
				2009
				2010	}
				2011
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2012	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2013	* at commit time of the full transaction
				2014	*/
				2015	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2016	{
				2017	int ret;
				2018	struct btrfs_root *log;
				2019	struct key;
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2020	u64 start;
				2021	u64 end;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2022	struct walk_control wc = {
				2023	.free = 1,
				2024	.process_func = process_one_buffer
				2025	};
				2026
				2027	if (!root->log_root)
				2028	return 0;
				2029
				2030	log = root->log_root;
				2031	ret = walk_log_tree(trans, log, &wc);
				2032	BUG_ON(ret);
				2033
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2034	while(1) {
				2035	ret = find_first_extent_bit(&log->dirty_log_pages,
				2036	0, &start, &end, EXTENT_DIRTY);
				2037	if (ret)
				2038	break;
				2039
				2040	clear_extent_dirty(&log->dirty_log_pages,
				2041	start, end, GFP_NOFS);
				2042	}
				2043
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2044	log = root->log_root;
				2045	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2046	&log->root_key);
				2047	BUG_ON(ret);
				2048	root->log_root = NULL;
				2049	kfree(root->log_root);
				2050	return 0;
				2051	}
				2052
				2053	/*
				2054	* helper function to update the item for a given subvolumes log root
				2055	* in the tree of log roots
				2056	*/
				2057	static int update_log_root(struct btrfs_trans_handle *trans,
				2058	struct btrfs_root *log)
				2059	{
				2060	u64 bytenr = btrfs_root_bytenr(&log->root_item);
				2061	int ret;
				2062
				2063	if (log->node->start == bytenr)
				2064	return 0;
				2065
				2066	btrfs_set_root_bytenr(&log->root_item, log->node->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	2067	btrfs_set_root_generation(&log->root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2068	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				2069	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				2070	&log->root_key, &log->root_item);
				2071	BUG_ON(ret);
				2072	return ret;
				2073	}
				2074
				2075	/*
				2076	* If both a file and directory are logged, and unlinks or renames are
				2077	* mixed in, we have a few interesting corners:
				2078	*
				2079	* create file X in dir Y
				2080	* link file X to X.link in dir Y
				2081	* fsync file X
				2082	* unlink file X but leave X.link
				2083	* fsync dir Y
				2084	*
				2085	* After a crash we would expect only X.link to exist. But file X
				2086	* didn't get fsync'd again so the log has back refs for X and X.link.
				2087	*
				2088	* We solve this by removing directory entries and inode backrefs from the
				2089	* log when a file that was logged in the current transaction is
				2090	* unlinked. Any later fsync will include the updated log entries, and
				2091	* we'll be able to reconstruct the proper directory items from backrefs.
				2092	*
				2093	* This optimizations allows us to avoid relogging the entire inode
				2094	* or the entire directory.
				2095	*/
				2096	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2097	struct btrfs_root *root,
				2098	const char *name, int name_len,
				2099	struct inode *dir, u64 index)
				2100	{
				2101	struct btrfs_root *log;
				2102	struct btrfs_dir_item *di;
				2103	struct btrfs_path *path;
				2104	int ret;
				2105	int bytes_del = 0;
				2106
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2107	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2108	return 0;
				2109
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2110	ret = join_running_log_trans(root);
				2111	if (ret)
				2112	return 0;
				2113
				2114	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2115
				2116	log = root->log_root;
				2117	path = btrfs_alloc_path();
				2118	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2119	name, name_len, -1);
				2120	if (di && !IS_ERR(di)) {
				2121	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2122	bytes_del += name_len;
				2123	BUG_ON(ret);
				2124	}
				2125	btrfs_release_path(log, path);
				2126	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2127	index, name, name_len, -1);
				2128	if (di && !IS_ERR(di)) {
				2129	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2130	bytes_del += name_len;
				2131	BUG_ON(ret);
				2132	}
				2133
				2134	/* update the directory size in the log to reflect the names
				2135	* we have removed
				2136	*/
				2137	if (bytes_del) {
				2138	struct btrfs_key key;
				2139
				2140	key.objectid = dir->i_ino;
				2141	key.offset = 0;
				2142	key.type = BTRFS_INODE_ITEM_KEY;
				2143	btrfs_release_path(log, path);
				2144
				2145	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2146	if (ret == 0) {
				2147	struct btrfs_inode_item *item;
				2148	u64 i_size;
				2149
				2150	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2151	struct btrfs_inode_item);
				2152	i_size = btrfs_inode_size(path->nodes[0], item);
				2153	if (i_size > bytes_del)
				2154	i_size -= bytes_del;
				2155	else
				2156	i_size = 0;
				2157	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2158	btrfs_mark_buffer_dirty(path->nodes[0]);
				2159	} else
				2160	ret = 0;
				2161	btrfs_release_path(log, path);
				2162	}
				2163
				2164	btrfs_free_path(path);
				2165	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2166	end_log_trans(root);
				2167
				2168	return 0;
				2169	}
				2170
				2171	/* see comments for btrfs_del_dir_entries_in_log */
				2172	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2173	struct btrfs_root *root,
				2174	const char *name, int name_len,
				2175	struct inode *inode, u64 dirid)
				2176	{
				2177	struct btrfs_root *log;
				2178	u64 index;
				2179	int ret;
				2180
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2181	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2182	return 0;
				2183
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2184	ret = join_running_log_trans(root);
				2185	if (ret)
				2186	return 0;
				2187	log = root->log_root;
				2188	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2189
				2190	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2191	dirid, &index);
				2192	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2193	end_log_trans(root);
				2194
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2195	return ret;
				2196	}
				2197
				2198	/*
				2199	* creates a range item in the log for 'dirid'. first_offset and
				2200	* last_offset tell us which parts of the key space the log should
				2201	* be considered authoritative for.
				2202	*/
				2203	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2204	struct btrfs_root *log,
				2205	struct btrfs_path *path,
				2206	int key_type, u64 dirid,
				2207	u64 first_offset, u64 last_offset)
				2208	{
				2209	int ret;
				2210	struct btrfs_key key;
				2211	struct btrfs_dir_log_item *item;
				2212
				2213	key.objectid = dirid;
				2214	key.offset = first_offset;
				2215	if (key_type == BTRFS_DIR_ITEM_KEY)
				2216	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2217	else
				2218	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2219	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2220	BUG_ON(ret);
				2221
				2222	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2223	struct btrfs_dir_log_item);
				2224	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2225	btrfs_mark_buffer_dirty(path->nodes[0]);
				2226	btrfs_release_path(log, path);
				2227	return 0;
				2228	}
				2229
				2230	/*
				2231	* log all the items included in the current transaction for a given
				2232	* directory. This also creates the range items in the log tree required
				2233	* to replay anything deleted before the fsync
				2234	*/
				2235	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2236	struct btrfs_root root, struct inode inode,
				2237	struct btrfs_path *path,
				2238	struct btrfs_path *dst_path, int key_type,
				2239	u64 min_offset, u64 *last_offset_ret)
				2240	{
				2241	struct btrfs_key min_key;
				2242	struct btrfs_key max_key;
				2243	struct btrfs_root *log = root->log_root;
				2244	struct extent_buffer *src;
				2245	int ret;
				2246	int i;
				2247	int nritems;
				2248	u64 first_offset = min_offset;
				2249	u64 last_offset = (u64)-1;
				2250
				2251	log = root->log_root;
				2252	max_key.objectid = inode->i_ino;
				2253	max_key.offset = (u64)-1;
				2254	max_key.type = key_type;
				2255
				2256	min_key.objectid = inode->i_ino;
				2257	min_key.type = key_type;
				2258	min_key.offset = min_offset;
				2259
				2260	path->keep_locks = 1;
				2261
				2262	ret = btrfs_search_forward(root, &min_key, &max_key,
				2263	path, 0, trans->transid);
				2264
				2265	/*
				2266	* we didn't find anything from this transaction, see if there
				2267	* is anything at all
				2268	*/
				2269	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2270	min_key.type != key_type) {
				2271	min_key.objectid = inode->i_ino;
				2272	min_key.type = key_type;
				2273	min_key.offset = (u64)-1;
				2274	btrfs_release_path(root, path);
				2275	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2276	if (ret < 0) {
				2277	btrfs_release_path(root, path);
				2278	return ret;
				2279	}
				2280	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2281
				2282	/* if ret == 0 there are items for this type,
				2283	* create a range to tell us the last key of this type.
				2284	* otherwise, there are no items in this directory after
				2285	* *min_offset, and we create a range to indicate that.
				2286	*/
				2287	if (ret == 0) {
				2288	struct btrfs_key tmp;
				2289	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2290	path->slots[0]);
				2291	if (key_type == tmp.type) {
				2292	first_offset = max(min_offset, tmp.offset) + 1;
				2293	}
				2294	}
				2295	goto done;
				2296	}
				2297
				2298	/* go backward to find any previous key */
				2299	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2300	if (ret == 0) {
				2301	struct btrfs_key tmp;
				2302	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2303	if (key_type == tmp.type) {
				2304	first_offset = tmp.offset;
				2305	ret = overwrite_item(trans, log, dst_path,
				2306	path->nodes[0], path->slots[0],
				2307	&tmp);
				2308	}
				2309	}
				2310	btrfs_release_path(root, path);
				2311
				2312	/* find the first key from this transaction again */
				2313	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2314	if (ret != 0) {
				2315	WARN_ON(1);
				2316	goto done;
				2317	}
				2318
				2319	/*
				2320	* we have a block from this transaction, log every item in it
				2321	* from our directory
				2322	*/
				2323	while(1) {
				2324	struct btrfs_key tmp;
				2325	src = path->nodes[0];
				2326	nritems = btrfs_header_nritems(src);
				2327	for (i = path->slots[0]; i < nritems; i++) {
				2328	btrfs_item_key_to_cpu(src, &min_key, i);
				2329
				2330	if (min_key.objectid != inode->i_ino \|\|
				2331	min_key.type != key_type)
				2332	goto done;
				2333	ret = overwrite_item(trans, log, dst_path, src, i,
				2334	&min_key);
				2335	BUG_ON(ret);
				2336	}
				2337	path->slots[0] = nritems;
				2338
				2339	/*
				2340	* look ahead to the next item and see if it is also
				2341	* from this directory and from this transaction
				2342	*/
				2343	ret = btrfs_next_leaf(root, path);
				2344	if (ret == 1) {
				2345	last_offset = (u64)-1;
				2346	goto done;
				2347	}
				2348	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2349	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2350	last_offset = (u64)-1;
				2351	goto done;
				2352	}
				2353	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2354	ret = overwrite_item(trans, log, dst_path,
				2355	path->nodes[0], path->slots[0],
				2356	&tmp);
				2357
				2358	BUG_ON(ret);
				2359	last_offset = tmp.offset;
				2360	goto done;
				2361	}
				2362	}
				2363	done:
				2364	*last_offset_ret = last_offset;
				2365	btrfs_release_path(root, path);
				2366	btrfs_release_path(log, dst_path);
				2367
				2368	/* insert the log range keys to indicate where the log is valid */
				2369	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2370	first_offset, last_offset);
				2371	BUG_ON(ret);
				2372	return 0;
				2373	}
				2374
				2375	/*
				2376	* logging directories is very similar to logging inodes, We find all the items
				2377	* from the current transaction and write them to the log.
				2378	*
				2379	* The recovery code scans the directory in the subvolume, and if it finds a
				2380	* key in the range logged that is not present in the log tree, then it means
				2381	* that dir entry was unlinked during the transaction.
				2382	*
				2383	* In order for that scan to work, we must include one key smaller than
				2384	* the smallest logged by this transaction and one key larger than the largest
				2385	* key logged by this transaction.
				2386	*/
				2387	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2388	struct btrfs_root root, struct inode inode,
				2389	struct btrfs_path *path,
				2390	struct btrfs_path *dst_path)
				2391	{
				2392	u64 min_key;
				2393	u64 max_key;
				2394	int ret;
				2395	int key_type = BTRFS_DIR_ITEM_KEY;
				2396
				2397	again:
				2398	min_key = 0;
				2399	max_key = 0;
				2400	while(1) {
				2401	ret = log_dir_items(trans, root, inode, path,
				2402	dst_path, key_type, min_key,
				2403	&max_key);
				2404	BUG_ON(ret);
				2405	if (max_key == (u64)-1)
				2406	break;
				2407	min_key = max_key + 1;
				2408	}
				2409
				2410	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2411	key_type = BTRFS_DIR_INDEX_KEY;
				2412	goto again;
				2413	}
				2414	return 0;
				2415	}
				2416
				2417	/*
				2418	* a helper function to drop items from the log before we relog an
				2419	* inode. max_key_type indicates the highest item type to remove.
				2420	* This cannot be run for file data extents because it does not
				2421	* free the extents they point to.
				2422	*/
				2423	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2424	struct btrfs_root *log,
				2425	struct btrfs_path *path,
				2426	u64 objectid, int max_key_type)
				2427	{
				2428	int ret;
				2429	struct btrfs_key key;
				2430	struct btrfs_key found_key;
				2431
				2432	key.objectid = objectid;
				2433	key.type = max_key_type;
				2434	key.offset = (u64)-1;
				2435
				2436	while(1) {
				2437	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2438
				2439	if (ret != 1)
				2440	break;
				2441
				2442	if (path->slots[0] == 0)
				2443	break;
				2444
				2445	path->slots[0]--;
				2446	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2447	path->slots[0]);
				2448
				2449	if (found_key.objectid != objectid)
				2450	break;
				2451
				2452	ret = btrfs_del_item(trans, log, path);
				2453	BUG_ON(ret);
				2454	btrfs_release_path(log, path);
				2455	}
				2456	btrfs_release_path(log, path);
				2457	return 0;
				2458	}
				2459
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	2460	static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
				2461	struct list_head *list,
				2462	struct btrfs_root *root,
				2463	u64 disk_bytenr, u64 len)
				2464	{
				2465	struct btrfs_ordered_sum *sums;
				2466	struct btrfs_sector_sum *sector_sum;
				2467	int ret;
				2468	struct btrfs_path *path;
				2469	struct btrfs_csum_item *item = NULL;
				2470	u64 end = disk_bytenr + len;
				2471	u64 item_start_offset = 0;
				2472	u64 item_last_offset = 0;
				2473	u32 diff;
				2474	u32 sum;
				2475	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
				2476
				2477	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
				2478
				2479	sector_sum = sums->sums;
				2480	sums->bytenr = disk_bytenr;
				2481	sums->len = len;
				2482	list_add_tail(&sums->list, list);
				2483
				2484	path = btrfs_alloc_path();
				2485	while(disk_bytenr < end) {
				2486	if (!item \|\| disk_bytenr < item_start_offset \|\|
				2487	disk_bytenr >= item_last_offset) {
				2488	struct btrfs_key found_key;
				2489	u32 item_size;
				2490
				2491	if (item)
				2492	btrfs_release_path(root, path);
				2493	item = btrfs_lookup_csum(NULL, root, path,
				2494	disk_bytenr, 0);
				2495	if (IS_ERR(item)) {
				2496	ret = PTR_ERR(item);
				2497	if (ret == -ENOENT \|\| ret == -EFBIG)
				2498	ret = 0;
				2499	sum = 0;
				2500	printk("log no csum found for byte %llu\n",
				2501	(unsigned long long)disk_bytenr);
				2502	item = NULL;
				2503	btrfs_release_path(root, path);
				2504	goto found;
				2505	}
				2506	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2507	path->slots[0]);
				2508
				2509	item_start_offset = found_key.offset;
				2510	item_size = btrfs_item_size_nr(path->nodes[0],
				2511	path->slots[0]);
				2512	item_last_offset = item_start_offset +
				2513	(item_size / csum_size) *
				2514	root->sectorsize;
				2515	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2516	struct btrfs_csum_item);
				2517	}
				2518	/*
				2519	* this byte range must be able to fit inside
				2520	* a single leaf so it will also fit inside a u32
				2521	*/
				2522	diff = disk_bytenr - item_start_offset;
				2523	diff = diff / root->sectorsize;
				2524	diff = diff * csum_size;
				2525
				2526	read_extent_buffer(path->nodes[0], &sum,
				2527	((unsigned long)item) + diff,
				2528	csum_size);
				2529	found:
				2530	sector_sum->bytenr = disk_bytenr;
				2531	sector_sum->sum = sum;
				2532	disk_bytenr += root->sectorsize;
				2533	sector_sum++;
				2534	}
				2535	btrfs_free_path(path);
				2536	return 0;
				2537	}
				2538
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2539	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2540	struct btrfs_root *log,
				2541	struct btrfs_path *dst_path,
				2542	struct extent_buffer *src,
				2543	int start_slot, int nr, int inode_only)
				2544	{
				2545	unsigned long src_offset;
				2546	unsigned long dst_offset;
				2547	struct btrfs_file_extent_item *extent;
				2548	struct btrfs_inode_item *inode_item;
				2549	int ret;
				2550	struct btrfs_key *ins_keys;
				2551	u32 *ins_sizes;
				2552	char *ins_data;
				2553	int i;
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	2554	struct list_head ordered_sums;
				2555
				2556	INIT_LIST_HEAD(&ordered_sums);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2557
				2558	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2559	nr * sizeof(u32), GFP_NOFS);
				2560	ins_sizes = (u32 *)ins_data;
				2561	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2562
				2563	for (i = 0; i < nr; i++) {
				2564	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2565	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2566	}
				2567	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2568	ins_keys, ins_sizes, nr);
				2569	BUG_ON(ret);
				2570
				2571	for (i = 0; i < nr; i++) {
				2572	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2573	dst_path->slots[0]);
				2574
				2575	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2576
				2577	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2578	src_offset, ins_sizes[i]);
				2579
				2580	if (inode_only == LOG_INODE_EXISTS &&
				2581	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2582	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2583	dst_path->slots[0],
				2584	struct btrfs_inode_item);
				2585	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2586
				2587	/* set the generation to zero so the recover code
				2588	* can tell the difference between an logging
				2589	* just to say 'this inode exists' and a logging
				2590	* to say 'update this inode with these values'
				2591	*/
				2592	btrfs_set_inode_generation(dst_path->nodes[0],
				2593	inode_item, 0);
				2594	}
				2595	/* take a reference on file data extents so that truncates
				2596	* or deletes of this inode don't have to relog the inode
				2597	* again
				2598	*/
				2599	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2600	int found_type;
				2601	extent = btrfs_item_ptr(src, start_slot + i,
				2602	struct btrfs_file_extent_item);
				2603
				2604	found_type = btrfs_file_extent_type(src, extent);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	2605	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				2606	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2607	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2608	extent);
				2609	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2610	extent);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	2611	u64 cs = btrfs_file_extent_offset(src, extent);
				2612	u64 cl = btrfs_file_extent_num_bytes(src,
				2613	extent);;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2614	/* ds == 0 is a hole */
				2615	if (ds != 0) {
				2616	ret = btrfs_inc_extent_ref(trans, log,
				2617	ds, dl,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2618	dst_path->nodes[0]->start,
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2619	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2620	trans->transid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	2621	ins_keys[i].objectid);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2622	BUG_ON(ret);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	2623	ret = copy_extent_csums(trans,
				2624	&ordered_sums,
				2625	log->fs_info->csum_root,
				2626	ds + cs, cl);
				2627	BUG_ON(ret);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2628	}
				2629	}
				2630	}
				2631	dst_path->slots[0]++;
				2632	}
				2633
				2634	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2635	btrfs_release_path(log, dst_path);
				2636	kfree(ins_data);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame^]	2637
				2638	/*
				2639	* we have to do this after the loop above to avoid changing the
				2640	* log tree while trying to change the log tree.
				2641	*/
				2642	while(!list_empty(&ordered_sums)) {
				2643	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				2644	struct btrfs_ordered_sum,
				2645	list);
				2646	ret = btrfs_csum_file_blocks(trans, log, sums);
				2647	BUG_ON(ret);
				2648	list_del(&sums->list);
				2649	kfree(sums);
				2650	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2651	return 0;
				2652	}
				2653
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2654	/* log a single inode in the tree log.
				2655	* At least one parent directory for this inode must exist in the tree
				2656	* or be logged already.
				2657	*
				2658	* Any items from this inode changed by the current transaction are copied
				2659	* to the log tree. An extra reference is taken on any extents in this
				2660	* file, allowing us to avoid a whole pile of corner cases around logging
				2661	* blocks that have been removed from the tree.
				2662	*
				2663	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2664	* does.
				2665	*
				2666	* This handles both files and directories.
				2667	*/
				2668	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2669	struct btrfs_root root, struct inode inode,
				2670	int inode_only)
				2671	{
				2672	struct btrfs_path *path;
				2673	struct btrfs_path *dst_path;
				2674	struct btrfs_key min_key;
				2675	struct btrfs_key max_key;
				2676	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2677	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2678	u32 size;
				2679	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2680	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2681	int ins_start_slot = 0;
				2682	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2683
				2684	log = root->log_root;
				2685
				2686	path = btrfs_alloc_path();
				2687	dst_path = btrfs_alloc_path();
				2688
				2689	min_key.objectid = inode->i_ino;
				2690	min_key.type = BTRFS_INODE_ITEM_KEY;
				2691	min_key.offset = 0;
				2692
				2693	max_key.objectid = inode->i_ino;
				2694	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2695	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2696	else
				2697	max_key.type = (u8)-1;
				2698	max_key.offset = (u64)-1;
				2699
				2700	/*
				2701	* if this inode has already been logged and we're in inode_only
				2702	* mode, we don't want to delete the things that have already
				2703	* been written to the log.
				2704	*
				2705	* But, if the inode has been through an inode_only log,
				2706	* the logged_trans field is not set. This allows us to catch
				2707	* any new names for this inode in the backrefs by logging it
				2708	* again
				2709	*/
				2710	if (inode_only == LOG_INODE_EXISTS &&
				2711	BTRFS_I(inode)->logged_trans == trans->transid) {
				2712	btrfs_free_path(path);
				2713	btrfs_free_path(dst_path);
				2714	goto out;
				2715	}
				2716	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2717
				2718	/*
				2719	* a brute force approach to making sure we get the most uptodate
				2720	* copies of everything.
				2721	*/
				2722	if (S_ISDIR(inode->i_mode)) {
				2723	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2724
				2725	if (inode_only == LOG_INODE_EXISTS)
				2726	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2727	ret = drop_objectid_items(trans, log, path,
				2728	inode->i_ino, max_key_type);
				2729	} else {
				2730	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2731	}
				2732	BUG_ON(ret);
				2733	path->keep_locks = 1;
				2734
				2735	while(1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2736	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2737	ret = btrfs_search_forward(root, &min_key, &max_key,
				2738	path, 0, trans->transid);
				2739	if (ret != 0)
				2740	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2741	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2742	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2743	if (min_key.objectid != inode->i_ino)
				2744	break;
				2745	if (min_key.type > max_key.type)
				2746	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2747
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2748	src = path->nodes[0];
				2749	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2750	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2751	ins_nr++;
				2752	goto next_slot;
				2753	} else if (!ins_nr) {
				2754	ins_start_slot = path->slots[0];
				2755	ins_nr = 1;
				2756	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2757	}
				2758
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2759	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2760	ins_nr, inode_only);
				2761	BUG_ON(ret);
				2762	ins_nr = 1;
				2763	ins_start_slot = path->slots[0];
				2764	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2765
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2766	nritems = btrfs_header_nritems(path->nodes[0]);
				2767	path->slots[0]++;
				2768	if (path->slots[0] < nritems) {
				2769	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2770	path->slots[0]);
				2771	goto again;
				2772	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2773	if (ins_nr) {
				2774	ret = copy_items(trans, log, dst_path, src,
				2775	ins_start_slot,
				2776	ins_nr, inode_only);
				2777	BUG_ON(ret);
				2778	ins_nr = 0;
				2779	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2780	btrfs_release_path(root, path);
				2781
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2782	if (min_key.offset < (u64)-1)
				2783	min_key.offset++;
				2784	else if (min_key.type < (u8)-1)
				2785	min_key.type++;
				2786	else if (min_key.objectid < (u64)-1)
				2787	min_key.objectid++;
				2788	else
				2789	break;
				2790	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2791	if (ins_nr) {
				2792	ret = copy_items(trans, log, dst_path, src,
				2793	ins_start_slot,
				2794	ins_nr, inode_only);
				2795	BUG_ON(ret);
				2796	ins_nr = 0;
				2797	}
				2798	WARN_ON(ins_nr);
Chris Mason	9623f9a	2008-09-11 17:42:42 -0400	[diff] [blame]	2799	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2800	btrfs_release_path(root, path);
				2801	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2802	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2803	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2804	BUG_ON(ret);
				2805	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2806	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2807	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2808
				2809	btrfs_free_path(path);
				2810	btrfs_free_path(dst_path);
				2811
				2812	mutex_lock(&root->fs_info->tree_log_mutex);
				2813	ret = update_log_root(trans, log);
				2814	BUG_ON(ret);
				2815	mutex_unlock(&root->fs_info->tree_log_mutex);
				2816	out:
				2817	return 0;
				2818	}
				2819
				2820	int btrfs_log_inode(struct btrfs_trans_handle *trans,
				2821	struct btrfs_root root, struct inode inode,
				2822	int inode_only)
				2823	{
				2824	int ret;
				2825
				2826	start_log_trans(trans, root);
				2827	ret = __btrfs_log_inode(trans, root, inode, inode_only);
				2828	end_log_trans(root);
				2829	return ret;
				2830	}
				2831
				2832	/*
				2833	* helper function around btrfs_log_inode to make sure newly created
				2834	* parent directories also end up in the log. A minimal inode and backref
				2835	* only logging is done of any parent directories that are older than
				2836	* the last committed transaction
				2837	*/
				2838	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2839	struct btrfs_root root, struct dentry dentry)
				2840	{
				2841	int inode_only = LOG_INODE_ALL;
				2842	struct super_block *sb;
				2843	int ret;
				2844
				2845	start_log_trans(trans, root);
				2846	sb = dentry->d_inode->i_sb;
				2847	while(1) {
				2848	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2849	inode_only);
				2850	BUG_ON(ret);
				2851	inode_only = LOG_INODE_EXISTS;
				2852
				2853	dentry = dentry->d_parent;
				2854	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2855	break;
				2856
				2857	if (BTRFS_I(dentry->d_inode)->generation <=
				2858	root->fs_info->last_trans_committed)
				2859	break;
				2860	}
				2861	end_log_trans(root);
				2862	return 0;
				2863	}
				2864
				2865	/*
				2866	* it is not safe to log dentry if the chunk root has added new
				2867	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2868	* If this returns 1, you must commit the transaction to safely get your
				2869	* data on disk.
				2870	*/
				2871	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2872	struct btrfs_root root, struct dentry dentry)
				2873	{
				2874	u64 gen;
				2875	gen = root->fs_info->last_trans_new_blockgroup;
				2876	if (gen > root->fs_info->last_trans_committed)
				2877	return 1;
				2878	else
				2879	return btrfs_log_dentry(trans, root, dentry);
				2880	}
				2881
				2882	/*
				2883	* should be called during mount to recover any replay any log trees
				2884	* from the FS
				2885	*/
				2886	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2887	{
				2888	int ret;
				2889	struct btrfs_path *path;
				2890	struct btrfs_trans_handle *trans;
				2891	struct btrfs_key key;
				2892	struct btrfs_key found_key;
				2893	struct btrfs_key tmp_key;
				2894	struct btrfs_root *log;
				2895	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2896	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2897	struct walk_control wc = {
				2898	.process_func = process_one_buffer,
				2899	.stage = 0,
				2900	};
				2901
				2902	fs_info->log_root_recovering = 1;
				2903	path = btrfs_alloc_path();
				2904	BUG_ON(!path);
				2905
				2906	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2907
				2908	wc.trans = trans;
				2909	wc.pin = 1;
				2910
				2911	walk_log_tree(trans, log_root_tree, &wc);
				2912
				2913	again:
				2914	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2915	key.offset = (u64)-1;
				2916	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2917
				2918	while(1) {
				2919	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2920	if (ret < 0)
				2921	break;
				2922	if (ret > 0) {
				2923	if (path->slots[0] == 0)
				2924	break;
				2925	path->slots[0]--;
				2926	}
				2927	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2928	path->slots[0]);
				2929	btrfs_release_path(log_root_tree, path);
				2930	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2931	break;
				2932
				2933	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2934	&found_key);
				2935	BUG_ON(!log);
				2936
				2937
				2938	tmp_key.objectid = found_key.offset;
				2939	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2940	tmp_key.offset = (u64)-1;
				2941
				2942	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				2943
				2944	BUG_ON(!wc.replay_dest);
				2945
				2946	btrfs_record_root_in_trans(wc.replay_dest);
				2947	ret = walk_log_tree(trans, log, &wc);
				2948	BUG_ON(ret);
				2949
				2950	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2951	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2952	path);
				2953	BUG_ON(ret);
				2954	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2955	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2956	if (ret == 0) {
				2957	wc.replay_dest->highest_inode = highest_inode;
				2958	wc.replay_dest->last_inode_alloc = highest_inode;
				2959	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2960
				2961	key.offset = found_key.offset - 1;
				2962	free_extent_buffer(log->node);
				2963	kfree(log);
				2964
				2965	if (found_key.offset == 0)
				2966	break;
				2967	}
				2968	btrfs_release_path(log_root_tree, path);
				2969
				2970	/* step one is to pin it all, step two is to replay just inodes */
				2971	if (wc.pin) {
				2972	wc.pin = 0;
				2973	wc.process_func = replay_one_buffer;
				2974	wc.stage = LOG_WALK_REPLAY_INODES;
				2975	goto again;
				2976	}
				2977	/* step three is to replay everything */
				2978	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2979	wc.stage++;
				2980	goto again;
				2981	}
				2982
				2983	btrfs_free_path(path);
				2984
				2985	free_extent_buffer(log_root_tree->node);
				2986	log_root_tree->log_root = NULL;
				2987	fs_info->log_root_recovering = 0;
				2988
				2989	/* step 4: commit the transaction, which also unpins the blocks */
				2990	btrfs_commit_transaction(trans, fs_info->tree_root);
				2991
				2992	kfree(log_root_tree);
				2993	return 0;
				2994	}