Blame - fs/ext3/inode.c - SHIFTPHONES/mainline/linux

blob: ea5888688f9471735395537eeab09c31472fc21f [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/ext3/inode.c
				3	*
				4	* Copyright (C) 1992, 1993, 1994, 1995
				5	* Remy Card (card@masi.ibp.fr)
				6	* Laboratoire MASI - Institut Blaise Pascal
				7	* Universite Pierre et Marie Curie (Paris VI)
				8	*
				9	* from
				10	*
				11	* linux/fs/minix/inode.c
				12	*
				13	* Copyright (C) 1991, 1992 Linus Torvalds
				14	*
				15	* Goal-directed block allocation by Stephen Tweedie
				16	* (sct@redhat.com), 1993, 1998
				17	* Big-endian to little-endian byte-swapping/bitmaps by
				18	* David S. Miller (davem@caip.rutgers.edu), 1995
				19	* 64-bit file support on 64-bit platforms by Jakub Jelinek
				20	* (jj@sunsite.ms.mff.cuni.cz)
				21	*
				22	* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
				23	*/
				24
				25	#include <linux/module.h>
				26	#include <linux/fs.h>
				27	#include <linux/time.h>
				28	#include <linux/ext3_jbd.h>
				29	#include <linux/jbd.h>
				30	#include <linux/smp_lock.h>
				31	#include <linux/highuid.h>
				32	#include <linux/pagemap.h>
				33	#include <linux/quotaops.h>
				34	#include <linux/string.h>
				35	#include <linux/buffer_head.h>
				36	#include <linux/writeback.h>
				37	#include <linux/mpage.h>
				38	#include <linux/uio.h>
				39	#include "xattr.h"
				40	#include "acl.h"
				41
				42	static int ext3_writepage_trans_blocks(struct inode *inode);
				43
				44	/*
				45	* Test whether an inode is a fast symlink.
				46	*/
				47	static inline int ext3_inode_is_fast_symlink(struct inode *inode)
				48	{
				49	int ea_blocks = EXT3_I(inode)->i_file_acl ?
				50	(inode->i_sb->s_blocksize >> 9) : 0;
				51
				52	return (S_ISLNK(inode->i_mode) &&
				53	inode->i_blocks - ea_blocks == 0);
				54	}
				55
				56	/* The ext3 forget function must perform a revoke if we are freeing data
				57	* which has been journaled. Metadata (eg. indirect blocks) must be
				58	* revoked in all cases.
				59	*
				60	* "bh" may be NULL: a metadata block may have been freed from memory
				61	* but there may still be a record of it in the journal, and that record
				62	* still needs to be revoked.
				63	*/
				64
				65	int ext3_forget(handle_t *handle, int is_metadata,
				66	struct inode inode, struct buffer_head bh,
				67	int blocknr)
				68	{
				69	int err;
				70
				71	might_sleep();
				72
				73	BUFFER_TRACE(bh, "enter");
				74
				75	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
				76	"data mode %lx\n",
				77	bh, is_metadata, inode->i_mode,
				78	test_opt(inode->i_sb, DATA_FLAGS));
				79
				80	/* Never use the revoke function if we are doing full data
				81	* journaling: there is no need to, and a V1 superblock won't
				82	* support it. Otherwise, only skip the revoke on un-journaled
				83	* data blocks. */
				84
				85	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA \|\|
				86	(!is_metadata && !ext3_should_journal_data(inode))) {
				87	if (bh) {
				88	BUFFER_TRACE(bh, "call journal_forget");
				89	return ext3_journal_forget(handle, bh);
				90	}
				91	return 0;
				92	}
				93
				94	/*
				95	* data!=journal && (is_metadata \|\| should_journal_data(inode))
				96	*/
				97	BUFFER_TRACE(bh, "call ext3_journal_revoke");
				98	err = ext3_journal_revoke(handle, blocknr, bh);
				99	if (err)
				100	ext3_abort(inode->i_sb, __FUNCTION__,
				101	"error %d when attempting revoke", err);
				102	BUFFER_TRACE(bh, "exit");
				103	return err;
				104	}
				105
				106	/*
				107	* Work out how many blocks we need to progress with the next chunk of a
				108	* truncate transaction.
				109	*/
				110
				111	static unsigned long blocks_for_truncate(struct inode *inode)
				112	{
				113	unsigned long needed;
				114
				115	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
				116
				117	/* Give ourselves just enough room to cope with inodes in which
				118	* i_blocks is corrupt: we've seen disk corruptions in the past
				119	* which resulted in random data in an inode which looked enough
				120	* like a regular file for ext3 to try to delete it. Things
				121	* will go a bit crazy if that happens, but at least we should
				122	* try not to panic the whole kernel. */
				123	if (needed < 2)
				124	needed = 2;
				125
				126	/* But we need to bound the transaction so we don't overflow the
				127	* journal. */
				128	if (needed > EXT3_MAX_TRANS_DATA)
				129	needed = EXT3_MAX_TRANS_DATA;
				130
				131	return EXT3_DATA_TRANS_BLOCKS + needed;
				132	}
				133
				134	/*
				135	* Truncate transactions can be complex and absolutely huge. So we need to
				136	* be able to restart the transaction at a conventient checkpoint to make
				137	* sure we don't overflow the journal.
				138	*
				139	* start_transaction gets us a new handle for a truncate transaction,
				140	* and extend_transaction tries to extend the existing one a bit. If
				141	* extend fails, we need to propagate the failure up and restart the
				142	* transaction in the top-level truncate loop. --sct
				143	*/
				144
				145	static handle_t start_transaction(struct inode inode)
				146	{
				147	handle_t *result;
				148
				149	result = ext3_journal_start(inode, blocks_for_truncate(inode));
				150	if (!IS_ERR(result))
				151	return result;
				152
				153	ext3_std_error(inode->i_sb, PTR_ERR(result));
				154	return result;
				155	}
				156
				157	/*
				158	* Try to extend this transaction for the purposes of truncation.
				159	*
				160	* Returns 0 if we managed to create more room. If we can't create more
				161	* room, and the transaction must be restarted we return 1.
				162	*/
				163	static int try_to_extend_transaction(handle_t handle, struct inode inode)
				164	{
				165	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
				166	return 0;
				167	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
				168	return 0;
				169	return 1;
				170	}
				171
				172	/*
				173	* Restart the transaction associated with *handle. This does a commit,
				174	* so before we call here everything must be consistently dirtied against
				175	* this transaction.
				176	*/
				177	static int ext3_journal_test_restart(handle_t handle, struct inode inode)
				178	{
				179	jbd_debug(2, "restarting handle %p\n", handle);
				180	return ext3_journal_restart(handle, blocks_for_truncate(inode));
				181	}
				182
				183	/*
				184	* Called at the last iput() if i_nlink is zero.
				185	*/
				186	void ext3_delete_inode (struct inode * inode)
				187	{
				188	handle_t *handle;
				189
				190	if (is_bad_inode(inode))
				191	goto no_delete;
				192
				193	handle = start_transaction(inode);
				194	if (IS_ERR(handle)) {
				195	/* If we're going to skip the normal cleanup, we still
				196	* need to make sure that the in-core orphan linked list
				197	* is properly cleaned up. */
				198	ext3_orphan_del(NULL, inode);
				199	goto no_delete;
				200	}
				201
				202	if (IS_SYNC(inode))
				203	handle->h_sync = 1;
				204	inode->i_size = 0;
				205	if (inode->i_blocks)
				206	ext3_truncate(inode);
				207	/*
				208	* Kill off the orphan record which ext3_truncate created.
				209	* AKPM: I think this can be inside the above `if'.
				210	* Note that ext3_orphan_del() has to be able to cope with the
				211	* deletion of a non-existent orphan - this is because we don't
				212	* know if ext3_truncate() actually created an orphan record.
				213	* (Well, we could do this if we need to, but heck - it works)
				214	*/
				215	ext3_orphan_del(handle, inode);
				216	EXT3_I(inode)->i_dtime = get_seconds();
				217
				218	/*
				219	* One subtle ordering requirement: if anything has gone wrong
				220	* (transaction abort, IO errors, whatever), then we can still
				221	* do these next steps (the fs will already have been marked as
				222	* having errors), but we can't free the inode if the mark_dirty
				223	* fails.
				224	*/
				225	if (ext3_mark_inode_dirty(handle, inode))
				226	/* If that failed, just do the required in-core inode clear. */
				227	clear_inode(inode);
				228	else
				229	ext3_free_inode(handle, inode);
				230	ext3_journal_stop(handle);
				231	return;
				232	no_delete:
				233	clear_inode(inode); /* We must guarantee clearing of inode... */
				234	}
				235
				236	static int ext3_alloc_block (handle_t *handle,
				237	struct inode * inode, unsigned long goal, int *err)
				238	{
				239	unsigned long result;
				240
				241	result = ext3_new_block(handle, inode, goal, err);
				242	return result;
				243	}
				244
				245
				246	typedef struct {
				247	__le32 *p;
				248	__le32 key;
				249	struct buffer_head *bh;
				250	} Indirect;
				251
				252	static inline void add_chain(Indirect p, struct buffer_head bh, __le32 *v)
				253	{
				254	p->key = *(p->p = v);
				255	p->bh = bh;
				256	}
				257
				258	static inline int verify_chain(Indirect from, Indirect to)
				259	{
				260	while (from <= to && from->key == *from->p)
				261	from++;
				262	return (from > to);
				263	}
				264
				265	/**
				266	* ext3_block_to_path - parse the block number into array of offsets
				267	* @inode: inode in question (we are only interested in its superblock)
				268	* @i_block: block number to be parsed
				269	* @offsets: array to store the offsets in
				270	* @boundary: set this non-zero if the referred-to block is likely to be
				271	* followed (on disk) by an indirect block.
				272	*
				273	* To store the locations of file's data ext3 uses a data structure common
				274	* for UNIX filesystems - tree of pointers anchored in the inode, with
				275	* data blocks at leaves and indirect blocks in intermediate nodes.
				276	* This function translates the block number into path in that tree -
				277	* return value is the path length and @offsets[n] is the offset of
				278	* pointer to (n+1)th node in the nth one. If @block is out of range
				279	* (negative or too large) warning is printed and zero returned.
				280	*
				281	* Note: function doesn't find node addresses, so no IO is needed. All
				282	* we need to know is the capacity of indirect blocks (taken from the
				283	* inode->i_sb).
				284	*/
				285
				286	/*
				287	* Portability note: the last comparison (check that we fit into triple
				288	* indirect block) is spelled differently, because otherwise on an
				289	* architecture with 32-bit longs and 8Kb pages we might get into trouble
				290	* if our filesystem had 8Kb blocks. We might use long long, but that would
				291	* kill us on x86. Oh, well, at least the sign propagation does not matter -
				292	* i_block would have to be negative in the very beginning, so we would not
				293	* get there at all.
				294	*/
				295
				296	static int ext3_block_to_path(struct inode *inode,
				297	long i_block, int offsets[4], int *boundary)
				298	{
				299	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				300	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
				301	const long direct_blocks = EXT3_NDIR_BLOCKS,
				302	indirect_blocks = ptrs,
				303	double_blocks = (1 << (ptrs_bits * 2));
				304	int n = 0;
				305	int final = 0;
				306
				307	if (i_block < 0) {
				308	ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
				309	} else if (i_block < direct_blocks) {
				310	offsets[n++] = i_block;
				311	final = direct_blocks;
				312	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
				313	offsets[n++] = EXT3_IND_BLOCK;
				314	offsets[n++] = i_block;
				315	final = ptrs;
				316	} else if ((i_block -= indirect_blocks) < double_blocks) {
				317	offsets[n++] = EXT3_DIND_BLOCK;
				318	offsets[n++] = i_block >> ptrs_bits;
				319	offsets[n++] = i_block & (ptrs - 1);
				320	final = ptrs;
				321	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
				322	offsets[n++] = EXT3_TIND_BLOCK;
				323	offsets[n++] = i_block >> (ptrs_bits * 2);
				324	offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
				325	offsets[n++] = i_block & (ptrs - 1);
				326	final = ptrs;
				327	} else {
				328	ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
				329	}
				330	if (boundary)
				331	*boundary = (i_block & (ptrs - 1)) == (final - 1);
				332	return n;
				333	}
				334
				335	/**
				336	* ext3_get_branch - read the chain of indirect blocks leading to data
				337	* @inode: inode in question
				338	* @depth: depth of the chain (1 - direct pointer, etc.)
				339	* @offsets: offsets of pointers in inode/indirect blocks
				340	* @chain: place to store the result
				341	* @err: here we store the error value
				342	*
				343	* Function fills the array of triples <key, p, bh> and returns %NULL
				344	* if everything went OK or the pointer to the last filled triple
				345	* (incomplete one) otherwise. Upon the return chain[i].key contains
				346	* the number of (i+1)-th block in the chain (as it is stored in memory,
				347	* i.e. little-endian 32-bit), chain[i].p contains the address of that
				348	* number (it points into struct inode for i==0 and into the bh->b_data
				349	* for i>0) and chain[i].bh points to the buffer_head of i-th indirect
				350	* block for i>0 and NULL for i==0. In other words, it holds the block
				351	* numbers of the chain, addresses they were taken from (and where we can
				352	* verify that chain did not change) and buffer_heads hosting these
				353	* numbers.
				354	*
				355	* Function stops when it stumbles upon zero pointer (absent block)
				356	* (pointer to last triple returned, *@err == 0)
				357	* or when it gets an IO error reading an indirect block
				358	* (ditto, *@err == -EIO)
				359	* or when it notices that chain had been changed while it was reading
				360	* (ditto, *@err == -EAGAIN)
				361	* or when it reads all @depth-1 indirect blocks successfully and finds
				362	* the whole chain, all way to the data (returns %NULL, *err == 0).
				363	*/
				364	static Indirect ext3_get_branch(struct inode inode, int depth, int *offsets,
				365	Indirect chain[4], int *err)
				366	{
				367	struct super_block *sb = inode->i_sb;
				368	Indirect *p = chain;
				369	struct buffer_head *bh;
				370
				371	*err = 0;
				372	/* i_data is not going away, no lock needed */
				373	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
				374	if (!p->key)
				375	goto no_block;
				376	while (--depth) {
				377	bh = sb_bread(sb, le32_to_cpu(p->key));
				378	if (!bh)
				379	goto failure;
				380	/* Reader: pointers */
				381	if (!verify_chain(chain, p))
				382	goto changed;
				383	add_chain(++p, bh, (__le32)bh->b_data + ++offsets);
				384	/* Reader: end */
				385	if (!p->key)
				386	goto no_block;
				387	}
				388	return NULL;
				389
				390	changed:
				391	brelse(bh);
				392	*err = -EAGAIN;
				393	goto no_block;
				394	failure:
				395	*err = -EIO;
				396	no_block:
				397	return p;
				398	}
				399
				400	/**
				401	* ext3_find_near - find a place for allocation with sufficient locality
				402	* @inode: owner
				403	* @ind: descriptor of indirect block.
				404	*
				405	* This function returns the prefered place for block allocation.
				406	* It is used when heuristic for sequential allocation fails.
				407	* Rules are:
				408	* + if there is a block to the left of our position - allocate near it.
				409	* + if pointer will live in indirect block - allocate near that block.
				410	* + if pointer will live in inode - allocate in the same
				411	* cylinder group.
				412	*
				413	* In the latter case we colour the starting block by the callers PID to
				414	* prevent it from clashing with concurrent allocations for a different inode
				415	* in the same block group. The PID is used here so that functionally related
				416	* files will be close-by on-disk.
				417	*
				418	* Caller must make sure that @ind is valid and will stay that way.
				419	*/
				420
				421	static unsigned long ext3_find_near(struct inode inode, Indirect ind)
				422	{
				423	struct ext3_inode_info *ei = EXT3_I(inode);
				424	__le32 start = ind->bh ? (__le32) ind->bh->b_data : ei->i_data;
				425	__le32 *p;
				426	unsigned long bg_start;
				427	unsigned long colour;
				428
				429	/* Try to find previous block */
				430	for (p = ind->p - 1; p >= start; p--)
				431	if (*p)
				432	return le32_to_cpu(*p);
				433
				434	/* No such thing, so let's try location of indirect block */
				435	if (ind->bh)
				436	return ind->bh->b_blocknr;
				437
				438	/*
				439	* It is going to be refered from inode itself? OK, just put it into
				440	* the same cylinder group then.
				441	*/
				442	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
				443	le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
				444	colour = (current->pid % 16) *
				445	(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
				446	return bg_start + colour;
				447	}
				448
				449	/**
				450	* ext3_find_goal - find a prefered place for allocation.
				451	* @inode: owner
				452	* @block: block we want
				453	* @chain: chain of indirect blocks
				454	* @partial: pointer to the last triple within a chain
				455	* @goal: place to store the result.
				456	*
				457	* Normally this function find the prefered place for block allocation,
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	458	* stores it in *@goal and returns zero.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	459	*/
				460
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	461	static unsigned long ext3_find_goal(struct inode *inode, long block,
				462	Indirect chain[4], Indirect *partial)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	463	{
				464	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
				465
				466	/*
				467	* try the heuristic for sequential allocation,
				468	* failing that at least try to get decent locality.
				469	*/
				470	if (block_i && (block == block_i->last_alloc_logical_block + 1)
				471	&& (block_i->last_alloc_physical_block != 0)) {
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	472	return block_i->last_alloc_physical_block + 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	473	}
				474
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	475	return ext3_find_near(inode, partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476	}
				477
				478	/**
				479	* ext3_alloc_branch - allocate and set up a chain of blocks.
				480	* @inode: owner
				481	* @num: depth of the chain (number of blocks to allocate)
				482	* @offsets: offsets (in the blocks) to store the pointers to next.
				483	* @branch: place to store the chain in.
				484	*
				485	* This function allocates @num blocks, zeroes out all but the last one,
				486	* links them into chain and (if we are synchronous) writes them to disk.
				487	* In other words, it prepares a branch that can be spliced onto the
				488	* inode. It stores the information about that chain in the branch[], in
				489	* the same format as ext3_get_branch() would do. We are calling it after
				490	* we had read the existing part of chain and partial points to the last
				491	* triple of that (one with zero ->key). Upon the exit we have the same
				492	* picture as after the successful ext3_get_block(), excpet that in one
				493	* place chain is disconnected - *branch->p is still zero (we did not
				494	* set the last link), but branch->key contains the number that should
				495	* be placed into *branch->p to fill that gap.
				496	*
				497	* If allocation fails we free all blocks we've allocated (and forget
				498	* their buffer_heads) and return the error value the from failed
				499	* ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
				500	* as described above and return 0.
				501	*/
				502
				503	static int ext3_alloc_branch(handle_t handle, struct inode inode,
				504	int num,
				505	unsigned long goal,
				506	int *offsets,
				507	Indirect *branch)
				508	{
				509	int blocksize = inode->i_sb->s_blocksize;
				510	int n = 0, keys = 0;
				511	int err = 0;
				512	int i;
				513	int parent = ext3_alloc_block(handle, inode, goal, &err);
				514
				515	branch[0].key = cpu_to_le32(parent);
				516	if (parent) {
				517	for (n = 1; n < num; n++) {
				518	struct buffer_head *bh;
				519	/* Allocate the next block */
				520	int nr = ext3_alloc_block(handle, inode, parent, &err);
				521	if (!nr)
				522	break;
				523	branch[n].key = cpu_to_le32(nr);
				524	keys = n+1;
				525
				526	/*
				527	* Get buffer_head for parent block, zero it out
				528	* and set the pointer to new one, then send
				529	* parent to disk.
				530	*/
				531	bh = sb_getblk(inode->i_sb, parent);
				532	branch[n].bh = bh;
				533	lock_buffer(bh);
				534	BUFFER_TRACE(bh, "call get_create_access");
				535	err = ext3_journal_get_create_access(handle, bh);
				536	if (err) {
				537	unlock_buffer(bh);
				538	brelse(bh);
				539	break;
				540	}
				541
				542	memset(bh->b_data, 0, blocksize);
				543	branch[n].p = (__le32*) bh->b_data + offsets[n];
				544	*branch[n].p = branch[n].key;
				545	BUFFER_TRACE(bh, "marking uptodate");
				546	set_buffer_uptodate(bh);
				547	unlock_buffer(bh);
				548
				549	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				550	err = ext3_journal_dirty_metadata(handle, bh);
				551	if (err)
				552	break;
				553
				554	parent = nr;
				555	}
				556	}
				557	if (n == num)
				558	return 0;
				559
				560	/* Allocation failed, free what we already allocated */
				561	for (i = 1; i < keys; i++) {
				562	BUFFER_TRACE(branch[i].bh, "call journal_forget");
				563	ext3_journal_forget(handle, branch[i].bh);
				564	}
				565	for (i = 0; i < keys; i++)
				566	ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
				567	return err;
				568	}
				569
				570	/**
				571	* ext3_splice_branch - splice the allocated branch onto inode.
				572	* @inode: owner
				573	* @block: (logical) number of block we are adding
				574	* @chain: chain of indirect blocks (with a missing link - see
				575	* ext3_alloc_branch)
				576	* @where: location of missing link
				577	* @num: number of blocks we are adding
				578	*
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	579	* This function fills the missing link and does all housekeeping needed in
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	580	* inode (->i_blocks, etc.). In case of success we end up with the full
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	581	* chain to new block and return 0.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	582	*/
				583
				584	static int ext3_splice_branch(handle_t handle, struct inode inode, long block,
				585	Indirect chain[4], Indirect *where, int num)
				586	{
				587	int i;
				588	int err = 0;
				589	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
				590
				591	/*
				592	* If we're splicing into a [td]indirect block (as opposed to the
				593	* inode) then we need to get write access to the [td]indirect block
				594	* before the splice.
				595	*/
				596	if (where->bh) {
				597	BUFFER_TRACE(where->bh, "get_write_access");
				598	err = ext3_journal_get_write_access(handle, where->bh);
				599	if (err)
				600	goto err_out;
				601	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	602	/* That's it */
				603
				604	*where->p = where->key;
				605
				606	/*
				607	* update the most recently allocated logical & physical block
				608	* in i_block_alloc_info, to assist find the proper goal block for next
				609	* allocation
				610	*/
				611	if (block_i) {
				612	block_i->last_alloc_logical_block = block;
				613	block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
				614	}
				615
				616	/* We are done with atomic stuff, now do the rest of housekeeping */
				617
				618	inode->i_ctime = CURRENT_TIME_SEC;
				619	ext3_mark_inode_dirty(handle, inode);
				620
				621	/* had we spliced it onto indirect block? */
				622	if (where->bh) {
				623	/*
				624	* akpm: If we spliced it onto an indirect block, we haven't
				625	* altered the inode. Note however that if it is being spliced
				626	* onto an indirect block at the very end of the file (the
				627	* file is growing) then we will alter the inode to reflect
				628	* the new i_size. But that is not done here - it is done in
				629	* generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
				630	*/
				631	jbd_debug(5, "splicing indirect only\n");
				632	BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
				633	err = ext3_journal_dirty_metadata(handle, where->bh);
				634	if (err)
				635	goto err_out;
				636	} else {
				637	/*
				638	* OK, we spliced it into the inode itself on a direct block.
				639	* Inode was dirtied above.
				640	*/
				641	jbd_debug(5, "splicing direct\n");
				642	}
				643	return err;
				644
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	645	err_out:
				646	for (i = 1; i < num; i++) {
				647	BUFFER_TRACE(where[i].bh, "call journal_forget");
				648	ext3_journal_forget(handle, where[i].bh);
				649	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	650	return err;
				651	}
				652
				653	/*
				654	* Allocation strategy is simple: if we have to allocate something, we will
				655	* have to go the whole way to leaf. So let's do it before attaching anything
				656	* to tree, set linkage between the newborn blocks, write them if sync is
				657	* required, recheck the path, free and repeat if check fails, otherwise
				658	* set the last missing link (that will protect us from any truncate-generated
				659	* removals - all blocks on the path are immune now) and possibly force the
				660	* write on the parent block.
				661	* That has a nice additional property: no special recovery from the failed
				662	* allocations is needed - we simply release blocks and do not touch anything
				663	* reachable from inode.
				664	*
				665	* akpm: `handle' can be NULL if create == 0.
				666	*
				667	* The BKL may not be held on entry here. Be sure to take it early.
				668	*/
				669
				670	static int
				671	ext3_get_block_handle(handle_t handle, struct inode inode, sector_t iblock,
				672	struct buffer_head *bh_result, int create, int extend_disksize)
				673	{
				674	int err = -EIO;
				675	int offsets[4];
				676	Indirect chain[4];
				677	Indirect *partial;
				678	unsigned long goal;
				679	int left;
				680	int boundary = 0;
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	681	const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	682	struct ext3_inode_info *ei = EXT3_I(inode);
				683
				684	J_ASSERT(handle != NULL \|\| create == 0);
				685
				686	if (depth == 0)
				687	goto out;
				688
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	689	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
				690
				691	/* Simplest case - block found, no allocation needed */
				692	if (!partial) {
				693	clear_buffer_new(bh_result);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	694	goto got_it;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	695	}
				696
				697	/* Next simple case - plain lookup or failed read of indirect block */
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	698	if (!create \|\| err == -EIO)
				699	goto cleanup;
				700
				701	down(&ei->truncate_sem);
				702
				703	/*
				704	* If the indirect block is missing while we are reading
				705	* the chain(ext3_get_branch() returns -EAGAIN err), or
				706	* if the chain has been changed after we grab the semaphore,
				707	* (either because another process truncated this branch, or
				708	* another get_block allocated this branch) re-grab the chain to see if
				709	* the request block has been allocated or not.
				710	*
				711	* Since we already block the truncate/other get_block
				712	* at this point, we will have the current copy of the chain when we
				713	* splice the branch into the tree.
				714	*/
				715	if (err == -EAGAIN \|\| !verify_chain(chain, partial)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	716	while (partial > chain) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	717	brelse(partial->bh);
				718	partial--;
				719	}
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	720	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
				721	if (!partial) {
				722	up(&ei->truncate_sem);
				723	if (err)
				724	goto cleanup;
				725	clear_buffer_new(bh_result);
				726	goto got_it;
				727	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	728	}
				729
				730	/*
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	731	* Okay, we need to do block allocation. Lazily initialize the block
				732	* allocation info here if necessary
				733	*/
				734	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	735	ext3_init_block_alloc_info(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	736
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	737	goal = ext3_find_goal(inode, iblock, chain, partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	738
				739	left = (chain + depth) - partial;
				740
				741	/*
				742	* Block out ext3_truncate while we alter the tree
				743	*/
				744	err = ext3_alloc_branch(handle, inode, left, goal,
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	745	offsets + (partial - chain), partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	746
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	747	/*
				748	* The ext3_splice_branch call will free and forget any buffers
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	749	* on the new chain if there is a failure, but that risks using
				750	* up transaction credits, especially for bitmaps where the
				751	* credits cannot be returned. Can we handle this somehow? We
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	752	* may need to return -EAGAIN upwards in the worst case. --sct
				753	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	754	if (!err)
				755	err = ext3_splice_branch(handle, inode, iblock, chain,
				756	partial, left);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	757	/*
				758	* i_disksize growing is protected by truncate_sem. Don't forget to
				759	* protect it if you're about to implement concurrent
				760	* ext3_get_block() -bzzz
				761	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	762	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
				763	ei->i_disksize = inode->i_size;
				764	up(&ei->truncate_sem);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	765	if (err)
				766	goto cleanup;
				767
				768	set_buffer_new(bh_result);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	769	got_it:
				770	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
				771	if (boundary)
				772	set_buffer_boundary(bh_result);
				773	/* Clean up and exit */
				774	partial = chain + depth - 1; /* the whole chain */
				775	cleanup:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	776	while (partial > chain) {
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	777	BUFFER_TRACE(partial->bh, "call brelse");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	778	brelse(partial->bh);
				779	partial--;
				780	}
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame^]	781	BUFFER_TRACE(bh_result, "returned");
				782	out:
				783	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	784	}
				785
				786	static int ext3_get_block(struct inode *inode, sector_t iblock,
				787	struct buffer_head *bh_result, int create)
				788	{
				789	handle_t *handle = NULL;
				790	int ret;
				791
				792	if (create) {
				793	handle = ext3_journal_current_handle();
				794	J_ASSERT(handle != 0);
				795	}
				796	ret = ext3_get_block_handle(handle, inode, iblock,
				797	bh_result, create, 1);
				798	return ret;
				799	}
				800
				801	#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
				802
				803	static int
				804	ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
				805	unsigned long max_blocks, struct buffer_head *bh_result,
				806	int create)
				807	{
				808	handle_t *handle = journal_current_handle();
				809	int ret = 0;
				810
				811	if (!handle)
				812	goto get_block; /* A read */
				813
				814	if (handle->h_transaction->t_state == T_LOCKED) {
				815	/*
				816	* Huge direct-io writes can hold off commits for long
				817	* periods of time. Let this commit run.
				818	*/
				819	ext3_journal_stop(handle);
				820	handle = ext3_journal_start(inode, DIO_CREDITS);
				821	if (IS_ERR(handle))
				822	ret = PTR_ERR(handle);
				823	goto get_block;
				824	}
				825
				826	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
				827	/*
				828	* Getting low on buffer credits...
				829	*/
				830	ret = ext3_journal_extend(handle, DIO_CREDITS);
				831	if (ret > 0) {
				832	/*
				833	* Couldn't extend the transaction. Start a new one.
				834	*/
				835	ret = ext3_journal_restart(handle, DIO_CREDITS);
				836	}
				837	}
				838
				839	get_block:
				840	if (ret == 0)
				841	ret = ext3_get_block_handle(handle, inode, iblock,
				842	bh_result, create, 0);
				843	bh_result->b_size = (1 << inode->i_blkbits);
				844	return ret;
				845	}
				846
				847	static int ext3_writepages_get_block(struct inode *inode, sector_t iblock,
				848	struct buffer_head *bh, int create)
				849	{
				850	return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create);
				851	}
				852
				853	/*
				854	* `handle' can be NULL if create is zero
				855	*/
				856	struct buffer_head ext3_getblk(handle_t handle, struct inode * inode,
				857	long block, int create, int * errp)
				858	{
				859	struct buffer_head dummy;
				860	int fatal = 0, err;
				861
				862	J_ASSERT(handle != NULL \|\| create == 0);
				863
				864	dummy.b_state = 0;
				865	dummy.b_blocknr = -1000;
				866	buffer_trace_init(&dummy.b_history);
				867	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
				868	if (!*errp && buffer_mapped(&dummy)) {
				869	struct buffer_head *bh;
				870	bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
				871	if (buffer_new(&dummy)) {
				872	J_ASSERT(create != 0);
				873	J_ASSERT(handle != 0);
				874
				875	/* Now that we do not always journal data, we
				876	should keep in mind whether this should
				877	always journal the new buffer as metadata.
				878	For now, regular file writes use
				879	ext3_get_block instead, so it's not a
				880	problem. */
				881	lock_buffer(bh);
				882	BUFFER_TRACE(bh, "call get_create_access");
				883	fatal = ext3_journal_get_create_access(handle, bh);
				884	if (!fatal && !buffer_uptodate(bh)) {
				885	memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				886	set_buffer_uptodate(bh);
				887	}
				888	unlock_buffer(bh);
				889	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				890	err = ext3_journal_dirty_metadata(handle, bh);
				891	if (!fatal)
				892	fatal = err;
				893	} else {
				894	BUFFER_TRACE(bh, "not a new buffer");
				895	}
				896	if (fatal) {
				897	*errp = fatal;
				898	brelse(bh);
				899	bh = NULL;
				900	}
				901	return bh;
				902	}
				903	return NULL;
				904	}
				905
				906	struct buffer_head ext3_bread(handle_t handle, struct inode * inode,
				907	int block, int create, int *err)
				908	{
				909	struct buffer_head * bh;
				910
				911	bh = ext3_getblk(handle, inode, block, create, err);
				912	if (!bh)
				913	return bh;
				914	if (buffer_uptodate(bh))
				915	return bh;
				916	ll_rw_block(READ, 1, &bh);
				917	wait_on_buffer(bh);
				918	if (buffer_uptodate(bh))
				919	return bh;
				920	put_bh(bh);
				921	*err = -EIO;
				922	return NULL;
				923	}
				924
				925	static int walk_page_buffers( handle_t *handle,
				926	struct buffer_head *head,
				927	unsigned from,
				928	unsigned to,
				929	int *partial,
				930	int (fn)( handle_t handle,
				931	struct buffer_head *bh))
				932	{
				933	struct buffer_head *bh;
				934	unsigned block_start, block_end;
				935	unsigned blocksize = head->b_size;
				936	int err, ret = 0;
				937	struct buffer_head *next;
				938
				939	for ( bh = head, block_start = 0;
				940	ret == 0 && (bh != head \|\| !block_start);
				941	block_start = block_end, bh = next)
				942	{
				943	next = bh->b_this_page;
				944	block_end = block_start + blocksize;
				945	if (block_end <= from \|\| block_start >= to) {
				946	if (partial && !buffer_uptodate(bh))
				947	*partial = 1;
				948	continue;
				949	}
				950	err = (*fn)(handle, bh);
				951	if (!ret)
				952	ret = err;
				953	}
				954	return ret;
				955	}
				956
				957	/*
				958	* To preserve ordering, it is essential that the hole instantiation and
				959	* the data write be encapsulated in a single transaction. We cannot
				960	* close off a transaction and start a new one between the ext3_get_block()
				961	* and the commit_write(). So doing the journal_start at the start of
				962	* prepare_write() is the right place.
				963	*
				964	* Also, this function can nest inside ext3_writepage() ->
				965	* block_write_full_page(). In that case, we know that ext3_writepage()
				966	* has generated enough buffer credits to do the whole page. So we won't
				967	* block on the journal in that case, which is good, because the caller may
				968	* be PF_MEMALLOC.
				969	*
				970	* By accident, ext3 can be reentered when a transaction is open via
				971	* quota file writes. If we were to commit the transaction while thus
				972	* reentered, there can be a deadlock - we would be holding a quota
				973	* lock, and the commit would never complete if another thread had a
				974	* transaction open and was blocking on the quota lock - a ranking
				975	* violation.
				976	*
				977	* So what we do is to rely on the fact that journal_stop/journal_start
				978	* will _not_ run commit under these circumstances because handle->h_ref
				979	* is elevated. We'll still have enough credits for the tiny quotafile
				980	* write.
				981	*/
				982
				983	static int do_journal_get_write_access(handle_t *handle,
				984	struct buffer_head *bh)
				985	{
				986	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				987	return 0;
				988	return ext3_journal_get_write_access(handle, bh);
				989	}
				990
				991	static int ext3_prepare_write(struct file file, struct page page,
				992	unsigned from, unsigned to)
				993	{
				994	struct inode *inode = page->mapping->host;
				995	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
				996	handle_t *handle;
				997	int retries = 0;
				998
				999	retry:
				1000	handle = ext3_journal_start(inode, needed_blocks);
				1001	if (IS_ERR(handle)) {
				1002	ret = PTR_ERR(handle);
				1003	goto out;
				1004	}
				1005	if (test_opt(inode->i_sb, NOBH))
				1006	ret = nobh_prepare_write(page, from, to, ext3_get_block);
				1007	else
				1008	ret = block_prepare_write(page, from, to, ext3_get_block);
				1009	if (ret)
				1010	goto prepare_write_failed;
				1011
				1012	if (ext3_should_journal_data(inode)) {
				1013	ret = walk_page_buffers(handle, page_buffers(page),
				1014	from, to, NULL, do_journal_get_write_access);
				1015	}
				1016	prepare_write_failed:
				1017	if (ret)
				1018	ext3_journal_stop(handle);
				1019	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
				1020	goto retry;
				1021	out:
				1022	return ret;
				1023	}
				1024
				1025	int
				1026	ext3_journal_dirty_data(handle_t handle, struct buffer_head bh)
				1027	{
				1028	int err = journal_dirty_data(handle, bh);
				1029	if (err)
				1030	ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
				1031	bh, handle,err);
				1032	return err;
				1033	}
				1034
				1035	/* For commit_write() in data=journal mode */
				1036	static int commit_write_fn(handle_t handle, struct buffer_head bh)
				1037	{
				1038	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				1039	return 0;
				1040	set_buffer_uptodate(bh);
				1041	return ext3_journal_dirty_metadata(handle, bh);
				1042	}
				1043
				1044	/*
				1045	* We need to pick up the new inode size which generic_commit_write gave us
				1046	* `file' can be NULL - eg, when called from page_symlink().
				1047	*
				1048	* ext3 never places buffers on inode->i_mapping->private_list. metadata
				1049	* buffers are managed internally.
				1050	*/
				1051
				1052	static int ext3_ordered_commit_write(struct file file, struct page page,
				1053	unsigned from, unsigned to)
				1054	{
				1055	handle_t *handle = ext3_journal_current_handle();
				1056	struct inode *inode = page->mapping->host;
				1057	int ret = 0, ret2;
				1058
				1059	ret = walk_page_buffers(handle, page_buffers(page),
				1060	from, to, NULL, ext3_journal_dirty_data);
				1061
				1062	if (ret == 0) {
				1063	/*
				1064	* generic_commit_write() will run mark_inode_dirty() if i_size
				1065	* changes. So let's piggyback the i_disksize mark_inode_dirty
				1066	* into that.
				1067	*/
				1068	loff_t new_i_size;
				1069
				1070	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1071	if (new_i_size > EXT3_I(inode)->i_disksize)
				1072	EXT3_I(inode)->i_disksize = new_i_size;
				1073	ret = generic_commit_write(file, page, from, to);
				1074	}
				1075	ret2 = ext3_journal_stop(handle);
				1076	if (!ret)
				1077	ret = ret2;
				1078	return ret;
				1079	}
				1080
				1081	static int ext3_writeback_commit_write(struct file file, struct page page,
				1082	unsigned from, unsigned to)
				1083	{
				1084	handle_t *handle = ext3_journal_current_handle();
				1085	struct inode *inode = page->mapping->host;
				1086	int ret = 0, ret2;
				1087	loff_t new_i_size;
				1088
				1089	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1090	if (new_i_size > EXT3_I(inode)->i_disksize)
				1091	EXT3_I(inode)->i_disksize = new_i_size;
				1092
				1093	if (test_opt(inode->i_sb, NOBH))
				1094	ret = nobh_commit_write(file, page, from, to);
				1095	else
				1096	ret = generic_commit_write(file, page, from, to);
				1097
				1098	ret2 = ext3_journal_stop(handle);
				1099	if (!ret)
				1100	ret = ret2;
				1101	return ret;
				1102	}
				1103
				1104	static int ext3_journalled_commit_write(struct file *file,
				1105	struct page *page, unsigned from, unsigned to)
				1106	{
				1107	handle_t *handle = ext3_journal_current_handle();
				1108	struct inode *inode = page->mapping->host;
				1109	int ret = 0, ret2;
				1110	int partial = 0;
				1111	loff_t pos;
				1112
				1113	/*
				1114	* Here we duplicate the generic_commit_write() functionality
				1115	*/
				1116	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1117
				1118	ret = walk_page_buffers(handle, page_buffers(page), from,
				1119	to, &partial, commit_write_fn);
				1120	if (!partial)
				1121	SetPageUptodate(page);
				1122	if (pos > inode->i_size)
				1123	i_size_write(inode, pos);
				1124	EXT3_I(inode)->i_state \|= EXT3_STATE_JDATA;
				1125	if (inode->i_size > EXT3_I(inode)->i_disksize) {
				1126	EXT3_I(inode)->i_disksize = inode->i_size;
				1127	ret2 = ext3_mark_inode_dirty(handle, inode);
				1128	if (!ret)
				1129	ret = ret2;
				1130	}
				1131	ret2 = ext3_journal_stop(handle);
				1132	if (!ret)
				1133	ret = ret2;
				1134	return ret;
				1135	}
				1136
				1137	/*
				1138	* bmap() is special. It gets used by applications such as lilo and by
				1139	* the swapper to find the on-disk block of a specific piece of data.
				1140	*
				1141	* Naturally, this is dangerous if the block concerned is still in the
				1142	* journal. If somebody makes a swapfile on an ext3 data-journaling
				1143	* filesystem and enables swap, then they may get a nasty shock when the
				1144	* data getting swapped to that swapfile suddenly gets overwritten by
				1145	* the original zero's written out previously to the journal and
				1146	* awaiting writeback in the kernel's buffer cache.
				1147	*
				1148	* So, if we see any bmap calls here on a modified, data-journaled file,
				1149	* take extra steps to flush any blocks which might be in the cache.
				1150	*/
				1151	static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
				1152	{
				1153	struct inode *inode = mapping->host;
				1154	journal_t *journal;
				1155	int err;
				1156
				1157	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
				1158	/*
				1159	* This is a REALLY heavyweight approach, but the use of
				1160	* bmap on dirty files is expected to be extremely rare:
				1161	* only if we run lilo or swapon on a freshly made file
				1162	* do we expect this to happen.
				1163	*
				1164	* (bmap requires CAP_SYS_RAWIO so this does not
				1165	* represent an unprivileged user DOS attack --- we'd be
				1166	* in trouble if mortal users could trigger this path at
				1167	* will.)
				1168	*
				1169	* NB. EXT3_STATE_JDATA is not set on files other than
				1170	* regular files. If somebody wants to bmap a directory
				1171	* or symlink and gets confused because the buffer
				1172	* hasn't yet been flushed to disk, they deserve
				1173	* everything they get.
				1174	*/
				1175
				1176	EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
				1177	journal = EXT3_JOURNAL(inode);
				1178	journal_lock_updates(journal);
				1179	err = journal_flush(journal);
				1180	journal_unlock_updates(journal);
				1181
				1182	if (err)
				1183	return 0;
				1184	}
				1185
				1186	return generic_block_bmap(mapping,block,ext3_get_block);
				1187	}
				1188
				1189	static int bget_one(handle_t handle, struct buffer_head bh)
				1190	{
				1191	get_bh(bh);
				1192	return 0;
				1193	}
				1194
				1195	static int bput_one(handle_t handle, struct buffer_head bh)
				1196	{
				1197	put_bh(bh);
				1198	return 0;
				1199	}
				1200
				1201	static int journal_dirty_data_fn(handle_t handle, struct buffer_head bh)
				1202	{
				1203	if (buffer_mapped(bh))
				1204	return ext3_journal_dirty_data(handle, bh);
				1205	return 0;
				1206	}
				1207
				1208	/*
				1209	* Note that we always start a transaction even if we're not journalling
				1210	* data. This is to preserve ordering: any hole instantiation within
				1211	* __block_write_full_page -> ext3_get_block() should be journalled
				1212	* along with the data so we don't crash and then get metadata which
				1213	* refers to old data.
				1214	*
				1215	* In all journalling modes block_write_full_page() will start the I/O.
				1216	*
				1217	* Problem:
				1218	*
				1219	* ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
				1220	* ext3_writepage()
				1221	*
				1222	* Similar for:
				1223	*
				1224	* ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
				1225	*
				1226	* Same applies to ext3_get_block(). We will deadlock on various things like
				1227	* lock_journal and i_truncate_sem.
				1228	*
				1229	* Setting PF_MEMALLOC here doesn't work - too many internal memory
				1230	* allocations fail.
				1231	*
				1232	* 16May01: If we're reentered then journal_current_handle() will be
				1233	* non-zero. We simply return.
				1234	*
				1235	* 1 July 2001: @@@ FIXME:
				1236	* In journalled data mode, a data buffer may be metadata against the
				1237	* current transaction. But the same file is part of a shared mapping
				1238	* and someone does a writepage() on it.
				1239	*
				1240	* We will move the buffer onto the async_data list, but after it has
				1241	* been dirtied. So there's a small window where we have dirty data on
				1242	* BJ_Metadata.
				1243	*
				1244	* Note that this only applies to the last partial page in the file. The
				1245	* bit which block_write_full_page() uses prepare/commit for. (That's
				1246	* broken code anyway: it's wrong for msync()).
				1247	*
				1248	* It's a rare case: affects the final partial page, for journalled data
				1249	* where the file is subject to both write() and writepage() in the same
				1250	* transaction. To fix it we'll need a custom block_write_full_page().
				1251	* We'll probably need that anyway for journalling writepage() output.
				1252	*
				1253	* We don't honour synchronous mounts for writepage(). That would be
				1254	* disastrous. Any write() or metadata operation will sync the fs for
				1255	* us.
				1256	*
				1257	* AKPM2: if all the page's buffers are mapped to disk and !data=journal,
				1258	* we don't need to open a transaction here.
				1259	*/
				1260	static int ext3_ordered_writepage(struct page *page,
				1261	struct writeback_control *wbc)
				1262	{
				1263	struct inode *inode = page->mapping->host;
				1264	struct buffer_head *page_bufs;
				1265	handle_t *handle = NULL;
				1266	int ret = 0;
				1267	int err;
				1268
				1269	J_ASSERT(PageLocked(page));
				1270
				1271	/*
				1272	* We give up here if we're reentered, because it might be for a
				1273	* different filesystem.
				1274	*/
				1275	if (ext3_journal_current_handle())
				1276	goto out_fail;
				1277
				1278	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1279
				1280	if (IS_ERR(handle)) {
				1281	ret = PTR_ERR(handle);
				1282	goto out_fail;
				1283	}
				1284
				1285	if (!page_has_buffers(page)) {
				1286	create_empty_buffers(page, inode->i_sb->s_blocksize,
				1287	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1288	}
				1289	page_bufs = page_buffers(page);
				1290	walk_page_buffers(handle, page_bufs, 0,
				1291	PAGE_CACHE_SIZE, NULL, bget_one);
				1292
				1293	ret = block_write_full_page(page, ext3_get_block, wbc);
				1294
				1295	/*
				1296	* The page can become unlocked at any point now, and
				1297	* truncate can then come in and change things. So we
				1298	* can't touch page from now on. But page_bufs is
				1299	* safe due to elevated refcount.
				1300	*/
				1301
				1302	/*
				1303	* And attach them to the current transaction. But only if
				1304	* block_write_full_page() succeeded. Otherwise they are unmapped,
				1305	* and generally junk.
				1306	*/
				1307	if (ret == 0) {
				1308	err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
				1309	NULL, journal_dirty_data_fn);
				1310	if (!ret)
				1311	ret = err;
				1312	}
				1313	walk_page_buffers(handle, page_bufs, 0,
				1314	PAGE_CACHE_SIZE, NULL, bput_one);
				1315	err = ext3_journal_stop(handle);
				1316	if (!ret)
				1317	ret = err;
				1318	return ret;
				1319
				1320	out_fail:
				1321	redirty_page_for_writepage(wbc, page);
				1322	unlock_page(page);
				1323	return ret;
				1324	}
				1325
				1326	static int
				1327	ext3_writeback_writepage_helper(struct page *page,
				1328	struct writeback_control *wbc)
				1329	{
				1330	return block_write_full_page(page, ext3_get_block, wbc);
				1331	}
				1332
				1333	static int
				1334	ext3_writeback_writepages(struct address_space *mapping,
				1335	struct writeback_control *wbc)
				1336	{
				1337	struct inode *inode = mapping->host;
				1338	handle_t *handle = NULL;
				1339	int err, ret = 0;
				1340
				1341	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
				1342	return ret;
				1343
				1344	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1345	if (IS_ERR(handle)) {
				1346	ret = PTR_ERR(handle);
				1347	return ret;
				1348	}
				1349
				1350	ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block,
				1351	ext3_writeback_writepage_helper);
				1352
				1353	/*
				1354	* Need to reaquire the handle since ext3_writepages_get_block()
				1355	* can restart the handle
				1356	*/
				1357	handle = journal_current_handle();
				1358
				1359	err = ext3_journal_stop(handle);
				1360	if (!ret)
				1361	ret = err;
				1362	return ret;
				1363	}
				1364
				1365	static int ext3_writeback_writepage(struct page *page,
				1366	struct writeback_control *wbc)
				1367	{
				1368	struct inode *inode = page->mapping->host;
				1369	handle_t *handle = NULL;
				1370	int ret = 0;
				1371	int err;
				1372
				1373	if (ext3_journal_current_handle())
				1374	goto out_fail;
				1375
				1376	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1377	if (IS_ERR(handle)) {
				1378	ret = PTR_ERR(handle);
				1379	goto out_fail;
				1380	}
				1381
				1382	if (test_opt(inode->i_sb, NOBH))
				1383	ret = nobh_writepage(page, ext3_get_block, wbc);
				1384	else
				1385	ret = block_write_full_page(page, ext3_get_block, wbc);
				1386
				1387	err = ext3_journal_stop(handle);
				1388	if (!ret)
				1389	ret = err;
				1390	return ret;
				1391
				1392	out_fail:
				1393	redirty_page_for_writepage(wbc, page);
				1394	unlock_page(page);
				1395	return ret;
				1396	}
				1397
				1398	static int ext3_journalled_writepage(struct page *page,
				1399	struct writeback_control *wbc)
				1400	{
				1401	struct inode *inode = page->mapping->host;
				1402	handle_t *handle = NULL;
				1403	int ret = 0;
				1404	int err;
				1405
				1406	if (ext3_journal_current_handle())
				1407	goto no_write;
				1408
				1409	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1410	if (IS_ERR(handle)) {
				1411	ret = PTR_ERR(handle);
				1412	goto no_write;
				1413	}
				1414
				1415	if (!page_has_buffers(page) \|\| PageChecked(page)) {
				1416	/*
				1417	* It's mmapped pagecache. Add buffers and journal it. There
				1418	* doesn't seem much point in redirtying the page here.
				1419	*/
				1420	ClearPageChecked(page);
				1421	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
				1422	ext3_get_block);
				1423	if (ret != 0)
				1424	goto out_unlock;
				1425	ret = walk_page_buffers(handle, page_buffers(page), 0,
				1426	PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
				1427
				1428	err = walk_page_buffers(handle, page_buffers(page), 0,
				1429	PAGE_CACHE_SIZE, NULL, commit_write_fn);
				1430	if (ret == 0)
				1431	ret = err;
				1432	EXT3_I(inode)->i_state \|= EXT3_STATE_JDATA;
				1433	unlock_page(page);
				1434	} else {
				1435	/*
				1436	* It may be a page full of checkpoint-mode buffers. We don't
				1437	* really know unless we go poke around in the buffer_heads.
				1438	* But block_write_full_page will do the right thing.
				1439	*/
				1440	ret = block_write_full_page(page, ext3_get_block, wbc);
				1441	}
				1442	err = ext3_journal_stop(handle);
				1443	if (!ret)
				1444	ret = err;
				1445	out:
				1446	return ret;
				1447
				1448	no_write:
				1449	redirty_page_for_writepage(wbc, page);
				1450	out_unlock:
				1451	unlock_page(page);
				1452	goto out;
				1453	}
				1454
				1455	static int ext3_readpage(struct file file, struct page page)
				1456	{
				1457	return mpage_readpage(page, ext3_get_block);
				1458	}
				1459
				1460	static int
				1461	ext3_readpages(struct file file, struct address_space mapping,
				1462	struct list_head *pages, unsigned nr_pages)
				1463	{
				1464	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
				1465	}
				1466
				1467	static int ext3_invalidatepage(struct page *page, unsigned long offset)
				1468	{
				1469	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
				1470
				1471	/*
				1472	* If it's a full truncate we just forget about the pending dirtying
				1473	*/
				1474	if (offset == 0)
				1475	ClearPageChecked(page);
				1476
				1477	return journal_invalidatepage(journal, page, offset);
				1478	}
				1479
				1480	static int ext3_releasepage(struct page *page, int wait)
				1481	{
				1482	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
				1483
				1484	WARN_ON(PageChecked(page));
				1485	if (!page_has_buffers(page))
				1486	return 0;
				1487	return journal_try_to_free_buffers(journal, page, wait);
				1488	}
				1489
				1490	/*
				1491	* If the O_DIRECT write will extend the file then add this inode to the
				1492	* orphan list. So recovery will truncate it back to the original size
				1493	* if the machine crashes during the write.
				1494	*
				1495	* If the O_DIRECT write is intantiating holes inside i_size and the machine
				1496	* crashes then stale disk data _may_ be exposed inside the file.
				1497	*/
				1498	static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
				1499	const struct iovec *iov, loff_t offset,
				1500	unsigned long nr_segs)
				1501	{
				1502	struct file *file = iocb->ki_filp;
				1503	struct inode *inode = file->f_mapping->host;
				1504	struct ext3_inode_info *ei = EXT3_I(inode);
				1505	handle_t *handle = NULL;
				1506	ssize_t ret;
				1507	int orphan = 0;
				1508	size_t count = iov_length(iov, nr_segs);
				1509
				1510	if (rw == WRITE) {
				1511	loff_t final_size = offset + count;
				1512
				1513	handle = ext3_journal_start(inode, DIO_CREDITS);
				1514	if (IS_ERR(handle)) {
				1515	ret = PTR_ERR(handle);
				1516	goto out;
				1517	}
				1518	if (final_size > inode->i_size) {
				1519	ret = ext3_orphan_add(handle, inode);
				1520	if (ret)
				1521	goto out_stop;
				1522	orphan = 1;
				1523	ei->i_disksize = inode->i_size;
				1524	}
				1525	}
				1526
				1527	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
				1528	offset, nr_segs,
				1529	ext3_direct_io_get_blocks, NULL);
				1530
				1531	/*
				1532	* Reacquire the handle: ext3_direct_io_get_block() can restart the
				1533	* transaction
				1534	*/
				1535	handle = journal_current_handle();
				1536
				1537	out_stop:
				1538	if (handle) {
				1539	int err;
				1540
				1541	if (orphan && inode->i_nlink)
				1542	ext3_orphan_del(handle, inode);
				1543	if (orphan && ret > 0) {
				1544	loff_t end = offset + ret;
				1545	if (end > inode->i_size) {
				1546	ei->i_disksize = end;
				1547	i_size_write(inode, end);
				1548	/*
				1549	* We're going to return a positive `ret'
				1550	* here due to non-zero-length I/O, so there's
				1551	* no way of reporting error returns from
				1552	* ext3_mark_inode_dirty() to userspace. So
				1553	* ignore it.
				1554	*/
				1555	ext3_mark_inode_dirty(handle, inode);
				1556	}
				1557	}
				1558	err = ext3_journal_stop(handle);
				1559	if (ret == 0)
				1560	ret = err;
				1561	}
				1562	out:
				1563	return ret;
				1564	}
				1565
				1566	/*
				1567	* Pages can be marked dirty completely asynchronously from ext3's journalling
				1568	* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
				1569	* much here because ->set_page_dirty is called under VFS locks. The page is
				1570	* not necessarily locked.
				1571	*
				1572	* We cannot just dirty the page and leave attached buffers clean, because the
				1573	* buffers' dirty state is "definitive". We cannot just set the buffers dirty
				1574	* or jbddirty because all the journalling code will explode.
				1575	*
				1576	* So what we do is to mark the page "pending dirty" and next time writepage
				1577	* is called, propagate that into the buffers appropriately.
				1578	*/
				1579	static int ext3_journalled_set_page_dirty(struct page *page)
				1580	{
				1581	SetPageChecked(page);
				1582	return __set_page_dirty_nobuffers(page);
				1583	}
				1584
				1585	static struct address_space_operations ext3_ordered_aops = {
				1586	.readpage = ext3_readpage,
				1587	.readpages = ext3_readpages,
				1588	.writepage = ext3_ordered_writepage,
				1589	.sync_page = block_sync_page,
				1590	.prepare_write = ext3_prepare_write,
				1591	.commit_write = ext3_ordered_commit_write,
				1592	.bmap = ext3_bmap,
				1593	.invalidatepage = ext3_invalidatepage,
				1594	.releasepage = ext3_releasepage,
				1595	.direct_IO = ext3_direct_IO,
				1596	};
				1597
				1598	static struct address_space_operations ext3_writeback_aops = {
				1599	.readpage = ext3_readpage,
				1600	.readpages = ext3_readpages,
				1601	.writepage = ext3_writeback_writepage,
				1602	.writepages = ext3_writeback_writepages,
				1603	.sync_page = block_sync_page,
				1604	.prepare_write = ext3_prepare_write,
				1605	.commit_write = ext3_writeback_commit_write,
				1606	.bmap = ext3_bmap,
				1607	.invalidatepage = ext3_invalidatepage,
				1608	.releasepage = ext3_releasepage,
				1609	.direct_IO = ext3_direct_IO,
				1610	};
				1611
				1612	static struct address_space_operations ext3_journalled_aops = {
				1613	.readpage = ext3_readpage,
				1614	.readpages = ext3_readpages,
				1615	.writepage = ext3_journalled_writepage,
				1616	.sync_page = block_sync_page,
				1617	.prepare_write = ext3_prepare_write,
				1618	.commit_write = ext3_journalled_commit_write,
				1619	.set_page_dirty = ext3_journalled_set_page_dirty,
				1620	.bmap = ext3_bmap,
				1621	.invalidatepage = ext3_invalidatepage,
				1622	.releasepage = ext3_releasepage,
				1623	};
				1624
				1625	void ext3_set_aops(struct inode *inode)
				1626	{
				1627	if (ext3_should_order_data(inode))
				1628	inode->i_mapping->a_ops = &ext3_ordered_aops;
				1629	else if (ext3_should_writeback_data(inode))
				1630	inode->i_mapping->a_ops = &ext3_writeback_aops;
				1631	else
				1632	inode->i_mapping->a_ops = &ext3_journalled_aops;
				1633	}
				1634
				1635	/*
				1636	* ext3_block_truncate_page() zeroes out a mapping from file offset `from'
				1637	* up to the end of the block which corresponds to `from'.
				1638	* This required during truncate. We need to physically zero the tail end
				1639	* of that block so it doesn't yield old data if the file is later grown.
				1640	*/
				1641	static int ext3_block_truncate_page(handle_t handle, struct page page,
				1642	struct address_space *mapping, loff_t from)
				1643	{
				1644	unsigned long index = from >> PAGE_CACHE_SHIFT;
				1645	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				1646	unsigned blocksize, iblock, length, pos;
				1647	struct inode *inode = mapping->host;
				1648	struct buffer_head *bh;
				1649	int err = 0;
				1650	void *kaddr;
				1651
				1652	blocksize = inode->i_sb->s_blocksize;
				1653	length = blocksize - (offset & (blocksize - 1));
				1654	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
				1655
				1656	/*
				1657	* For "nobh" option, we can only work if we don't need to
				1658	* read-in the page - otherwise we create buffers to do the IO.
				1659	*/
				1660	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
				1661	if (PageUptodate(page)) {
				1662	kaddr = kmap_atomic(page, KM_USER0);
				1663	memset(kaddr + offset, 0, length);
				1664	flush_dcache_page(page);
				1665	kunmap_atomic(kaddr, KM_USER0);
				1666	set_page_dirty(page);
				1667	goto unlock;
				1668	}
				1669	}
				1670
				1671	if (!page_has_buffers(page))
				1672	create_empty_buffers(page, blocksize, 0);
				1673
				1674	/* Find the buffer that contains "offset" */
				1675	bh = page_buffers(page);
				1676	pos = blocksize;
				1677	while (offset >= pos) {
				1678	bh = bh->b_this_page;
				1679	iblock++;
				1680	pos += blocksize;
				1681	}
				1682
				1683	err = 0;
				1684	if (buffer_freed(bh)) {
				1685	BUFFER_TRACE(bh, "freed: skip");
				1686	goto unlock;
				1687	}
				1688
				1689	if (!buffer_mapped(bh)) {
				1690	BUFFER_TRACE(bh, "unmapped");
				1691	ext3_get_block(inode, iblock, bh, 0);
				1692	/* unmapped? It's a hole - nothing to do */
				1693	if (!buffer_mapped(bh)) {
				1694	BUFFER_TRACE(bh, "still unmapped");
				1695	goto unlock;
				1696	}
				1697	}
				1698
				1699	/* Ok, it's mapped. Make sure it's up-to-date */
				1700	if (PageUptodate(page))
				1701	set_buffer_uptodate(bh);
				1702
				1703	if (!buffer_uptodate(bh)) {
				1704	err = -EIO;
				1705	ll_rw_block(READ, 1, &bh);
				1706	wait_on_buffer(bh);
				1707	/* Uhhuh. Read error. Complain and punt. */
				1708	if (!buffer_uptodate(bh))
				1709	goto unlock;
				1710	}
				1711
				1712	if (ext3_should_journal_data(inode)) {
				1713	BUFFER_TRACE(bh, "get write access");
				1714	err = ext3_journal_get_write_access(handle, bh);
				1715	if (err)
				1716	goto unlock;
				1717	}
				1718
				1719	kaddr = kmap_atomic(page, KM_USER0);
				1720	memset(kaddr + offset, 0, length);
				1721	flush_dcache_page(page);
				1722	kunmap_atomic(kaddr, KM_USER0);
				1723
				1724	BUFFER_TRACE(bh, "zeroed end of block");
				1725
				1726	err = 0;
				1727	if (ext3_should_journal_data(inode)) {
				1728	err = ext3_journal_dirty_metadata(handle, bh);
				1729	} else {
				1730	if (ext3_should_order_data(inode))
				1731	err = ext3_journal_dirty_data(handle, bh);
				1732	mark_buffer_dirty(bh);
				1733	}
				1734
				1735	unlock:
				1736	unlock_page(page);
				1737	page_cache_release(page);
				1738	return err;
				1739	}
				1740
				1741	/*
				1742	* Probably it should be a library function... search for first non-zero word
				1743	* or memcmp with zero_page, whatever is better for particular architecture.
				1744	* Linus?
				1745	*/
				1746	static inline int all_zeroes(__le32 p, __le32 q)
				1747	{
				1748	while (p < q)
				1749	if (*p++)
				1750	return 0;
				1751	return 1;
				1752	}
				1753
				1754	/**
				1755	* ext3_find_shared - find the indirect blocks for partial truncation.
				1756	* @inode: inode in question
				1757	* @depth: depth of the affected branch
				1758	* @offsets: offsets of pointers in that branch (see ext3_block_to_path)
				1759	* @chain: place to store the pointers to partial indirect blocks
				1760	* @top: place to the (detached) top of branch
				1761	*
				1762	* This is a helper function used by ext3_truncate().
				1763	*
				1764	* When we do truncate() we may have to clean the ends of several
				1765	* indirect blocks but leave the blocks themselves alive. Block is
				1766	* partially truncated if some data below the new i_size is refered
				1767	* from it (and it is on the path to the first completely truncated
				1768	* data block, indeed). We have to free the top of that path along
				1769	* with everything to the right of the path. Since no allocation
				1770	* past the truncation point is possible until ext3_truncate()
				1771	* finishes, we may safely do the latter, but top of branch may
				1772	* require special attention - pageout below the truncation point
				1773	* might try to populate it.
				1774	*
				1775	* We atomically detach the top of branch from the tree, store the
				1776	* block number of its root in *@top, pointers to buffer_heads of
				1777	* partially truncated blocks - in @chain[].bh and pointers to
				1778	* their last elements that should not be removed - in
				1779	* @chain[].p. Return value is the pointer to last filled element
				1780	* of @chain.
				1781	*
				1782	* The work left to caller to do the actual freeing of subtrees:
				1783	* a) free the subtree starting from *@top
				1784	* b) free the subtrees whose roots are stored in
				1785	* (@chain[i].p+1 .. end of @chain[i].bh->b_data)
				1786	* c) free the subtrees growing from the inode past the @chain[0].
				1787	* (no partially truncated stuff there). */
				1788
				1789	static Indirect ext3_find_shared(struct inode inode,
				1790	int depth,
				1791	int offsets[4],
				1792	Indirect chain[4],
				1793	__le32 *top)
				1794	{
				1795	Indirect partial, p;
				1796	int k, err;
				1797
				1798	*top = 0;
				1799	/* Make k index the deepest non-null offest + 1 */
				1800	for (k = depth; k > 1 && !offsets[k-1]; k--)
				1801	;
				1802	partial = ext3_get_branch(inode, k, offsets, chain, &err);
				1803	/* Writer: pointers */
				1804	if (!partial)
				1805	partial = chain + k-1;
				1806	/*
				1807	* If the branch acquired continuation since we've looked at it -
				1808	* fine, it should all survive and (new) top doesn't belong to us.
				1809	*/
				1810	if (!partial->key && *partial->p)
				1811	/* Writer: end */
				1812	goto no_top;
				1813	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
				1814	;
				1815	/*
				1816	* OK, we've found the last block that must survive. The rest of our
				1817	* branch should be detached before unlocking. However, if that rest
				1818	* of branch is all ours and does not grow immediately from the inode
				1819	* it's easier to cheat and just decrement partial->p.
				1820	*/
				1821	if (p == chain + k - 1 && p > chain) {
				1822	p->p--;
				1823	} else {
				1824	top = p->p;
				1825	/* Nope, don't do this in ext3. Must leave the tree intact */
				1826	#if 0
				1827	*p->p = 0;
				1828	#endif
				1829	}
				1830	/* Writer: end */
				1831
				1832	while(partial > p)
				1833	{
				1834	brelse(partial->bh);
				1835	partial--;
				1836	}
				1837	no_top:
				1838	return partial;
				1839	}
				1840
				1841	/*
				1842	* Zero a number of block pointers in either an inode or an indirect block.
				1843	* If we restart the transaction we must again get write access to the
				1844	* indirect block for further modification.
				1845	*
				1846	* We release `count' blocks on disk, but (last - first) may be greater
				1847	* than `count' because there can be holes in there.
				1848	*/
				1849	static void
				1850	ext3_clear_blocks(handle_t handle, struct inode inode, struct buffer_head *bh,
				1851	unsigned long block_to_free, unsigned long count,
				1852	__le32 first, __le32 last)
				1853	{
				1854	__le32 *p;
				1855	if (try_to_extend_transaction(handle, inode)) {
				1856	if (bh) {
				1857	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				1858	ext3_journal_dirty_metadata(handle, bh);
				1859	}
				1860	ext3_mark_inode_dirty(handle, inode);
				1861	ext3_journal_test_restart(handle, inode);
				1862	if (bh) {
				1863	BUFFER_TRACE(bh, "retaking write access");
				1864	ext3_journal_get_write_access(handle, bh);
				1865	}
				1866	}
				1867
				1868	/*
				1869	* Any buffers which are on the journal will be in memory. We find
				1870	* them on the hash table so journal_revoke() will run journal_forget()
				1871	* on them. We've already detached each block from the file, so
				1872	* bforget() in journal_forget() should be safe.
				1873	*
				1874	* AKPM: turn on bforget in journal_forget()!!!
				1875	*/
				1876	for (p = first; p < last; p++) {
				1877	u32 nr = le32_to_cpu(*p);
				1878	if (nr) {
				1879	struct buffer_head *bh;
				1880
				1881	*p = 0;
				1882	bh = sb_find_get_block(inode->i_sb, nr);
				1883	ext3_forget(handle, 0, inode, bh, nr);
				1884	}
				1885	}
				1886
				1887	ext3_free_blocks(handle, inode, block_to_free, count);
				1888	}
				1889
				1890	/**
				1891	* ext3_free_data - free a list of data blocks
				1892	* @handle: handle for this transaction
				1893	* @inode: inode we are dealing with
				1894	* @this_bh: indirect buffer_head which contains @first and @last
				1895	* @first: array of block numbers
				1896	* @last: points immediately past the end of array
				1897	*
				1898	* We are freeing all blocks refered from that array (numbers are stored as
				1899	* little-endian 32-bit) and updating @inode->i_blocks appropriately.
				1900	*
				1901	* We accumulate contiguous runs of blocks to free. Conveniently, if these
				1902	* blocks are contiguous then releasing them at one time will only affect one
				1903	* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
				1904	* actually use a lot of journal space.
				1905	*
				1906	* @this_bh will be %NULL if @first and @last point into the inode's direct
				1907	* block pointers.
				1908	*/
				1909	static void ext3_free_data(handle_t handle, struct inode inode,
				1910	struct buffer_head *this_bh,
				1911	__le32 first, __le32 last)
				1912	{
				1913	unsigned long block_to_free = 0; /* Starting block # of a run */
				1914	unsigned long count = 0; /* Number of blocks in the run */
				1915	__le32 block_to_free_p = NULL; / Pointer into inode/ind
				1916	corresponding to
				1917	block_to_free */
				1918	unsigned long nr; /* Current block # */
				1919	__le32 p; / Pointer into inode/ind
				1920	for current block */
				1921	int err;
				1922
				1923	if (this_bh) { /* For indirect block */
				1924	BUFFER_TRACE(this_bh, "get_write_access");
				1925	err = ext3_journal_get_write_access(handle, this_bh);
				1926	/* Important: if we can't update the indirect pointers
				1927	* to the blocks, we can't free them. */
				1928	if (err)
				1929	return;
				1930	}
				1931
				1932	for (p = first; p < last; p++) {
				1933	nr = le32_to_cpu(*p);
				1934	if (nr) {
				1935	/* accumulate blocks to free if they're contiguous */
				1936	if (count == 0) {
				1937	block_to_free = nr;
				1938	block_to_free_p = p;
				1939	count = 1;
				1940	} else if (nr == block_to_free + count) {
				1941	count++;
				1942	} else {
				1943	ext3_clear_blocks(handle, inode, this_bh,
				1944	block_to_free,
				1945	count, block_to_free_p, p);
				1946	block_to_free = nr;
				1947	block_to_free_p = p;
				1948	count = 1;
				1949	}
				1950	}
				1951	}
				1952
				1953	if (count > 0)
				1954	ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				1955	count, block_to_free_p, p);
				1956
				1957	if (this_bh) {
				1958	BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
				1959	ext3_journal_dirty_metadata(handle, this_bh);
				1960	}
				1961	}
				1962
				1963	/**
				1964	* ext3_free_branches - free an array of branches
				1965	* @handle: JBD handle for this transaction
				1966	* @inode: inode we are dealing with
				1967	* @parent_bh: the buffer_head which contains @first and @last
				1968	* @first: array of block numbers
				1969	* @last: pointer immediately past the end of array
				1970	* @depth: depth of the branches to free
				1971	*
				1972	* We are freeing all blocks refered from these branches (numbers are
				1973	* stored as little-endian 32-bit) and updating @inode->i_blocks
				1974	* appropriately.
				1975	*/
				1976	static void ext3_free_branches(handle_t handle, struct inode inode,
				1977	struct buffer_head *parent_bh,
				1978	__le32 first, __le32 last, int depth)
				1979	{
				1980	unsigned long nr;
				1981	__le32 *p;
				1982
				1983	if (is_handle_aborted(handle))
				1984	return;
				1985
				1986	if (depth--) {
				1987	struct buffer_head *bh;
				1988	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				1989	p = last;
				1990	while (--p >= first) {
				1991	nr = le32_to_cpu(*p);
				1992	if (!nr)
				1993	continue; /* A hole */
				1994
				1995	/* Go read the buffer for the next level down */
				1996	bh = sb_bread(inode->i_sb, nr);
				1997
				1998	/*
				1999	* A read failure? Report error and clear slot
				2000	* (should be rare).
				2001	*/
				2002	if (!bh) {
				2003	ext3_error(inode->i_sb, "ext3_free_branches",
				2004	"Read failure, inode=%ld, block=%ld",
				2005	inode->i_ino, nr);
				2006	continue;
				2007	}
				2008
				2009	/* This zaps the entire block. Bottom up. */
				2010	BUFFER_TRACE(bh, "free child branches");
				2011	ext3_free_branches(handle, inode, bh,
				2012	(__le32*)bh->b_data,
				2013	(__le32*)bh->b_data + addr_per_block,
				2014	depth);
				2015
				2016	/*
				2017	* We've probably journalled the indirect block several
				2018	* times during the truncate. But it's no longer
				2019	* needed and we now drop it from the transaction via
				2020	* journal_revoke().
				2021	*
				2022	* That's easy if it's exclusively part of this
				2023	* transaction. But if it's part of the committing
				2024	* transaction then journal_forget() will simply
				2025	* brelse() it. That means that if the underlying
				2026	* block is reallocated in ext3_get_block(),
				2027	* unmap_underlying_metadata() will find this block
				2028	* and will try to get rid of it. damn, damn.
				2029	*
				2030	* If this block has already been committed to the
				2031	* journal, a revoke record will be written. And
				2032	* revoke records must be emitted before clearing
				2033	* this block's bit in the bitmaps.
				2034	*/
				2035	ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
				2036
				2037	/*
				2038	* Everything below this this pointer has been
				2039	* released. Now let this top-of-subtree go.
				2040	*
				2041	* We want the freeing of this indirect block to be
				2042	* atomic in the journal with the updating of the
				2043	* bitmap block which owns it. So make some room in
				2044	* the journal.
				2045	*
				2046	* We zero the parent pointer after freeing its
				2047	* pointee in the bitmaps, so if extend_transaction()
				2048	* for some reason fails to put the bitmap changes and
				2049	* the release into the same transaction, recovery
				2050	* will merely complain about releasing a free block,
				2051	* rather than leaking blocks.
				2052	*/
				2053	if (is_handle_aborted(handle))
				2054	return;
				2055	if (try_to_extend_transaction(handle, inode)) {
				2056	ext3_mark_inode_dirty(handle, inode);
				2057	ext3_journal_test_restart(handle, inode);
				2058	}
				2059
				2060	ext3_free_blocks(handle, inode, nr, 1);
				2061
				2062	if (parent_bh) {
				2063	/*
				2064	* The block which we have just freed is
				2065	* pointed to by an indirect block: journal it
				2066	*/
				2067	BUFFER_TRACE(parent_bh, "get_write_access");
				2068	if (!ext3_journal_get_write_access(handle,
				2069	parent_bh)){
				2070	*p = 0;
				2071	BUFFER_TRACE(parent_bh,
				2072	"call ext3_journal_dirty_metadata");
				2073	ext3_journal_dirty_metadata(handle,
				2074	parent_bh);
				2075	}
				2076	}
				2077	}
				2078	} else {
				2079	/* We have reached the bottom of the tree. */
				2080	BUFFER_TRACE(parent_bh, "free data blocks");
				2081	ext3_free_data(handle, inode, parent_bh, first, last);
				2082	}
				2083	}
				2084
				2085	/*
				2086	* ext3_truncate()
				2087	*
				2088	* We block out ext3_get_block() block instantiations across the entire
				2089	* transaction, and VFS/VM ensures that ext3_truncate() cannot run
				2090	* simultaneously on behalf of the same inode.
				2091	*
				2092	* As we work through the truncate and commmit bits of it to the journal there
				2093	* is one core, guiding principle: the file's tree must always be consistent on
				2094	* disk. We must be able to restart the truncate after a crash.
				2095	*
				2096	* The file's tree may be transiently inconsistent in memory (although it
				2097	* probably isn't), but whenever we close off and commit a journal transaction,
				2098	* the contents of (the filesystem + the journal) must be consistent and
				2099	* restartable. It's pretty simple, really: bottom up, right to left (although
				2100	* left-to-right works OK too).
				2101	*
				2102	* Note that at recovery time, journal replay occurs before the restart of
				2103	* truncate against the orphan inode list.
				2104	*
				2105	* The committed inode has the new, desired i_size (which is the same as
				2106	* i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
				2107	* that this inode's truncate did not complete and it will again call
				2108	* ext3_truncate() to have another go. So there will be instantiated blocks
				2109	* to the right of the truncation point in a crashed ext3 filesystem. But
				2110	* that's fine - as long as they are linked from the inode, the post-crash
				2111	* ext3_truncate() run will find them and release them.
				2112	*/
				2113
				2114	void ext3_truncate(struct inode * inode)
				2115	{
				2116	handle_t *handle;
				2117	struct ext3_inode_info *ei = EXT3_I(inode);
				2118	__le32 *i_data = ei->i_data;
				2119	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				2120	struct address_space *mapping = inode->i_mapping;
				2121	int offsets[4];
				2122	Indirect chain[4];
				2123	Indirect *partial;
				2124	__le32 nr = 0;
				2125	int n;
				2126	long last_block;
				2127	unsigned blocksize = inode->i_sb->s_blocksize;
				2128	struct page *page;
				2129
				2130	if (!(S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode) \|\|
				2131	S_ISLNK(inode->i_mode)))
				2132	return;
				2133	if (ext3_inode_is_fast_symlink(inode))
				2134	return;
				2135	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				2136	return;
				2137
				2138	/*
				2139	* We have to lock the EOF page here, because lock_page() nests
				2140	* outside journal_start().
				2141	*/
				2142	if ((inode->i_size & (blocksize - 1)) == 0) {
				2143	/* Block boundary? Nothing to do */
				2144	page = NULL;
				2145	} else {
				2146	page = grab_cache_page(mapping,
				2147	inode->i_size >> PAGE_CACHE_SHIFT);
				2148	if (!page)
				2149	return;
				2150	}
				2151
				2152	handle = start_transaction(inode);
				2153	if (IS_ERR(handle)) {
				2154	if (page) {
				2155	clear_highpage(page);
				2156	flush_dcache_page(page);
				2157	unlock_page(page);
				2158	page_cache_release(page);
				2159	}
				2160	return; /* AKPM: return what? */
				2161	}
				2162
				2163	last_block = (inode->i_size + blocksize-1)
				2164	>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
				2165
				2166	if (page)
				2167	ext3_block_truncate_page(handle, page, mapping, inode->i_size);
				2168
				2169	n = ext3_block_to_path(inode, last_block, offsets, NULL);
				2170	if (n == 0)
				2171	goto out_stop; /* error */
				2172
				2173	/*
				2174	* OK. This truncate is going to happen. We add the inode to the
				2175	* orphan list, so that if this truncate spans multiple transactions,
				2176	* and we crash, we will resume the truncate when the filesystem
				2177	* recovers. It also marks the inode dirty, to catch the new size.
				2178	*
				2179	* Implication: the file must always be in a sane, consistent
				2180	* truncatable state while each transaction commits.
				2181	*/
				2182	if (ext3_orphan_add(handle, inode))
				2183	goto out_stop;
				2184
				2185	/*
				2186	* The orphan list entry will now protect us from any crash which
				2187	* occurs before the truncate completes, so it is now safe to propagate
				2188	* the new, shorter inode size (held for now in i_size) into the
				2189	* on-disk inode. We do this via i_disksize, which is the value which
				2190	* ext3 really writes onto the disk inode.
				2191	*/
				2192	ei->i_disksize = inode->i_size;
				2193
				2194	/*
				2195	* From here we block out all ext3_get_block() callers who want to
				2196	* modify the block allocation tree.
				2197	*/
				2198	down(&ei->truncate_sem);
				2199
				2200	if (n == 1) { /* direct blocks */
				2201	ext3_free_data(handle, inode, NULL, i_data+offsets[0],
				2202	i_data + EXT3_NDIR_BLOCKS);
				2203	goto do_indirects;
				2204	}
				2205
				2206	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
				2207	/* Kill the top of shared branch (not detached) */
				2208	if (nr) {
				2209	if (partial == chain) {
				2210	/* Shared branch grows from the inode */
				2211	ext3_free_branches(handle, inode, NULL,
				2212	&nr, &nr+1, (chain+n-1) - partial);
				2213	*partial->p = 0;
				2214	/*
				2215	* We mark the inode dirty prior to restart,
				2216	* and prior to stop. No need for it here.
				2217	*/
				2218	} else {
				2219	/* Shared branch grows from an indirect block */
				2220	BUFFER_TRACE(partial->bh, "get_write_access");
				2221	ext3_free_branches(handle, inode, partial->bh,
				2222	partial->p,
				2223	partial->p+1, (chain+n-1) - partial);
				2224	}
				2225	}
				2226	/* Clear the ends of indirect blocks on the shared branch */
				2227	while (partial > chain) {
				2228	ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				2229	(__le32*)partial->bh->b_data+addr_per_block,
				2230	(chain+n-1) - partial);
				2231	BUFFER_TRACE(partial->bh, "call brelse");
				2232	brelse (partial->bh);
				2233	partial--;
				2234	}
				2235	do_indirects:
				2236	/* Kill the remaining (whole) subtrees */
				2237	switch (offsets[0]) {
				2238	default:
				2239	nr = i_data[EXT3_IND_BLOCK];
				2240	if (nr) {
				2241	ext3_free_branches(handle, inode, NULL,
				2242	&nr, &nr+1, 1);
				2243	i_data[EXT3_IND_BLOCK] = 0;
				2244	}
				2245	case EXT3_IND_BLOCK:
				2246	nr = i_data[EXT3_DIND_BLOCK];
				2247	if (nr) {
				2248	ext3_free_branches(handle, inode, NULL,
				2249	&nr, &nr+1, 2);
				2250	i_data[EXT3_DIND_BLOCK] = 0;
				2251	}
				2252	case EXT3_DIND_BLOCK:
				2253	nr = i_data[EXT3_TIND_BLOCK];
				2254	if (nr) {
				2255	ext3_free_branches(handle, inode, NULL,
				2256	&nr, &nr+1, 3);
				2257	i_data[EXT3_TIND_BLOCK] = 0;
				2258	}
				2259	case EXT3_TIND_BLOCK:
				2260	;
				2261	}
				2262
				2263	ext3_discard_reservation(inode);
				2264
				2265	up(&ei->truncate_sem);
				2266	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
				2267	ext3_mark_inode_dirty(handle, inode);
				2268
				2269	/* In a multi-transaction truncate, we only make the final
				2270	* transaction synchronous */
				2271	if (IS_SYNC(inode))
				2272	handle->h_sync = 1;
				2273	out_stop:
				2274	/*
				2275	* If this was a simple ftruncate(), and the file will remain alive
				2276	* then we need to clear up the orphan record which we created above.
				2277	* However, if this was a real unlink then we were called by
				2278	* ext3_delete_inode(), and we allow that function to clean up the
				2279	* orphan info for us.
				2280	*/
				2281	if (inode->i_nlink)
				2282	ext3_orphan_del(handle, inode);
				2283
				2284	ext3_journal_stop(handle);
				2285	}
				2286
				2287	static unsigned long ext3_get_inode_block(struct super_block *sb,
				2288	unsigned long ino, struct ext3_iloc *iloc)
				2289	{
				2290	unsigned long desc, group_desc, block_group;
				2291	unsigned long offset, block;
				2292	struct buffer_head *bh;
				2293	struct ext3_group_desc * gdp;
				2294
				2295
				2296	if ((ino != EXT3_ROOT_INO &&
				2297	ino != EXT3_JOURNAL_INO &&
				2298	ino != EXT3_RESIZE_INO &&
				2299	ino < EXT3_FIRST_INO(sb)) \|\|
				2300	ino > le32_to_cpu(
				2301	EXT3_SB(sb)->s_es->s_inodes_count)) {
				2302	ext3_error (sb, "ext3_get_inode_block",
				2303	"bad inode number: %lu", ino);
				2304	return 0;
				2305	}
				2306	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
				2307	if (block_group >= EXT3_SB(sb)->s_groups_count) {
				2308	ext3_error (sb, "ext3_get_inode_block",
				2309	"group >= groups count");
				2310	return 0;
				2311	}
				2312	smp_rmb();
				2313	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
				2314	desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
				2315	bh = EXT3_SB(sb)->s_group_desc[group_desc];
				2316	if (!bh) {
				2317	ext3_error (sb, "ext3_get_inode_block",
				2318	"Descriptor not loaded");
				2319	return 0;
				2320	}
				2321
				2322	gdp = (struct ext3_group_desc *) bh->b_data;
				2323	/*
				2324	* Figure out the offset within the block group inode table
				2325	*/
				2326	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
				2327	EXT3_INODE_SIZE(sb);
				2328	block = le32_to_cpu(gdp[desc].bg_inode_table) +
				2329	(offset >> EXT3_BLOCK_SIZE_BITS(sb));
				2330
				2331	iloc->block_group = block_group;
				2332	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
				2333	return block;
				2334	}
				2335
				2336	/*
				2337	* ext3_get_inode_loc returns with an extra refcount against the inode's
				2338	* underlying buffer_head on success. If 'in_mem' is true, we have all
				2339	* data in memory that is needed to recreate the on-disk version of this
				2340	* inode.
				2341	*/
				2342	static int __ext3_get_inode_loc(struct inode *inode,
				2343	struct ext3_iloc *iloc, int in_mem)
				2344	{
				2345	unsigned long block;
				2346	struct buffer_head *bh;
				2347
				2348	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
				2349	if (!block)
				2350	return -EIO;
				2351
				2352	bh = sb_getblk(inode->i_sb, block);
				2353	if (!bh) {
				2354	ext3_error (inode->i_sb, "ext3_get_inode_loc",
				2355	"unable to read inode block - "
				2356	"inode=%lu, block=%lu", inode->i_ino, block);
				2357	return -EIO;
				2358	}
				2359	if (!buffer_uptodate(bh)) {
				2360	lock_buffer(bh);
				2361	if (buffer_uptodate(bh)) {
				2362	/* someone brought it uptodate while we waited */
				2363	unlock_buffer(bh);
				2364	goto has_buffer;
				2365	}
				2366
				2367	/*
				2368	* If we have all information of the inode in memory and this
				2369	* is the only valid inode in the block, we need not read the
				2370	* block.
				2371	*/
				2372	if (in_mem) {
				2373	struct buffer_head *bitmap_bh;
				2374	struct ext3_group_desc *desc;
				2375	int inodes_per_buffer;
				2376	int inode_offset, i;
				2377	int block_group;
				2378	int start;
				2379
				2380	block_group = (inode->i_ino - 1) /
				2381	EXT3_INODES_PER_GROUP(inode->i_sb);
				2382	inodes_per_buffer = bh->b_size /
				2383	EXT3_INODE_SIZE(inode->i_sb);
				2384	inode_offset = ((inode->i_ino - 1) %
				2385	EXT3_INODES_PER_GROUP(inode->i_sb));
				2386	start = inode_offset & ~(inodes_per_buffer - 1);
				2387
				2388	/* Is the inode bitmap in cache? */
				2389	desc = ext3_get_group_desc(inode->i_sb,
				2390	block_group, NULL);
				2391	if (!desc)
				2392	goto make_io;
				2393
				2394	bitmap_bh = sb_getblk(inode->i_sb,
				2395	le32_to_cpu(desc->bg_inode_bitmap));
				2396	if (!bitmap_bh)
				2397	goto make_io;
				2398
				2399	/*
				2400	* If the inode bitmap isn't in cache then the
				2401	* optimisation may end up performing two reads instead
				2402	* of one, so skip it.
				2403	*/
				2404	if (!buffer_uptodate(bitmap_bh)) {
				2405	brelse(bitmap_bh);
				2406	goto make_io;
				2407	}
				2408	for (i = start; i < start + inodes_per_buffer; i++) {
				2409	if (i == inode_offset)
				2410	continue;
				2411	if (ext3_test_bit(i, bitmap_bh->b_data))
				2412	break;
				2413	}
				2414	brelse(bitmap_bh);
				2415	if (i == start + inodes_per_buffer) {
				2416	/* all other inodes are free, so skip I/O */
				2417	memset(bh->b_data, 0, bh->b_size);
				2418	set_buffer_uptodate(bh);
				2419	unlock_buffer(bh);
				2420	goto has_buffer;
				2421	}
				2422	}
				2423
				2424	make_io:
				2425	/*
				2426	* There are other valid inodes in the buffer, this inode
				2427	* has in-inode xattrs, or we don't have this inode in memory.
				2428	* Read the block from disk.
				2429	*/
				2430	get_bh(bh);
				2431	bh->b_end_io = end_buffer_read_sync;
				2432	submit_bh(READ, bh);
				2433	wait_on_buffer(bh);
				2434	if (!buffer_uptodate(bh)) {
				2435	ext3_error(inode->i_sb, "ext3_get_inode_loc",
				2436	"unable to read inode block - "
				2437	"inode=%lu, block=%lu",
				2438	inode->i_ino, block);
				2439	brelse(bh);
				2440	return -EIO;
				2441	}
				2442	}
				2443	has_buffer:
				2444	iloc->bh = bh;
				2445	return 0;
				2446	}
				2447
				2448	int ext3_get_inode_loc(struct inode inode, struct ext3_iloc iloc)
				2449	{
				2450	/* We have all inode data except xattrs in memory here. */
				2451	return __ext3_get_inode_loc(inode, iloc,
				2452	!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
				2453	}
				2454
				2455	void ext3_set_inode_flags(struct inode *inode)
				2456	{
				2457	unsigned int flags = EXT3_I(inode)->i_flags;
				2458
				2459	inode->i_flags &= ~(S_SYNC\|S_APPEND\|S_IMMUTABLE\|S_NOATIME\|S_DIRSYNC);
				2460	if (flags & EXT3_SYNC_FL)
				2461	inode->i_flags \|= S_SYNC;
				2462	if (flags & EXT3_APPEND_FL)
				2463	inode->i_flags \|= S_APPEND;
				2464	if (flags & EXT3_IMMUTABLE_FL)
				2465	inode->i_flags \|= S_IMMUTABLE;
				2466	if (flags & EXT3_NOATIME_FL)
				2467	inode->i_flags \|= S_NOATIME;
				2468	if (flags & EXT3_DIRSYNC_FL)
				2469	inode->i_flags \|= S_DIRSYNC;
				2470	}
				2471
				2472	void ext3_read_inode(struct inode * inode)
				2473	{
				2474	struct ext3_iloc iloc;
				2475	struct ext3_inode *raw_inode;
				2476	struct ext3_inode_info *ei = EXT3_I(inode);
				2477	struct buffer_head *bh;
				2478	int block;
				2479
				2480	#ifdef CONFIG_EXT3_FS_POSIX_ACL
				2481	ei->i_acl = EXT3_ACL_NOT_CACHED;
				2482	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
				2483	#endif
				2484	ei->i_block_alloc_info = NULL;
				2485
				2486	if (__ext3_get_inode_loc(inode, &iloc, 0))
				2487	goto bad_inode;
				2488	bh = iloc.bh;
				2489	raw_inode = ext3_raw_inode(&iloc);
				2490	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
				2491	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
				2492	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
				2493	if(!(test_opt (inode->i_sb, NO_UID32))) {
				2494	inode->i_uid \|= le16_to_cpu(raw_inode->i_uid_high) << 16;
				2495	inode->i_gid \|= le16_to_cpu(raw_inode->i_gid_high) << 16;
				2496	}
				2497	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
				2498	inode->i_size = le32_to_cpu(raw_inode->i_size);
				2499	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
				2500	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
				2501	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
				2502	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
				2503
				2504	ei->i_state = 0;
				2505	ei->i_dir_start_lookup = 0;
				2506	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
				2507	/* We now have enough fields to check if the inode was active or not.
				2508	* This is needed because nfsd might try to access dead inodes
				2509	* the test is that same one that e2fsck uses
				2510	* NeilBrown 1999oct15
				2511	*/
				2512	if (inode->i_nlink == 0) {
				2513	if (inode->i_mode == 0 \|\|
				2514	!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
				2515	/* this inode is deleted */
				2516	brelse (bh);
				2517	goto bad_inode;
				2518	}
				2519	/* The only unlinked inodes we let through here have
				2520	* valid i_mode and are being read by the orphan
				2521	* recovery code: that's fine, we're about to complete
				2522	* the process of deleting those. */
				2523	}
				2524	inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
				2525	* (for stat), not the fs block
				2526	* size */
				2527	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
				2528	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
				2529	#ifdef EXT3_FRAGMENTS
				2530	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
				2531	ei->i_frag_no = raw_inode->i_frag;
				2532	ei->i_frag_size = raw_inode->i_fsize;
				2533	#endif
				2534	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
				2535	if (!S_ISREG(inode->i_mode)) {
				2536	ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
				2537	} else {
				2538	inode->i_size \|=
				2539	((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
				2540	}
				2541	ei->i_disksize = inode->i_size;
				2542	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
				2543	ei->i_block_group = iloc.block_group;
				2544	/*
				2545	* NOTE! The in-memory inode i_data array is in little-endian order
				2546	* even on big-endian machines: we do NOT byteswap the block numbers!
				2547	*/
				2548	for (block = 0; block < EXT3_N_BLOCKS; block++)
				2549	ei->i_data[block] = raw_inode->i_block[block];
				2550	INIT_LIST_HEAD(&ei->i_orphan);
				2551
				2552	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
				2553	EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
				2554	/*
				2555	* When mke2fs creates big inodes it does not zero out
				2556	* the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
				2557	* so ignore those first few inodes.
				2558	*/
				2559	ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
				2560	if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
				2561	EXT3_INODE_SIZE(inode->i_sb))
				2562	goto bad_inode;
				2563	if (ei->i_extra_isize == 0) {
				2564	/* The extra space is currently unused. Use it. */
				2565	ei->i_extra_isize = sizeof(struct ext3_inode) -
				2566	EXT3_GOOD_OLD_INODE_SIZE;
				2567	} else {
				2568	__le32 magic = (void )raw_inode +
				2569	EXT3_GOOD_OLD_INODE_SIZE +
				2570	ei->i_extra_isize;
				2571	if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				2572	ei->i_state \|= EXT3_STATE_XATTR;
				2573	}
				2574	} else
				2575	ei->i_extra_isize = 0;
				2576
				2577	if (S_ISREG(inode->i_mode)) {
				2578	inode->i_op = &ext3_file_inode_operations;
				2579	inode->i_fop = &ext3_file_operations;
				2580	ext3_set_aops(inode);
				2581	} else if (S_ISDIR(inode->i_mode)) {
				2582	inode->i_op = &ext3_dir_inode_operations;
				2583	inode->i_fop = &ext3_dir_operations;
				2584	} else if (S_ISLNK(inode->i_mode)) {
				2585	if (ext3_inode_is_fast_symlink(inode))
				2586	inode->i_op = &ext3_fast_symlink_inode_operations;
				2587	else {
				2588	inode->i_op = &ext3_symlink_inode_operations;
				2589	ext3_set_aops(inode);
				2590	}
				2591	} else {
				2592	inode->i_op = &ext3_special_inode_operations;
				2593	if (raw_inode->i_block[0])
				2594	init_special_inode(inode, inode->i_mode,
				2595	old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
				2596	else
				2597	init_special_inode(inode, inode->i_mode,
				2598	new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
				2599	}
				2600	brelse (iloc.bh);
				2601	ext3_set_inode_flags(inode);
				2602	return;
				2603
				2604	bad_inode:
				2605	make_bad_inode(inode);
				2606	return;
				2607	}
				2608
				2609	/*
				2610	* Post the struct inode info into an on-disk inode location in the
				2611	* buffer-cache. This gobbles the caller's reference to the
				2612	* buffer_head in the inode location struct.
				2613	*
				2614	* The caller must have write access to iloc->bh.
				2615	*/
				2616	static int ext3_do_update_inode(handle_t *handle,
				2617	struct inode *inode,
				2618	struct ext3_iloc *iloc)
				2619	{
				2620	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
				2621	struct ext3_inode_info *ei = EXT3_I(inode);
				2622	struct buffer_head *bh = iloc->bh;
				2623	int err = 0, rc, block;
				2624
				2625	/* For fields not not tracking in the in-memory inode,
				2626	* initialise them to zero for new inodes. */
				2627	if (ei->i_state & EXT3_STATE_NEW)
				2628	memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
				2629
				2630	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
				2631	if(!(test_opt(inode->i_sb, NO_UID32))) {
				2632	raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
				2633	raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
				2634	/*
				2635	* Fix up interoperability with old kernels. Otherwise, old inodes get
				2636	* re-used with the upper 16 bits of the uid/gid intact
				2637	*/
				2638	if(!ei->i_dtime) {
				2639	raw_inode->i_uid_high =
				2640	cpu_to_le16(high_16_bits(inode->i_uid));
				2641	raw_inode->i_gid_high =
				2642	cpu_to_le16(high_16_bits(inode->i_gid));
				2643	} else {
				2644	raw_inode->i_uid_high = 0;
				2645	raw_inode->i_gid_high = 0;
				2646	}
				2647	} else {
				2648	raw_inode->i_uid_low =
				2649	cpu_to_le16(fs_high2lowuid(inode->i_uid));
				2650	raw_inode->i_gid_low =
				2651	cpu_to_le16(fs_high2lowgid(inode->i_gid));
				2652	raw_inode->i_uid_high = 0;
				2653	raw_inode->i_gid_high = 0;
				2654	}
				2655	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
				2656	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
				2657	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
				2658	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
				2659	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
				2660	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
				2661	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
				2662	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
				2663	#ifdef EXT3_FRAGMENTS
				2664	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
				2665	raw_inode->i_frag = ei->i_frag_no;
				2666	raw_inode->i_fsize = ei->i_frag_size;
				2667	#endif
				2668	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
				2669	if (!S_ISREG(inode->i_mode)) {
				2670	raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
				2671	} else {
				2672	raw_inode->i_size_high =
				2673	cpu_to_le32(ei->i_disksize >> 32);
				2674	if (ei->i_disksize > 0x7fffffffULL) {
				2675	struct super_block *sb = inode->i_sb;
				2676	if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
				2677	EXT3_FEATURE_RO_COMPAT_LARGE_FILE) \|\|
				2678	EXT3_SB(sb)->s_es->s_rev_level ==
				2679	cpu_to_le32(EXT3_GOOD_OLD_REV)) {
				2680	/* If this is the first large file
				2681	* created, add a flag to the superblock.
				2682	*/
				2683	err = ext3_journal_get_write_access(handle,
				2684	EXT3_SB(sb)->s_sbh);
				2685	if (err)
				2686	goto out_brelse;
				2687	ext3_update_dynamic_rev(sb);
				2688	EXT3_SET_RO_COMPAT_FEATURE(sb,
				2689	EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
				2690	sb->s_dirt = 1;
				2691	handle->h_sync = 1;
				2692	err = ext3_journal_dirty_metadata(handle,
				2693	EXT3_SB(sb)->s_sbh);
				2694	}
				2695	}
				2696	}
				2697	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
				2698	if (S_ISCHR(inode->i_mode) \|\| S_ISBLK(inode->i_mode)) {
				2699	if (old_valid_dev(inode->i_rdev)) {
				2700	raw_inode->i_block[0] =
				2701	cpu_to_le32(old_encode_dev(inode->i_rdev));
				2702	raw_inode->i_block[1] = 0;
				2703	} else {
				2704	raw_inode->i_block[0] = 0;
				2705	raw_inode->i_block[1] =
				2706	cpu_to_le32(new_encode_dev(inode->i_rdev));
				2707	raw_inode->i_block[2] = 0;
				2708	}
				2709	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
				2710	raw_inode->i_block[block] = ei->i_data[block];
				2711
				2712	if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
				2713	raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
				2714
				2715	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				2716	rc = ext3_journal_dirty_metadata(handle, bh);
				2717	if (!err)
				2718	err = rc;
				2719	ei->i_state &= ~EXT3_STATE_NEW;
				2720
				2721	out_brelse:
				2722	brelse (bh);
				2723	ext3_std_error(inode->i_sb, err);
				2724	return err;
				2725	}
				2726
				2727	/*
				2728	* ext3_write_inode()
				2729	*
				2730	* We are called from a few places:
				2731	*
				2732	* - Within generic_file_write() for O_SYNC files.
				2733	* Here, there will be no transaction running. We wait for any running
				2734	* trasnaction to commit.
				2735	*
				2736	* - Within sys_sync(), kupdate and such.
				2737	* We wait on commit, if tol to.
				2738	*
				2739	* - Within prune_icache() (PF_MEMALLOC == true)
				2740	* Here we simply return. We can't afford to block kswapd on the
				2741	* journal commit.
				2742	*
				2743	* In all cases it is actually safe for us to return without doing anything,
				2744	* because the inode has been copied into a raw inode buffer in
				2745	* ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
				2746	* knfsd.
				2747	*
				2748	* Note that we are absolutely dependent upon all inode dirtiers doing the
				2749	* right thing: they must call mark_inode_dirty() after dirtying info in
				2750	* which we are interested.
				2751	*
				2752	* It would be a bug for them to not do this. The code:
				2753	*
				2754	* mark_inode_dirty(inode)
				2755	* stuff();
				2756	* inode->i_size = expr;
				2757	*
				2758	* is in error because a kswapd-driven write_inode() could occur while
				2759	* `stuff()' is running, and the new i_size will be lost. Plus the inode
				2760	* will no longer be on the superblock's dirty inode list.
				2761	*/
				2762	int ext3_write_inode(struct inode *inode, int wait)
				2763	{
				2764	if (current->flags & PF_MEMALLOC)
				2765	return 0;
				2766
				2767	if (ext3_journal_current_handle()) {
				2768	jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
				2769	dump_stack();
				2770	return -EIO;
				2771	}
				2772
				2773	if (!wait)
				2774	return 0;
				2775
				2776	return ext3_force_commit(inode->i_sb);
				2777	}
				2778
				2779	/*
				2780	* ext3_setattr()
				2781	*
				2782	* Called from notify_change.
				2783	*
				2784	* We want to trap VFS attempts to truncate the file as soon as
				2785	* possible. In particular, we want to make sure that when the VFS
				2786	* shrinks i_size, we put the inode on the orphan list and modify
				2787	* i_disksize immediately, so that during the subsequent flushing of
				2788	* dirty pages and freeing of disk blocks, we can guarantee that any
				2789	* commit will leave the blocks being flushed in an unused state on
				2790	* disk. (On recovery, the inode will get truncated and the blocks will
				2791	* be freed, so we have a strong guarantee that no future commit will
				2792	* leave these blocks visible to the user.)
				2793	*
				2794	* Called with inode->sem down.
				2795	*/
				2796	int ext3_setattr(struct dentry dentry, struct iattr attr)
				2797	{
				2798	struct inode *inode = dentry->d_inode;
				2799	int error, rc = 0;
				2800	const unsigned int ia_valid = attr->ia_valid;
				2801
				2802	error = inode_change_ok(inode, attr);
				2803	if (error)
				2804	return error;
				2805
				2806	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) \|\|
				2807	(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
				2808	handle_t *handle;
				2809
				2810	/* (user+group)*(old+new) structure, inode write (sb,
				2811	* inode block, ? - but truncate inode update has it) */
				2812	handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
				2813	if (IS_ERR(handle)) {
				2814	error = PTR_ERR(handle);
				2815	goto err_out;
				2816	}
				2817	error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
				2818	if (error) {
				2819	ext3_journal_stop(handle);
				2820	return error;
				2821	}
				2822	/* Update corresponding info in inode so that everything is in
				2823	* one transaction */
				2824	if (attr->ia_valid & ATTR_UID)
				2825	inode->i_uid = attr->ia_uid;
				2826	if (attr->ia_valid & ATTR_GID)
				2827	inode->i_gid = attr->ia_gid;
				2828	error = ext3_mark_inode_dirty(handle, inode);
				2829	ext3_journal_stop(handle);
				2830	}
				2831
				2832	if (S_ISREG(inode->i_mode) &&
				2833	attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
				2834	handle_t *handle;
				2835
				2836	handle = ext3_journal_start(inode, 3);
				2837	if (IS_ERR(handle)) {
				2838	error = PTR_ERR(handle);
				2839	goto err_out;
				2840	}
				2841
				2842	error = ext3_orphan_add(handle, inode);
				2843	EXT3_I(inode)->i_disksize = attr->ia_size;
				2844	rc = ext3_mark_inode_dirty(handle, inode);
				2845	if (!error)
				2846	error = rc;
				2847	ext3_journal_stop(handle);
				2848	}
				2849
				2850	rc = inode_setattr(inode, attr);
				2851
				2852	/* If inode_setattr's call to ext3_truncate failed to get a
				2853	* transaction handle at all, we need to clean up the in-core
				2854	* orphan list manually. */
				2855	if (inode->i_nlink)
				2856	ext3_orphan_del(NULL, inode);
				2857
				2858	if (!rc && (ia_valid & ATTR_MODE))
				2859	rc = ext3_acl_chmod(inode);
				2860
				2861	err_out:
				2862	ext3_std_error(inode->i_sb, error);
				2863	if (!error)
				2864	error = rc;
				2865	return error;
				2866	}
				2867
				2868
				2869	/*
				2870	* akpm: how many blocks doth make a writepage()?
				2871	*
				2872	* With N blocks per page, it may be:
				2873	* N data blocks
				2874	* 2 indirect block
				2875	* 2 dindirect
				2876	* 1 tindirect
				2877	* N+5 bitmap blocks (from the above)
				2878	* N+5 group descriptor summary blocks
				2879	* 1 inode block
				2880	* 1 superblock.
				2881	* 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
				2882	*
				2883	* 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
				2884	*
				2885	* With ordered or writeback data it's the same, less the N data blocks.
				2886	*
				2887	* If the inode's direct blocks can hold an integral number of pages then a
				2888	* page cannot straddle two indirect blocks, and we can only touch one indirect
				2889	* and dindirect block, and the "5" above becomes "3".
				2890	*
				2891	* This still overestimates under most circumstances. If we were to pass the
				2892	* start and end offsets in here as well we could do block_to_path() on each
				2893	* block and work out the exact number of indirects which are touched. Pah.
				2894	*/
				2895
				2896	static int ext3_writepage_trans_blocks(struct inode *inode)
				2897	{
				2898	int bpp = ext3_journal_blocks_per_page(inode);
				2899	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
				2900	int ret;
				2901
				2902	if (ext3_should_journal_data(inode))
				2903	ret = 3 * (bpp + indirects) + 2;
				2904	else
				2905	ret = 2 * (bpp + indirects) + 2;
				2906
				2907	#ifdef CONFIG_QUOTA
				2908	/* We know that structure was already allocated during DQUOT_INIT so
				2909	* we will be updating only the data blocks + inodes */
				2910	ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
				2911	#endif
				2912
				2913	return ret;
				2914	}
				2915
				2916	/*
				2917	* The caller must have previously called ext3_reserve_inode_write().
				2918	* Give this, we know that the caller already has write access to iloc->bh.
				2919	*/
				2920	int ext3_mark_iloc_dirty(handle_t *handle,
				2921	struct inode inode, struct ext3_iloc iloc)
				2922	{
				2923	int err = 0;
				2924
				2925	/* the do_update_inode consumes one bh->b_count */
				2926	get_bh(iloc->bh);
				2927
				2928	/* ext3_do_update_inode() does journal_dirty_metadata */
				2929	err = ext3_do_update_inode(handle, inode, iloc);
				2930	put_bh(iloc->bh);
				2931	return err;
				2932	}
				2933
				2934	/*
				2935	* On success, We end up with an outstanding reference count against
				2936	* iloc->bh. This _must_ be cleaned up later.
				2937	*/
				2938
				2939	int
				2940	ext3_reserve_inode_write(handle_t handle, struct inode inode,
				2941	struct ext3_iloc *iloc)
				2942	{
				2943	int err = 0;
				2944	if (handle) {
				2945	err = ext3_get_inode_loc(inode, iloc);
				2946	if (!err) {
				2947	BUFFER_TRACE(iloc->bh, "get_write_access");
				2948	err = ext3_journal_get_write_access(handle, iloc->bh);
				2949	if (err) {
				2950	brelse(iloc->bh);
				2951	iloc->bh = NULL;
				2952	}
				2953	}
				2954	}
				2955	ext3_std_error(inode->i_sb, err);
				2956	return err;
				2957	}
				2958
				2959	/*
				2960	* akpm: What we do here is to mark the in-core inode as clean
				2961	* with respect to inode dirtiness (it may still be data-dirty).
				2962	* This means that the in-core inode may be reaped by prune_icache
				2963	* without having to perform any I/O. This is a very good thing,
				2964	* because any task may call prune_icache - even ones which
				2965	* have a transaction open against a different journal.
				2966	*
				2967	* Is this cheating? Not really. Sure, we haven't written the
				2968	* inode out, but prune_icache isn't a user-visible syncing function.
				2969	* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
				2970	* we start and wait on commits.
				2971	*
				2972	* Is this efficient/effective? Well, we're being nice to the system
				2973	* by cleaning up our inodes proactively so they can be reaped
				2974	* without I/O. But we are potentially leaving up to five seconds'
				2975	* worth of inodes floating about which prune_icache wants us to
				2976	* write out. One way to fix that would be to get prune_icache()
				2977	* to do a write_super() to free up some memory. It has the desired
				2978	* effect.
				2979	*/
				2980	int ext3_mark_inode_dirty(handle_t handle, struct inode inode)
				2981	{
				2982	struct ext3_iloc iloc;
				2983	int err;
				2984
				2985	might_sleep();
				2986	err = ext3_reserve_inode_write(handle, inode, &iloc);
				2987	if (!err)
				2988	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
				2989	return err;
				2990	}
				2991
				2992	/*
				2993	* akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
				2994	*
				2995	* We're really interested in the case where a file is being extended.
				2996	* i_size has been changed by generic_commit_write() and we thus need
				2997	* to include the updated inode in the current transaction.
				2998	*
				2999	* Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
				3000	* are allocated to the file.
				3001	*
				3002	* If the inode is marked synchronous, we don't honour that here - doing
				3003	* so would cause a commit on atime updates, which we don't bother doing.
				3004	* We handle synchronous inodes at the highest possible level.
				3005	*/
				3006	void ext3_dirty_inode(struct inode *inode)
				3007	{
				3008	handle_t *current_handle = ext3_journal_current_handle();
				3009	handle_t *handle;
				3010
				3011	handle = ext3_journal_start(inode, 2);
				3012	if (IS_ERR(handle))
				3013	goto out;
				3014	if (current_handle &&
				3015	current_handle->h_transaction != handle->h_transaction) {
				3016	/* This task has a transaction open against a different fs */
				3017	printk(KERN_EMERG "%s: transactions do not match!\n",
				3018	__FUNCTION__);
				3019	} else {
				3020	jbd_debug(5, "marking dirty. outer handle=%p\n",
				3021	current_handle);
				3022	ext3_mark_inode_dirty(handle, inode);
				3023	}
				3024	ext3_journal_stop(handle);
				3025	out:
				3026	return;
				3027	}
				3028
				3029	#ifdef AKPM
				3030	/*
				3031	* Bind an inode's backing buffer_head into this transaction, to prevent
				3032	* it from being flushed to disk early. Unlike
				3033	* ext3_reserve_inode_write, this leaves behind no bh reference and
				3034	* returns no iloc structure, so the caller needs to repeat the iloc
				3035	* lookup to mark the inode dirty later.
				3036	*/
				3037	static inline int
				3038	ext3_pin_inode(handle_t handle, struct inode inode)
				3039	{
				3040	struct ext3_iloc iloc;
				3041
				3042	int err = 0;
				3043	if (handle) {
				3044	err = ext3_get_inode_loc(inode, &iloc);
				3045	if (!err) {
				3046	BUFFER_TRACE(iloc.bh, "get_write_access");
				3047	err = journal_get_write_access(handle, iloc.bh);
				3048	if (!err)
				3049	err = ext3_journal_dirty_metadata(handle,
				3050	iloc.bh);
				3051	brelse(iloc.bh);
				3052	}
				3053	}
				3054	ext3_std_error(inode->i_sb, err);
				3055	return err;
				3056	}
				3057	#endif
				3058
				3059	int ext3_change_inode_journal_flag(struct inode *inode, int val)
				3060	{
				3061	journal_t *journal;
				3062	handle_t *handle;
				3063	int err;
				3064
				3065	/*
				3066	* We have to be very careful here: changing a data block's
				3067	* journaling status dynamically is dangerous. If we write a
				3068	* data block to the journal, change the status and then delete
				3069	* that block, we risk forgetting to revoke the old log record
				3070	* from the journal and so a subsequent replay can corrupt data.
				3071	* So, first we make sure that the journal is empty and that
				3072	* nobody is changing anything.
				3073	*/
				3074
				3075	journal = EXT3_JOURNAL(inode);
				3076	if (is_journal_aborted(journal) \|\| IS_RDONLY(inode))
				3077	return -EROFS;
				3078
				3079	journal_lock_updates(journal);
				3080	journal_flush(journal);
				3081
				3082	/*
				3083	* OK, there are no updates running now, and all cached data is
				3084	* synced to disk. We are now in a completely consistent state
				3085	* which doesn't have anything in the journal, and we know that
				3086	* no filesystem updates are running, so it is safe to modify
				3087	* the inode's in-core data-journaling state flag now.
				3088	*/
				3089
				3090	if (val)
				3091	EXT3_I(inode)->i_flags \|= EXT3_JOURNAL_DATA_FL;
				3092	else
				3093	EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
				3094	ext3_set_aops(inode);
				3095
				3096	journal_unlock_updates(journal);
				3097
				3098	/* Finally we can mark the inode as dirty. */
				3099
				3100	handle = ext3_journal_start(inode, 1);
				3101	if (IS_ERR(handle))
				3102	return PTR_ERR(handle);
				3103
				3104	err = ext3_mark_inode_dirty(handle, inode);
				3105	handle->h_sync = 1;
				3106	ext3_journal_stop(handle);
				3107	ext3_std_error(inode->i_sb, err);
				3108
				3109	return err;
				3110	}