Blame - fs/ocfs2/aops.c - SHIFTPHONES/mainline/linux

blob: 605c82a93f01ca49f8b269f51f48cdacd7ee5c08 [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				5	*
				6	* This program is free software; you can redistribute it and/or
				7	* modify it under the terms of the GNU General Public
				8	* License as published by the Free Software Foundation; either
				9	* version 2 of the License, or (at your option) any later version.
				10	*
				11	* This program is distributed in the hope that it will be useful,
				12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				14	* General Public License for more details.
				15	*
				16	* You should have received a copy of the GNU General Public
				17	* License along with this program; if not, write to the
				18	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				19	* Boston, MA 021110-1307, USA.
				20	*/
				21
				22	#include <linux/fs.h>
				23	#include <linux/slab.h>
				24	#include <linux/highmem.h>
				25	#include <linux/pagemap.h>
				26	#include <asm/byteorder.h>
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	27	#include <linux/swap.h>
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	28
				29	#define MLOG_MASK_PREFIX ML_FILE_IO
				30	#include <cluster/masklog.h>
				31
				32	#include "ocfs2.h"
				33
				34	#include "alloc.h"
				35	#include "aops.h"
				36	#include "dlmglue.h"
				37	#include "extent_map.h"
				38	#include "file.h"
				39	#include "inode.h"
				40	#include "journal.h"
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	41	#include "suballoc.h"
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	42	#include "super.h"
				43	#include "symlink.h"
				44
				45	#include "buffer_head_io.h"
				46
				47	static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				48	struct buffer_head *bh_result, int create)
				49	{
				50	int err = -EIO;
				51	int status;
				52	struct ocfs2_dinode *fe = NULL;
				53	struct buffer_head *bh = NULL;
				54	struct buffer_head *buffer_cache_bh = NULL;
				55	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				56	void *kaddr;
				57
				58	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				59	(unsigned long long)iblock, bh_result, create);
				60
				61	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
				62
				63	if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) {
				64	mlog(ML_ERROR, "block offset > PATH_MAX: %llu",
				65	(unsigned long long)iblock);
				66	goto bail;
				67	}
				68
				69	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				70	OCFS2_I(inode)->ip_blkno,
				71	&bh, OCFS2_BH_CACHED, inode);
				72	if (status < 0) {
				73	mlog_errno(status);
				74	goto bail;
				75	}
				76	fe = (struct ocfs2_dinode *) bh->b_data;
				77
				78	if (!OCFS2_IS_VALID_DINODE(fe)) {
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	79	mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
				80	(unsigned long long)fe->i_blkno, 7, fe->i_signature);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	81	goto bail;
				82	}
				83
				84	if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
				85	le32_to_cpu(fe->i_clusters))) {
				86	mlog(ML_ERROR, "block offset is outside the allocated size: "
				87	"%llu\n", (unsigned long long)iblock);
				88	goto bail;
				89	}
				90
				91	/* We don't use the page cache to create symlink data, so if
				92	* need be, copy it over from the buffer cache. */
				93	if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) {
				94	u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) +
				95	iblock;
				96	buffer_cache_bh = sb_getblk(osb->sb, blkno);
				97	if (!buffer_cache_bh) {
				98	mlog(ML_ERROR, "couldn't getblock for symlink!\n");
				99	goto bail;
				100	}
				101
				102	/* we haven't locked out transactions, so a commit
				103	* could've happened. Since we've got a reference on
				104	* the bh, even if it commits while we're doing the
				105	* copy, the data is still good. */
				106	if (buffer_jbd(buffer_cache_bh)
				107	&& ocfs2_inode_is_new(inode)) {
				108	kaddr = kmap_atomic(bh_result->b_page, KM_USER0);
				109	if (!kaddr) {
				110	mlog(ML_ERROR, "couldn't kmap!\n");
				111	goto bail;
				112	}
				113	memcpy(kaddr + (bh_result->b_size * iblock),
				114	buffer_cache_bh->b_data,
				115	bh_result->b_size);
				116	kunmap_atomic(kaddr, KM_USER0);
				117	set_buffer_uptodate(bh_result);
				118	}
				119	brelse(buffer_cache_bh);
				120	}
				121
				122	map_bh(bh_result, inode->i_sb,
				123	le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock);
				124
				125	err = 0;
				126
				127	bail:
				128	if (bh)
				129	brelse(bh);
				130
				131	mlog_exit(err);
				132	return err;
				133	}
				134
				135	static int ocfs2_get_block(struct inode *inode, sector_t iblock,
				136	struct buffer_head *bh_result, int create)
				137	{
				138	int err = 0;
				139	u64 p_blkno, past_eof;
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	140	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	141
				142	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
				143	(unsigned long long)iblock, bh_result, create);
				144
				145	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
				146	mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
				147	inode, inode->i_ino);
				148
				149	if (S_ISLNK(inode->i_mode)) {
				150	/* this always does I/O for some reason. */
				151	err = ocfs2_symlink_get_block(inode, iblock, bh_result, create);
				152	goto bail;
				153	}
				154
Mark Fasheh	363041a	2007-01-17 12:31:35 -0800	[diff] [blame]	155	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	156	if (err) {
				157	mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
Mark Fasheh	b0697053	2006-03-03 10:24:33 -0800	[diff] [blame]	158	"%llu, NULL)\n", err, inode, (unsigned long long)iblock,
				159	(unsigned long long)p_blkno);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	160	goto bail;
				161	}
				162
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	163	/*
				164	* ocfs2 never allocates in this function - the only time we
				165	* need to use BH_New is when we're extending i_size on a file
				166	* system which doesn't support holes, in which case BH_New
				167	* allows block_prepare_write() to zero.
				168	*/
				169	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
				170	"ino %lu, iblock %llu\n", inode->i_ino,
				171	(unsigned long long)iblock);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	172
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	173	if (p_blkno)
				174	map_bh(bh_result, inode->i_sb, p_blkno);
				175
				176	if (!ocfs2_sparse_alloc(osb)) {
				177	if (p_blkno == 0) {
				178	err = -EIO;
				179	mlog(ML_ERROR,
				180	"iblock = %llu p_blkno = %llu blkno=(%llu)\n",
				181	(unsigned long long)iblock,
				182	(unsigned long long)p_blkno,
				183	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				184	mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
				185	dump_stack();
				186	}
				187
				188	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
				189	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
				190	(unsigned long long)past_eof);
				191
				192	if (create && (iblock >= past_eof))
				193	set_buffer_new(bh_result);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	194	}
				195
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	196	bail:
				197	if (err < 0)
				198	err = -EIO;
				199
				200	mlog_exit(err);
				201	return err;
				202	}
				203
				204	static int ocfs2_readpage(struct file file, struct page page)
				205	{
				206	struct inode *inode = page->mapping->host;
				207	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
				208	int ret, unlock = 1;
				209
				210	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
				211
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	212	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	213	if (ret != 0) {
				214	if (ret == AOP_TRUNCATED_PAGE)
				215	unlock = 0;
				216	mlog_errno(ret);
				217	goto out;
				218	}
				219
				220	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				221
				222	/*
				223	* i_size might have just been updated as we grabed the meta lock. We
				224	* might now be discovering a truncate that hit on another node.
				225	* block_read_full_page->get_block freaks out if it is asked to read
				226	* beyond the end of a file, so we check here. Callers
				227	* (generic_file_read, fault->nopage) are clever enough to check i_size
				228	* and notice that the page they just read isn't needed.
				229	*
				230	* XXX sys_readahead() seems to get that wrong?
				231	*/
				232	if (start >= i_size_read(inode)) {
				233	char *addr = kmap(page);
				234	memset(addr, 0, PAGE_SIZE);
				235	flush_dcache_page(page);
				236	kunmap(page);
				237	SetPageUptodate(page);
				238	ret = 0;
				239	goto out_alloc;
				240	}
				241
				242	ret = ocfs2_data_lock_with_page(inode, 0, page);
				243	if (ret != 0) {
				244	if (ret == AOP_TRUNCATED_PAGE)
				245	unlock = 0;
				246	mlog_errno(ret);
				247	goto out_alloc;
				248	}
				249
				250	ret = block_read_full_page(page, ocfs2_get_block);
				251	unlock = 0;
				252
				253	ocfs2_data_unlock(inode, 0);
				254	out_alloc:
				255	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				256	ocfs2_meta_unlock(inode, 0);
				257	out:
				258	if (unlock)
				259	unlock_page(page);
				260	mlog_exit(ret);
				261	return ret;
				262	}
				263
				264	/* Note: Because we don't support holes, our allocation has
				265	* already happened (allocation writes zeros to the file data)
				266	* so we don't have to worry about ordered writes in
				267	* ocfs2_writepage.
				268	*
				269	* ->writepage is called during the process of invalidating the page cache
				270	* during blocked lock processing. It can't block on any cluster locks
				271	* to during block mapping. It's relying on the fact that the block
				272	* mapping can't have disappeared under the dirty pages that it is
				273	* being asked to write back.
				274	*/
				275	static int ocfs2_writepage(struct page page, struct writeback_control wbc)
				276	{
				277	int ret;
				278
				279	mlog_entry("(0x%p)\n", page);
				280
				281	ret = block_write_full_page(page, ocfs2_get_block, wbc);
				282
				283	mlog_exit(ret);
				284
				285	return ret;
				286	}
				287
Mark Fasheh	5069120	2007-02-09 20:52:53 -0800	[diff] [blame]	288	/*
				289	* This is called from ocfs2_write_zero_page() which has handled it's
				290	* own cluster locking and has ensured allocation exists for those
				291	* blocks to be written.
				292	*/
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	293	int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
				294	unsigned from, unsigned to)
				295	{
				296	int ret;
				297
				298	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				299
				300	ret = block_prepare_write(page, from, to, ocfs2_get_block);
				301
				302	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				303
				304	return ret;
				305	}
				306
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	307	/* Taken from ext3. We don't necessarily need the full blown
				308	* functionality yet, but IMHO it's better to cut and paste the whole
				309	* thing so we can avoid introducing our own bugs (and easily pick up
				310	* their fixes when they happen) --Mark */
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame^]	311	int walk_page_buffers( handle_t *handle,
				312	struct buffer_head *head,
				313	unsigned from,
				314	unsigned to,
				315	int *partial,
				316	int (fn)( handle_t handle,
				317	struct buffer_head *bh))
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	318	{
				319	struct buffer_head *bh;
				320	unsigned block_start, block_end;
				321	unsigned blocksize = head->b_size;
				322	int err, ret = 0;
				323	struct buffer_head *next;
				324
				325	for ( bh = head, block_start = 0;
				326	ret == 0 && (bh != head \|\| !block_start);
				327	block_start = block_end, bh = next)
				328	{
				329	next = bh->b_this_page;
				330	block_end = block_start + blocksize;
				331	if (block_end <= from \|\| block_start >= to) {
				332	if (partial && !buffer_uptodate(bh))
				333	*partial = 1;
				334	continue;
				335	}
				336	err = (*fn)(handle, bh);
				337	if (!ret)
				338	ret = err;
				339	}
				340	return ret;
				341	}
				342
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	343	handle_t ocfs2_start_walk_page_trans(struct inode inode,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	344	struct page *page,
				345	unsigned from,
				346	unsigned to)
				347	{
				348	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	349	handle_t *handle = NULL;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	350	int ret = 0;
				351
Mark Fasheh	65eff9c	2006-10-09 17:26:22 -0700	[diff] [blame]	352	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	353	if (!handle) {
				354	ret = -ENOMEM;
				355	mlog_errno(ret);
				356	goto out;
				357	}
				358
				359	if (ocfs2_should_order_data(inode)) {
Mark Fasheh	1fabe14	2006-10-09 18:11:45 -0700	[diff] [blame]	360	ret = walk_page_buffers(handle,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	361	page_buffers(page),
				362	from, to, NULL,
				363	ocfs2_journal_dirty_data);
				364	if (ret < 0)
				365	mlog_errno(ret);
				366	}
				367	out:
				368	if (ret) {
				369	if (handle)
Mark Fasheh	02dc1af	2006-10-09 16:48:10 -0700	[diff] [blame]	370	ocfs2_commit_trans(osb, handle);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	371	handle = ERR_PTR(ret);
				372	}
				373	return handle;
				374	}
				375
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	376	static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
				377	{
				378	sector_t status;
				379	u64 p_blkno = 0;
				380	int err = 0;
				381	struct inode *inode = mapping->host;
				382
				383	mlog_entry("(block = %llu)\n", (unsigned long long)block);
				384
				385	/* We don't need to lock journal system files, since they aren't
				386	* accessed concurrently from multiple nodes.
				387	*/
				388	if (!INODE_JOURNAL(inode)) {
Mark Fasheh	4bcec18	2006-10-09 16:02:40 -0700	[diff] [blame]	389	err = ocfs2_meta_lock(inode, NULL, 0);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	390	if (err) {
				391	if (err != -ENOENT)
				392	mlog_errno(err);
				393	goto bail;
				394	}
				395	down_read(&OCFS2_I(inode)->ip_alloc_sem);
				396	}
				397
Mark Fasheh	363041a	2007-01-17 12:31:35 -0800	[diff] [blame]	398	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL);
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	399
				400	if (!INODE_JOURNAL(inode)) {
				401	up_read(&OCFS2_I(inode)->ip_alloc_sem);
				402	ocfs2_meta_unlock(inode, 0);
				403	}
				404
				405	if (err) {
				406	mlog(ML_ERROR, "get_blocks() failed, block = %llu\n",
				407	(unsigned long long)block);
				408	mlog_errno(err);
				409	goto bail;
				410	}
				411
				412
				413	bail:
				414	status = err ? 0 : p_blkno;
				415
				416	mlog_exit((int)status);
				417
				418	return status;
				419	}
				420
				421	/*
				422	* TODO: Make this into a generic get_blocks function.
				423	*
				424	* From do_direct_io in direct-io.c:
				425	* "So what we do is to permit the ->get_blocks function to populate
				426	* bh.b_size with the size of IO which is permitted at this offset and
				427	* this i_blkbits."
				428	*
				429	* This function is called directly from get_more_blocks in direct-io.c.
				430	*
				431	* called like this: dio->get_blocks(dio->inode, fs_startblk,
				432	* fs_count, map_bh, dio->rw == WRITE);
				433	*/
				434	static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	435	struct buffer_head *bh_result, int create)
				436	{
				437	int ret;
Mark Fasheh	564f8a3	2006-12-14 13:01:05 -0800	[diff] [blame]	438	u64 p_blkno, inode_blocks;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	439	int contig_blocks;
Florin Malita	184d7d2	2006-06-03 19:30:10 -0400	[diff] [blame]	440	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
Badari Pulavarty	1d8fa7a	2006-03-26 01:38:02 -0800	[diff] [blame]	441	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	442
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	443	/* This function won't even be called if the request isn't all
				444	* nicely aligned and of the right size, so there's no need
				445	* for us to check any of that. */
				446
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	447	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
Mark Fasheh	564f8a3	2006-12-14 13:01:05 -0800	[diff] [blame]	448
				449	/*
				450	* Any write past EOF is not allowed because we'd be extending.
				451	*/
				452	if (create && (iblock + max_blocks) > inode_blocks) {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	453	ret = -EIO;
				454	goto bail;
				455	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	456
				457	/* This figures out the size of the next contiguous block, and
				458	* our logical offset */
Mark Fasheh	363041a	2007-01-17 12:31:35 -0800	[diff] [blame]	459	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	460	&contig_blocks);
				461	if (ret) {
				462	mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
				463	(unsigned long long)iblock);
				464	ret = -EIO;
				465	goto bail;
				466	}
				467
Mark Fasheh	25baf2d	2007-02-14 15:30:30 -0800	[diff] [blame]	468	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
				469	ocfs2_error(inode->i_sb,
				470	"Inode %llu has a hole at block %llu\n",
				471	(unsigned long long)OCFS2_I(inode)->ip_blkno,
				472	(unsigned long long)iblock);
				473	ret = -EROFS;
				474	goto bail;
				475	}
				476
				477	/*
				478	* get_more_blocks() expects us to describe a hole by clearing
				479	* the mapped bit on bh_result().
				480	*/
				481	if (p_blkno)
				482	map_bh(bh_result, inode->i_sb, p_blkno);
				483	else {
				484	/*
				485	* ocfs2_prepare_inode_for_write() should have caught
				486	* the case where we'd be filling a hole and triggered
				487	* a buffered write instead.
				488	*/
				489	if (create) {
				490	ret = -EIO;
				491	mlog_errno(ret);
				492	goto bail;
				493	}
				494
				495	clear_buffer_mapped(bh_result);
				496	}
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	497
				498	/* make sure we don't map more than max_blocks blocks here as
				499	that's all the kernel will handle at this point. */
				500	if (max_blocks < contig_blocks)
				501	contig_blocks = max_blocks;
				502	bh_result->b_size = contig_blocks << blocksize_bits;
				503	bail:
				504	return ret;
				505	}
				506
				507	/*
				508	* ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
				509	* particularly interested in the aio/dio case. Like the core uses
				510	* i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from
				511	* truncation on another.
				512	*/
				513	static void ocfs2_dio_end_io(struct kiocb *iocb,
				514	loff_t offset,
				515	ssize_t bytes,
				516	void *private)
				517	{
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	518	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	519
				520	/* this io's submitter should not have unlocked this before we could */
				521	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
				522	ocfs2_iocb_clear_rw_locked(iocb);
				523	up_read(&inode->i_alloc_sem);
				524	ocfs2_rw_unlock(inode, 0);
				525	}
				526
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	527	/*
				528	* ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
				529	* from ext3. PageChecked() bits have been removed as OCFS2 does not
				530	* do journalled data.
				531	*/
				532	static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
				533	{
				534	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
				535
				536	journal_invalidatepage(journal, page, offset);
				537	}
				538
				539	static int ocfs2_releasepage(struct page *page, gfp_t wait)
				540	{
				541	journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
				542
				543	if (!page_has_buffers(page))
				544	return 0;
				545	return journal_try_to_free_buffers(journal, page, wait);
				546	}
				547
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	548	static ssize_t ocfs2_direct_IO(int rw,
				549	struct kiocb *iocb,
				550	const struct iovec *iov,
				551	loff_t offset,
				552	unsigned long nr_segs)
				553	{
				554	struct file *file = iocb->ki_filp;
Josef Sipek	d28c917	2006-12-08 02:37:25 -0800	[diff] [blame]	555	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	556	int ret;
				557
				558	mlog_entry_void();
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	559
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	560	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
				561	/*
				562	* We get PR data locks even for O_DIRECT. This
				563	* allows concurrent O_DIRECT I/O but doesn't let
				564	* O_DIRECT with extending and buffered zeroing writes
				565	* race. If they did race then the buffered zeroing
				566	* could be written back after the O_DIRECT I/O. It's
				567	* one thing to tell people not to mix buffered and
				568	* O_DIRECT writes, but expecting them to understand
				569	* that file extension is also an implicit buffered
				570	* write is too much. By getting the PR we force
				571	* writeback of the buffered zeroing before
				572	* proceeding.
				573	*/
				574	ret = ocfs2_data_lock(inode, 0);
				575	if (ret < 0) {
				576	mlog_errno(ret);
				577	goto out;
				578	}
				579	ocfs2_data_unlock(inode, 0);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	580	}
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	581
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	582	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
				583	inode->i_sb->s_bdev, iov, offset,
				584	nr_segs,
				585	ocfs2_direct_IO_get_blocks,
				586	ocfs2_dio_end_io);
Mark Fasheh	53013cb	2006-05-05 19:04:03 -0700	[diff] [blame]	587	out:
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	588	mlog_exit(ret);
				589	return ret;
				590	}
				591
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	592	static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
				593	u32 cpos,
				594	unsigned int *start,
				595	unsigned int *end)
				596	{
				597	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
				598
				599	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
				600	unsigned int cpp;
				601
				602	cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
				603
				604	cluster_start = cpos % cpp;
				605	cluster_start = cluster_start << osb->s_clustersize_bits;
				606
				607	cluster_end = cluster_start + osb->s_clustersize;
				608	}
				609
				610	BUG_ON(cluster_start > PAGE_SIZE);
				611	BUG_ON(cluster_end > PAGE_SIZE);
				612
				613	if (start)
				614	*start = cluster_start;
				615	if (end)
				616	*end = cluster_end;
				617	}
				618
				619	/*
				620	* 'from' and 'to' are the region in the page to avoid zeroing.
				621	*
				622	* If pagesize > clustersize, this function will avoid zeroing outside
				623	* of the cluster boundary.
				624	*
				625	* from == to == 0 is code for "zero the entire cluster region"
				626	*/
				627	static void ocfs2_clear_page_regions(struct page *page,
				628	struct ocfs2_super *osb, u32 cpos,
				629	unsigned from, unsigned to)
				630	{
				631	void *kaddr;
				632	unsigned int cluster_start, cluster_end;
				633
				634	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
				635
				636	kaddr = kmap_atomic(page, KM_USER0);
				637
				638	if (from \|\| to) {
				639	if (from > cluster_start)
				640	memset(kaddr + cluster_start, 0, from - cluster_start);
				641	if (to < cluster_end)
				642	memset(kaddr + to, 0, cluster_end - to);
				643	} else {
				644	memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
				645	}
				646
				647	kunmap_atomic(kaddr, KM_USER0);
				648	}
				649
				650	/*
				651	* Some of this taken from block_prepare_write(). We already have our
				652	* mapping by now though, and the entire write will be allocating or
				653	* it won't, so not much need to use BH_New.
				654	*
				655	* This will also skip zeroing, which is handled externally.
				656	*/
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame^]	657	int ocfs2_map_page_blocks(struct page page, u64 p_blkno,
				658	struct inode *inode, unsigned int from,
				659	unsigned int to, int new)
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	660	{
				661	int ret = 0;
				662	struct buffer_head head, bh, wait[2], *wait_bh = wait;
				663	unsigned int block_end, block_start;
				664	unsigned int bsize = 1 << inode->i_blkbits;
				665
				666	if (!page_has_buffers(page))
				667	create_empty_buffers(page, bsize, 0);
				668
				669	head = page_buffers(page);
				670	for (bh = head, block_start = 0; bh != head \|\| !block_start;
				671	bh = bh->b_this_page, block_start += bsize) {
				672	block_end = block_start + bsize;
				673
				674	/*
				675	* Ignore blocks outside of our i/o range -
				676	* they may belong to unallocated clusters.
				677	*/
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame^]	678	if (block_start >= to \|\| block_end <= from) {
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	679	if (PageUptodate(page))
				680	set_buffer_uptodate(bh);
				681	continue;
				682	}
				683
				684	/*
				685	* For an allocating write with cluster size >= page
				686	* size, we always write the entire page.
				687	*/
				688
				689	if (buffer_new(bh))
				690	clear_buffer_new(bh);
				691
				692	if (!buffer_mapped(bh)) {
				693	map_bh(bh, inode->i_sb, *p_blkno);
				694	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				695	}
				696
				697	if (PageUptodate(page)) {
				698	if (!buffer_uptodate(bh))
				699	set_buffer_uptodate(bh);
				700	} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
				701	(block_start < from \|\| block_end > to)) {
				702	ll_rw_block(READ, 1, &bh);
				703	*wait_bh++=bh;
				704	}
				705
				706	p_blkno = p_blkno + 1;
				707	}
				708
				709	/*
				710	* If we issued read requests - let them complete.
				711	*/
				712	while(wait_bh > wait) {
				713	wait_on_buffer(*--wait_bh);
				714	if (!buffer_uptodate(*wait_bh))
				715	ret = -EIO;
				716	}
				717
				718	if (ret == 0 \|\| !new)
				719	return ret;
				720
				721	/*
				722	* If we get -EIO above, zero out any newly allocated blocks
				723	* to avoid exposing stale data.
				724	*/
				725	bh = head;
				726	block_start = 0;
				727	do {
				728	void *kaddr;
				729
				730	block_end = block_start + bsize;
				731	if (block_end <= from)
				732	goto next_bh;
				733	if (block_start >= to)
				734	break;
				735
				736	kaddr = kmap_atomic(page, KM_USER0);
				737	memset(kaddr+block_start, 0, bh->b_size);
				738	flush_dcache_page(page);
				739	kunmap_atomic(kaddr, KM_USER0);
				740	set_buffer_uptodate(bh);
				741	mark_buffer_dirty(bh);
				742
				743	next_bh:
				744	block_start = block_end;
				745	bh = bh->b_this_page;
				746	} while (bh != head);
				747
				748	return ret;
				749	}
				750
				751	/*
				752	* This will copy user data from the iovec in the buffered write
				753	* context.
				754	*/
				755	int ocfs2_map_and_write_user_data(struct inode *inode,
				756	struct ocfs2_write_ctxt wc, u64 p_blkno,
				757	unsigned int ret_from, unsigned int ret_to)
				758	{
				759	int ret;
				760	unsigned int to, from, cluster_start, cluster_end;
				761	unsigned long bytes, src_from;
				762	char *dst;
				763	struct ocfs2_buffered_write_priv *bp = wc->w_private;
				764	const struct iovec *cur_iov = bp->b_cur_iov;
				765	char __user *buf;
				766	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				767
				768	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
				769	&cluster_end);
				770
				771	buf = cur_iov->iov_base + bp->b_cur_off;
				772	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
				773
				774	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
				775
				776	/*
				777	* This is a lot of comparisons, but it reads quite
				778	* easily, which is important here.
				779	*/
				780	/* Stay within the src page */
				781	bytes = PAGE_SIZE - src_from;
				782	/* Stay within the vector */
				783	bytes = min(bytes,
				784	(unsigned long)(cur_iov->iov_len - bp->b_cur_off));
				785	/* Stay within count */
				786	bytes = min(bytes, (unsigned long)wc->w_count);
				787	/*
				788	* For clustersize > page size, just stay within
				789	* target page, otherwise we have to calculate pos
				790	* within the cluster and obey the rightmost
				791	* boundary.
				792	*/
				793	if (wc->w_large_pages) {
				794	/*
				795	* For cluster size < page size, we have to
				796	* calculate pos within the cluster and obey
				797	* the rightmost boundary.
				798	*/
				799	bytes = min(bytes, (unsigned long)(osb->s_clustersize
				800	- (wc->w_pos & (osb->s_clustersize - 1))));
				801	} else {
				802	/*
				803	* cluster size > page size is the most common
				804	* case - we just stay within the target page
				805	* boundary.
				806	*/
				807	bytes = min(bytes, PAGE_CACHE_SIZE - from);
				808	}
				809
				810	to = from + bytes;
				811
				812	if (wc->w_this_page_new)
				813	ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
				814	cluster_start, cluster_end, 1);
				815	else
				816	ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
				817	from, to, 0);
				818	if (ret) {
				819	mlog_errno(ret);
				820	goto out;
				821	}
				822
				823	BUG_ON(from > PAGE_CACHE_SIZE);
				824	BUG_ON(to > PAGE_CACHE_SIZE);
				825	BUG_ON(from > osb->s_clustersize);
				826	BUG_ON(to > osb->s_clustersize);
				827
				828	dst = kmap(wc->w_this_page);
				829	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
				830	kunmap(wc->w_this_page);
				831
				832	/*
				833	* XXX: This is slow, but simple. The caller of
				834	* ocfs2_buffered_write_cluster() is responsible for
				835	* passing through the iovecs, so it's difficult to
				836	* predict what our next step is in here after our
				837	* initial write. A future version should be pushing
				838	* that iovec manipulation further down.
				839	*
				840	* By setting this, we indicate that a copy from user
				841	* data was done, and subsequent calls for this
				842	* cluster will skip copying more data.
				843	*/
				844	wc->w_finished_copy = 1;
				845
				846	*ret_from = from;
				847	*ret_to = to;
				848	out:
				849
				850	return bytes ? (unsigned int)bytes : ret;
				851	}
				852
				853	/*
				854	* Map, fill and write a page to disk.
				855	*
				856	* The work of copying data is done via callback. Newly allocated
				857	* pages which don't take user data will be zero'd (set 'new' to
				858	* indicate an allocating write)
				859	*
				860	* Returns a negative error code or the number of bytes copied into
				861	* the page.
				862	*/
				863	int ocfs2_write_data_page(struct inode inode, handle_t handle,
				864	u64 p_blkno, struct page page,
				865	struct ocfs2_write_ctxt *wc, int new)
				866	{
				867	int ret, copied = 0;
				868	unsigned int from = 0, to = 0;
				869	unsigned int cluster_start, cluster_end;
				870	unsigned int zero_from = 0, zero_to = 0;
				871
				872	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
				873	&cluster_start, &cluster_end);
				874
				875	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
				876	&& !wc->w_finished_copy) {
				877
				878	wc->w_this_page = page;
				879	wc->w_this_page_new = new;
				880	ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
				881	if (ret < 0) {
				882	mlog_errno(ret);
				883	goto out;
				884	}
				885
				886	copied = ret;
				887
				888	zero_from = from;
				889	zero_to = to;
				890	if (new) {
				891	from = cluster_start;
				892	to = cluster_end;
				893	}
				894	} else {
				895	/*
				896	* If we haven't allocated the new page yet, we
				897	* shouldn't be writing it out without copying user
				898	* data. This is likely a math error from the caller.
				899	*/
				900	BUG_ON(!new);
				901
				902	from = cluster_start;
				903	to = cluster_end;
				904
				905	ret = ocfs2_map_page_blocks(page, p_blkno, inode,
				906	cluster_start, cluster_end, 1);
				907	if (ret) {
				908	mlog_errno(ret);
				909	goto out;
				910	}
				911	}
				912
				913	/*
				914	* Parts of newly allocated pages need to be zero'd.
				915	*
				916	* Above, we have also rewritten 'to' and 'from' - as far as
				917	* the rest of the function is concerned, the entire cluster
				918	* range inside of a page needs to be written.
				919	*
				920	* We can skip this if the page is up to date - it's already
				921	* been zero'd from being read in as a hole.
				922	*/
				923	if (new && !PageUptodate(page))
				924	ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
				925	wc->w_cpos, zero_from, zero_to);
				926
				927	flush_dcache_page(page);
				928
				929	if (ocfs2_should_order_data(inode)) {
				930	ret = walk_page_buffers(handle,
				931	page_buffers(page),
				932	from, to, NULL,
				933	ocfs2_journal_dirty_data);
				934	if (ret < 0)
				935	mlog_errno(ret);
				936	}
				937
				938	/*
				939	* We don't use generic_commit_write() because we need to
				940	* handle our own i_size update.
				941	*/
				942	ret = block_commit_write(page, from, to);
				943	if (ret)
				944	mlog_errno(ret);
				945	out:
				946
				947	return copied ? copied : ret;
				948	}
				949
				950	/*
				951	* Do the actual write of some data into an inode. Optionally allocate
				952	* in order to fulfill the write.
				953	*
				954	* cpos is the logical cluster offset within the file to write at
				955	*
				956	* 'phys' is the physical mapping of that offset. a 'phys' value of
				957	* zero indicates that allocation is required. In this case, data_ac
				958	* and meta_ac should be valid (meta_ac can be null if metadata
				959	* allocation isn't required).
				960	*/
				961	static ssize_t ocfs2_write(struct file file, u32 phys, handle_t handle,
				962	struct buffer_head *di_bh,
				963	struct ocfs2_alloc_context *data_ac,
				964	struct ocfs2_alloc_context *meta_ac,
				965	struct ocfs2_write_ctxt *wc)
				966	{
				967	int ret, i, numpages = 1, new;
				968	unsigned int copied = 0;
				969	u32 tmp_pos;
				970	u64 v_blkno, p_blkno;
				971	struct address_space *mapping = file->f_mapping;
				972	struct inode *inode = mapping->host;
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	973	unsigned long index, start;
				974	struct page **cpages;
				975
				976	new = phys == 0 ? 1 : 0;
				977
				978	/*
				979	* Figure out how many pages we'll be manipulating here. For
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame^]	980	* non allocating write, we just change the one
				981	* page. Otherwise, we'll need a whole clusters worth.
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	982	*/
Mark Fasheh	60b1139	2007-02-16 11:46:50 -0800	[diff] [blame^]	983	if (new)
				984	numpages = ocfs2_pages_per_cluster(inode->i_sb);
Mark Fasheh	9517bac	2007-02-09 20:24:12 -0800	[diff] [blame]	985
				986	cpages = kzalloc(sizeof(cpages) numpages, GFP_NOFS);
				987	if (!cpages) {
				988	ret = -ENOMEM;
				989	mlog_errno(ret);
				990	return ret;
				991	}
				992
				993	/*
				994	* Fill our page array first. That way we've grabbed enough so
				995	* that we can zero and flush if we error after adding the
				996	* extent.
				997	*/
				998	if (new) {
				999	start = ocfs2_align_clusters_to_page_index(inode->i_sb,
				1000	wc->w_cpos);
				1001	v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
				1002	} else {
				1003	start = wc->w_pos >> PAGE_CACHE_SHIFT;
				1004	v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
				1005	}
				1006
				1007	for(i = 0; i < numpages; i++) {
				1008	index = start + i;
				1009
				1010	cpages[i] = grab_cache_page(mapping, index);
				1011	if (!cpages[i]) {
				1012	ret = -ENOMEM;
				1013	mlog_errno(ret);
				1014	goto out;
				1015	}
				1016	}
				1017
				1018	if (new) {
				1019	/*
				1020	* This is safe to call with the page locks - it won't take
				1021	* any additional semaphores or cluster locks.
				1022	*/
				1023	tmp_pos = wc->w_cpos;
				1024	ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
				1025	&tmp_pos, 1, di_bh, handle,
				1026	data_ac, meta_ac, NULL);
				1027	/*
				1028	* This shouldn't happen because we must have already
				1029	* calculated the correct meta data allocation required. The
				1030	* internal tree allocation code should know how to increase
				1031	* transaction credits itself.
				1032	*
				1033	* If need be, we could handle -EAGAIN for a
				1034	* RESTART_TRANS here.
				1035	*/
				1036	mlog_bug_on_msg(ret == -EAGAIN,
				1037	"Inode %llu: EAGAIN return during allocation.\n",
				1038	(unsigned long long)OCFS2_I(inode)->ip_blkno);
				1039	if (ret < 0) {
				1040	mlog_errno(ret);
				1041	goto out;
				1042	}
				1043	}
				1044
				1045	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL);
				1046	if (ret < 0) {
				1047
				1048	/*
				1049	* XXX: Should we go readonly here?
				1050	*/
				1051
				1052	mlog_errno(ret);
				1053	goto out;
				1054	}
				1055
				1056	BUG_ON(p_blkno == 0);
				1057
				1058	for(i = 0; i < numpages; i++) {
				1059	ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
				1060	wc, new);
				1061	if (ret < 0) {
				1062	mlog_errno(ret);
				1063	goto out;
				1064	}
				1065
				1066	copied += ret;
				1067	}
				1068
				1069	out:
				1070	for(i = 0; i < numpages; i++) {
				1071	unlock_page(cpages[i]);
				1072	mark_page_accessed(cpages[i]);
				1073	page_cache_release(cpages[i]);
				1074	}
				1075	kfree(cpages);
				1076
				1077	return copied ? copied : ret;
				1078	}
				1079
				1080	static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
				1081	struct ocfs2_super *osb, loff_t pos,
				1082	size_t count, ocfs2_page_writer *cb,
				1083	void *cb_priv)
				1084	{
				1085	wc->w_count = count;
				1086	wc->w_pos = pos;
				1087	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
				1088	wc->w_finished_copy = 0;
				1089
				1090	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
				1091	wc->w_large_pages = 1;
				1092	else
				1093	wc->w_large_pages = 0;
				1094
				1095	wc->w_write_data_page = cb;
				1096	wc->w_private = cb_priv;
				1097	}
				1098
				1099	/*
				1100	* Write a cluster to an inode. The cluster may not be allocated yet,
				1101	* in which case it will be. This only exists for buffered writes -
				1102	* O_DIRECT takes a more "traditional" path through the kernel.
				1103	*
				1104	* The caller is responsible for incrementing pos, written counts, etc
				1105	*
				1106	* For file systems that don't support sparse files, pre-allocation
				1107	* and page zeroing up until cpos should be done prior to this
				1108	* function call.
				1109	*
				1110	* Callers should be holding i_sem, and the rw cluster lock.
				1111	*
				1112	* Returns the number of user bytes written, or less than zero for
				1113	* error.
				1114	*/
				1115	ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
				1116	size_t count, ocfs2_page_writer *actor,
				1117	void *priv)
				1118	{
				1119	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
				1120	ssize_t written = 0;
				1121	u32 phys;
				1122	struct inode *inode = file->f_mapping->host;
				1123	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1124	struct buffer_head *di_bh = NULL;
				1125	struct ocfs2_dinode *di;
				1126	struct ocfs2_alloc_context *data_ac = NULL;
				1127	struct ocfs2_alloc_context *meta_ac = NULL;
				1128	handle_t *handle;
				1129	struct ocfs2_write_ctxt wc;
				1130
				1131	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
				1132
				1133	ret = ocfs2_meta_lock(inode, &di_bh, 1);
				1134	if (ret) {
				1135	mlog_errno(ret);
				1136	goto out;
				1137	}
				1138	di = (struct ocfs2_dinode *)di_bh->b_data;
				1139
				1140	/*
				1141	* Take alloc sem here to prevent concurrent lookups. That way
				1142	* the mapping, zeroing and tree manipulation within
				1143	* ocfs2_write() will be safe against ->readpage(). This
				1144	* should also serve to lock out allocation from a shared
				1145	* writeable region.
				1146	*/
				1147	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				1148
				1149	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL);
				1150	if (ret) {
				1151	mlog_errno(ret);
				1152	goto out_meta;
				1153	}
				1154
				1155	/* phys == 0 means that allocation is required. */
				1156	if (phys == 0) {
				1157	ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
				1158	if (ret) {
				1159	mlog_errno(ret);
				1160	goto out_meta;
				1161	}
				1162
				1163	credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
				1164	}
				1165
				1166	ret = ocfs2_data_lock(inode, 1);
				1167	if (ret) {
				1168	mlog_errno(ret);
				1169	goto out_meta;
				1170	}
				1171
				1172	handle = ocfs2_start_trans(osb, credits);
				1173	if (IS_ERR(handle)) {
				1174	ret = PTR_ERR(handle);
				1175	mlog_errno(ret);
				1176	goto out_data;
				1177	}
				1178
				1179	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
				1180	meta_ac, &wc);
				1181	if (written < 0) {
				1182	ret = written;
				1183	mlog_errno(ret);
				1184	goto out_commit;
				1185	}
				1186
				1187	ret = ocfs2_journal_access(handle, inode, di_bh,
				1188	OCFS2_JOURNAL_ACCESS_WRITE);
				1189	if (ret) {
				1190	mlog_errno(ret);
				1191	goto out_commit;
				1192	}
				1193
				1194	pos += written;
				1195	if (pos > inode->i_size) {
				1196	i_size_write(inode, pos);
				1197	mark_inode_dirty(inode);
				1198	}
				1199	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
				1200	di->i_size = cpu_to_le64((u64)i_size_read(inode));
				1201	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
				1202	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
				1203	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
				1204
				1205	ret = ocfs2_journal_dirty(handle, di_bh);
				1206	if (ret)
				1207	mlog_errno(ret);
				1208
				1209	out_commit:
				1210	ocfs2_commit_trans(osb, handle);
				1211
				1212	out_data:
				1213	ocfs2_data_unlock(inode, 1);
				1214
				1215	out_meta:
				1216	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				1217	ocfs2_meta_unlock(inode, 1);
				1218
				1219	out:
				1220	brelse(di_bh);
				1221	if (data_ac)
				1222	ocfs2_free_alloc_context(data_ac);
				1223	if (meta_ac)
				1224	ocfs2_free_alloc_context(meta_ac);
				1225
				1226	return written ? written : ret;
				1227	}
				1228
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	1229	const struct address_space_operations ocfs2_aops = {
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1230	.readpage = ocfs2_readpage,
				1231	.writepage = ocfs2_writepage,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1232	.bmap = ocfs2_bmap,
				1233	.sync_page = block_sync_page,
Joel Becker	03f981c	2007-01-04 14:54:41 -0800	[diff] [blame]	1234	.direct_IO = ocfs2_direct_IO,
				1235	.invalidatepage = ocfs2_invalidatepage,
				1236	.releasepage = ocfs2_releasepage,
				1237	.migratepage = buffer_migrate_page,
Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame]	1238	};