Blame - fs/ocfs2/file.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: 72ae9e3306f40dd6fc648e1117de7b061b52244f [file] [log] [blame]

Mark Fasheh	ccd979b	2005-12-15 14:31:24 -0800	[diff] [blame^]	1	/* -*- mode: c; c-basic-offset: 8; -*-
				2	* vim: noexpandtab sw=8 ts=8 sts=0:
				3	*
				4	* file.c
				5	*
				6	* File open, close, extend, truncate
				7	*
				8	* Copyright (C) 2002, 2004 Oracle. All rights reserved.
				9	*
				10	* This program is free software; you can redistribute it and/or
				11	* modify it under the terms of the GNU General Public
				12	* License as published by the Free Software Foundation; either
				13	* version 2 of the License, or (at your option) any later version.
				14	*
				15	* This program is distributed in the hope that it will be useful,
				16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				18	* General Public License for more details.
				19	*
				20	* You should have received a copy of the GNU General Public
				21	* License along with this program; if not, write to the
				22	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				23	* Boston, MA 02111-1307, USA.
				24	*/
				25
				26	#include <linux/fs.h>
				27	#include <linux/types.h>
				28	#include <linux/slab.h>
				29	#include <linux/highmem.h>
				30	#include <linux/pagemap.h>
				31	#include <linux/uio.h>
				32
				33	#define MLOG_MASK_PREFIX ML_INODE
				34	#include <cluster/masklog.h>
				35
				36	#include "ocfs2.h"
				37
				38	#include "alloc.h"
				39	#include "aops.h"
				40	#include "dir.h"
				41	#include "dlmglue.h"
				42	#include "extent_map.h"
				43	#include "file.h"
				44	#include "sysfile.h"
				45	#include "inode.h"
				46	#include "journal.h"
				47	#include "mmap.h"
				48	#include "suballoc.h"
				49	#include "super.h"
				50
				51	#include "buffer_head_io.h"
				52
				53	static int ocfs2_sync_inode(struct inode *inode)
				54	{
				55	filemap_fdatawrite(inode->i_mapping);
				56	return sync_mapping_buffers(inode->i_mapping);
				57	}
				58
				59	static int ocfs2_file_open(struct inode inode, struct file file)
				60	{
				61	int status;
				62	int mode = file->f_flags;
				63	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				64
				65	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
				66	file->f_dentry->d_name.len, file->f_dentry->d_name.name);
				67
				68	spin_lock(&oi->ip_lock);
				69
				70	/* Check that the inode hasn't been wiped from disk by another
				71	* node. If it hasn't then we're safe as long as we hold the
				72	* spin lock until our increment of open count. */
				73	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
				74	spin_unlock(&oi->ip_lock);
				75
				76	status = -ENOENT;
				77	goto leave;
				78	}
				79
				80	if (mode & O_DIRECT)
				81	oi->ip_flags \|= OCFS2_INODE_OPEN_DIRECT;
				82
				83	oi->ip_open_count++;
				84	spin_unlock(&oi->ip_lock);
				85	status = 0;
				86	leave:
				87	mlog_exit(status);
				88	return status;
				89	}
				90
				91	static int ocfs2_file_release(struct inode inode, struct file file)
				92	{
				93	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				94
				95	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
				96	file->f_dentry->d_name.len,
				97	file->f_dentry->d_name.name);
				98
				99	spin_lock(&oi->ip_lock);
				100	if (!--oi->ip_open_count)
				101	oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
				102	spin_unlock(&oi->ip_lock);
				103
				104	mlog_exit(0);
				105
				106	return 0;
				107	}
				108
				109	static int ocfs2_sync_file(struct file *file,
				110	struct dentry *dentry,
				111	int datasync)
				112	{
				113	int err = 0;
				114	journal_t *journal;
				115	struct inode *inode = dentry->d_inode;
				116	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				117
				118	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
				119	dentry->d_name.len, dentry->d_name.name);
				120
				121	err = ocfs2_sync_inode(dentry->d_inode);
				122	if (err)
				123	goto bail;
				124
				125	journal = osb->journal->j_journal;
				126	err = journal_force_commit(journal);
				127
				128	bail:
				129	mlog_exit(err);
				130
				131	return (err < 0) ? -EIO : 0;
				132	}
				133
				134	int ocfs2_set_inode_size(struct ocfs2_journal_handle *handle,
				135	struct inode *inode,
				136	struct buffer_head *fe_bh,
				137	u64 new_i_size)
				138	{
				139	int status;
				140
				141	mlog_entry_void();
				142	i_size_write(inode, new_i_size);
				143	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
				144	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
				145
				146	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
				147	if (status < 0) {
				148	mlog_errno(status);
				149	goto bail;
				150	}
				151
				152	bail:
				153	mlog_exit(status);
				154	return status;
				155	}
				156
				157	static int ocfs2_simple_size_update(struct inode *inode,
				158	struct buffer_head *di_bh,
				159	u64 new_i_size)
				160	{
				161	int ret;
				162	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				163	struct ocfs2_journal_handle *handle = NULL;
				164
				165	handle = ocfs2_start_trans(osb, NULL,
				166	OCFS2_INODE_UPDATE_CREDITS);
				167	if (handle == NULL) {
				168	ret = -ENOMEM;
				169	mlog_errno(ret);
				170	goto out;
				171	}
				172
				173	ret = ocfs2_set_inode_size(handle, inode, di_bh,
				174	new_i_size);
				175	if (ret < 0)
				176	mlog_errno(ret);
				177
				178	ocfs2_commit_trans(handle);
				179	out:
				180	return ret;
				181	}
				182
				183	static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
				184	struct inode *inode,
				185	struct buffer_head *fe_bh,
				186	u64 new_i_size)
				187	{
				188	int status;
				189	struct ocfs2_journal_handle *handle;
				190
				191	mlog_entry_void();
				192
				193	/* TODO: This needs to actually orphan the inode in this
				194	* transaction. */
				195
				196	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				197	if (IS_ERR(handle)) {
				198	status = PTR_ERR(handle);
				199	mlog_errno(status);
				200	goto out;
				201	}
				202
				203	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
				204	if (status < 0)
				205	mlog_errno(status);
				206
				207	ocfs2_commit_trans(handle);
				208	out:
				209	mlog_exit(status);
				210	return status;
				211	}
				212
				213	static int ocfs2_truncate_file(struct inode *inode,
				214	struct buffer_head *di_bh,
				215	u64 new_i_size)
				216	{
				217	int status = 0;
				218	struct ocfs2_dinode *fe = NULL;
				219	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				220	struct ocfs2_truncate_context *tc = NULL;
				221
				222	mlog_entry("(inode = %"MLFu64", new_i_size = %"MLFu64"\n",
				223	OCFS2_I(inode)->ip_blkno, new_i_size);
				224
				225	truncate_inode_pages(inode->i_mapping, new_i_size);
				226
				227	fe = (struct ocfs2_dinode *) di_bh->b_data;
				228	if (!OCFS2_IS_VALID_DINODE(fe)) {
				229	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				230	status = -EIO;
				231	goto bail;
				232	}
				233
				234	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
				235	"Inode %"MLFu64", inode i_size = %lld != di "
				236	"i_size = %"MLFu64", i_flags = 0x%x\n",
				237	OCFS2_I(inode)->ip_blkno,
				238	i_size_read(inode),
				239	le64_to_cpu(fe->i_size), le32_to_cpu(fe->i_flags));
				240
				241	if (new_i_size > le64_to_cpu(fe->i_size)) {
				242	mlog(0, "asked to truncate file with size (%"MLFu64") "
				243	"to size (%"MLFu64")!\n",
				244	le64_to_cpu(fe->i_size), new_i_size);
				245	status = -EINVAL;
				246	mlog_errno(status);
				247	goto bail;
				248	}
				249
				250	mlog(0, "inode %"MLFu64", i_size = %"MLFu64", new_i_size = %"MLFu64"\n",
				251	le64_to_cpu(fe->i_blkno), le64_to_cpu(fe->i_size), new_i_size);
				252
				253	/* lets handle the simple truncate cases before doing any more
				254	* cluster locking. */
				255	if (new_i_size == le64_to_cpu(fe->i_size))
				256	goto bail;
				257
				258	if (le32_to_cpu(fe->i_clusters) ==
				259	ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
				260	mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
				261	fe->i_clusters);
				262	/* No allocation change is required, so lets fast path
				263	* this truncate. */
				264	status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				265	if (status < 0)
				266	mlog_errno(status);
				267	goto bail;
				268	}
				269
				270	/* This forces other nodes to sync and drop their pages */
				271	status = ocfs2_data_lock(inode, 1);
				272	if (status < 0) {
				273	mlog_errno(status);
				274	goto bail;
				275	}
				276	ocfs2_data_unlock(inode, 1);
				277
				278	/* alright, we're going to need to do a full blown alloc size
				279	* change. Orphan the inode so that recovery can complete the
				280	* truncate if necessary. This does the task of marking
				281	* i_size. */
				282	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
				283	if (status < 0) {
				284	mlog_errno(status);
				285	goto bail;
				286	}
				287
				288	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
				289	if (status < 0) {
				290	mlog_errno(status);
				291	goto bail;
				292	}
				293
				294	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
				295	if (status < 0) {
				296	mlog_errno(status);
				297	goto bail;
				298	}
				299
				300	/* TODO: orphan dir cleanup here. */
				301	bail:
				302
				303	mlog_exit(status);
				304	return status;
				305	}
				306
				307	/*
				308	* extend allocation only here.
				309	* we'll update all the disk stuff, and oip->alloc_size
				310	*
				311	* expect stuff to be locked, a transaction started and enough data /
				312	* metadata reservations in the contexts.
				313	*
				314	* Will return -EAGAIN, and a reason if a restart is needed.
				315	* If passed in, *reason will always be set, even in error.
				316	*/
				317	int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
				318	struct inode *inode,
				319	u32 clusters_to_add,
				320	struct buffer_head *fe_bh,
				321	struct ocfs2_journal_handle *handle,
				322	struct ocfs2_alloc_context *data_ac,
				323	struct ocfs2_alloc_context *meta_ac,
				324	enum ocfs2_alloc_restarted *reason_ret)
				325	{
				326	int status = 0;
				327	int free_extents;
				328	struct ocfs2_dinode fe = (struct ocfs2_dinode ) fe_bh->b_data;
				329	enum ocfs2_alloc_restarted reason = RESTART_NONE;
				330	u32 bit_off, num_bits;
				331	u64 block;
				332
				333	BUG_ON(!clusters_to_add);
				334
				335	free_extents = ocfs2_num_free_extents(osb, inode, fe);
				336	if (free_extents < 0) {
				337	status = free_extents;
				338	mlog_errno(status);
				339	goto leave;
				340	}
				341
				342	/* there are two cases which could cause us to EAGAIN in the
				343	* we-need-more-metadata case:
				344	* 1) we haven't reserved any
				345	* 2) we are so fragmented, we've needed to add metadata too
				346	* many times. */
				347	if (!free_extents && !meta_ac) {
				348	mlog(0, "we haven't reserved any metadata!\n");
				349	status = -EAGAIN;
				350	reason = RESTART_META;
				351	goto leave;
				352	} else if ((!free_extents)
				353	&& (ocfs2_alloc_context_bits_left(meta_ac)
				354	< ocfs2_extend_meta_needed(fe))) {
				355	mlog(0, "filesystem is really fragmented...\n");
				356	status = -EAGAIN;
				357	reason = RESTART_META;
				358	goto leave;
				359	}
				360
				361	status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
				362	&bit_off, &num_bits);
				363	if (status < 0) {
				364	if (status != -ENOSPC)
				365	mlog_errno(status);
				366	goto leave;
				367	}
				368
				369	BUG_ON(num_bits > clusters_to_add);
				370
				371	/* reserve our write early -- insert_extent may update the inode */
				372	status = ocfs2_journal_access(handle, inode, fe_bh,
				373	OCFS2_JOURNAL_ACCESS_WRITE);
				374	if (status < 0) {
				375	mlog_errno(status);
				376	goto leave;
				377	}
				378
				379	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
				380	mlog(0, "Allocating %u clusters at block %u for inode %"MLFu64"\n",
				381	num_bits, bit_off, OCFS2_I(inode)->ip_blkno);
				382	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
				383	num_bits, meta_ac);
				384	if (status < 0) {
				385	mlog_errno(status);
				386	goto leave;
				387	}
				388
				389	le32_add_cpu(&fe->i_clusters, num_bits);
				390	spin_lock(&OCFS2_I(inode)->ip_lock);
				391	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
				392	spin_unlock(&OCFS2_I(inode)->ip_lock);
				393
				394	status = ocfs2_journal_dirty(handle, fe_bh);
				395	if (status < 0) {
				396	mlog_errno(status);
				397	goto leave;
				398	}
				399
				400	clusters_to_add -= num_bits;
				401
				402	if (clusters_to_add) {
				403	mlog(0, "need to alloc once more, clusters = %u, wanted = "
				404	"%u\n", fe->i_clusters, clusters_to_add);
				405	status = -EAGAIN;
				406	reason = RESTART_TRANS;
				407	}
				408
				409	leave:
				410	mlog_exit(status);
				411	if (reason_ret)
				412	*reason_ret = reason;
				413	return status;
				414	}
				415
				416	static int ocfs2_extend_allocation(struct inode *inode,
				417	u32 clusters_to_add)
				418	{
				419	int status = 0;
				420	int restart_func = 0;
				421	int drop_alloc_sem = 0;
				422	int credits, num_free_extents;
				423	u32 prev_clusters;
				424	struct buffer_head *bh = NULL;
				425	struct ocfs2_dinode *fe = NULL;
				426	struct ocfs2_journal_handle *handle = NULL;
				427	struct ocfs2_alloc_context *data_ac = NULL;
				428	struct ocfs2_alloc_context *meta_ac = NULL;
				429	enum ocfs2_alloc_restarted why;
				430	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				431
				432	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
				433
				434	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
				435	OCFS2_BH_CACHED, inode);
				436	if (status < 0) {
				437	mlog_errno(status);
				438	goto leave;
				439	}
				440
				441	fe = (struct ocfs2_dinode *) bh->b_data;
				442	if (!OCFS2_IS_VALID_DINODE(fe)) {
				443	OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
				444	status = -EIO;
				445	goto leave;
				446	}
				447
				448	restart_all:
				449	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
				450
				451	mlog(0, "extend inode %"MLFu64", i_size = %lld, fe->i_clusters = %u, "
				452	"clusters_to_add = %u\n",
				453	OCFS2_I(inode)->ip_blkno, i_size_read(inode),
				454	fe->i_clusters, clusters_to_add);
				455
				456	handle = ocfs2_alloc_handle(osb);
				457	if (handle == NULL) {
				458	status = -ENOMEM;
				459	mlog_errno(status);
				460	goto leave;
				461	}
				462
				463	num_free_extents = ocfs2_num_free_extents(osb,
				464	inode,
				465	fe);
				466	if (num_free_extents < 0) {
				467	status = num_free_extents;
				468	mlog_errno(status);
				469	goto leave;
				470	}
				471
				472	if (!num_free_extents) {
				473	status = ocfs2_reserve_new_metadata(osb,
				474	handle,
				475	fe,
				476	&meta_ac);
				477	if (status < 0) {
				478	if (status != -ENOSPC)
				479	mlog_errno(status);
				480	goto leave;
				481	}
				482	}
				483
				484	status = ocfs2_reserve_clusters(osb,
				485	handle,
				486	clusters_to_add,
				487	&data_ac);
				488	if (status < 0) {
				489	if (status != -ENOSPC)
				490	mlog_errno(status);
				491	goto leave;
				492	}
				493
				494	/* blocks peope in read/write from reading our allocation
				495	* until we're done changing it. We depend on i_sem to block
				496	* other extend/truncate calls while we're here. Ordering wrt
				497	* start_trans is important here -- always do it before! */
				498	down_write(&OCFS2_I(inode)->ip_alloc_sem);
				499	drop_alloc_sem = 1;
				500
				501	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
				502	handle = ocfs2_start_trans(osb, handle, credits);
				503	if (IS_ERR(handle)) {
				504	status = PTR_ERR(handle);
				505	handle = NULL;
				506	mlog_errno(status);
				507	goto leave;
				508	}
				509
				510	restarted_transaction:
				511	/* reserve a write to the file entry early on - that we if we
				512	* run out of credits in the allocation path, we can still
				513	* update i_size. */
				514	status = ocfs2_journal_access(handle, inode, bh,
				515	OCFS2_JOURNAL_ACCESS_WRITE);
				516	if (status < 0) {
				517	mlog_errno(status);
				518	goto leave;
				519	}
				520
				521	prev_clusters = OCFS2_I(inode)->ip_clusters;
				522
				523	status = ocfs2_do_extend_allocation(osb,
				524	inode,
				525	clusters_to_add,
				526	bh,
				527	handle,
				528	data_ac,
				529	meta_ac,
				530	&why);
				531	if ((status < 0) && (status != -EAGAIN)) {
				532	if (status != -ENOSPC)
				533	mlog_errno(status);
				534	goto leave;
				535	}
				536
				537	status = ocfs2_journal_dirty(handle, bh);
				538	if (status < 0) {
				539	mlog_errno(status);
				540	goto leave;
				541	}
				542
				543	spin_lock(&OCFS2_I(inode)->ip_lock);
				544	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
				545	spin_unlock(&OCFS2_I(inode)->ip_lock);
				546
				547	if (why != RESTART_NONE && clusters_to_add) {
				548	if (why == RESTART_META) {
				549	mlog(0, "restarting function.\n");
				550	restart_func = 1;
				551	} else {
				552	BUG_ON(why != RESTART_TRANS);
				553
				554	mlog(0, "restarting transaction.\n");
				555	/* TODO: This can be more intelligent. */
				556	credits = ocfs2_calc_extend_credits(osb->sb,
				557	fe,
				558	clusters_to_add);
				559	status = ocfs2_extend_trans(handle, credits);
				560	if (status < 0) {
				561	/* handle still has to be committed at
				562	* this point. */
				563	status = -ENOMEM;
				564	mlog_errno(status);
				565	goto leave;
				566	}
				567	goto restarted_transaction;
				568	}
				569	}
				570
				571	mlog(0, "fe: i_clusters = %u, i_size=%"MLFu64"\n",
				572	fe->i_clusters, fe->i_size);
				573	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
				574	OCFS2_I(inode)->ip_clusters, i_size_read(inode));
				575
				576	leave:
				577	if (drop_alloc_sem) {
				578	up_write(&OCFS2_I(inode)->ip_alloc_sem);
				579	drop_alloc_sem = 0;
				580	}
				581	if (handle) {
				582	ocfs2_commit_trans(handle);
				583	handle = NULL;
				584	}
				585	if (data_ac) {
				586	ocfs2_free_alloc_context(data_ac);
				587	data_ac = NULL;
				588	}
				589	if (meta_ac) {
				590	ocfs2_free_alloc_context(meta_ac);
				591	meta_ac = NULL;
				592	}
				593	if ((!status) && restart_func) {
				594	restart_func = 0;
				595	goto restart_all;
				596	}
				597	if (bh) {
				598	brelse(bh);
				599	bh = NULL;
				600	}
				601
				602	mlog_exit(status);
				603	return status;
				604	}
				605
				606	/* Some parts of this taken from generic_cont_expand, which turned out
				607	* to be too fragile to do exactly what we need without us having to
				608	* worry about recursive locking in ->commit_write(). */
				609	static int ocfs2_write_zero_page(struct inode *inode,
				610	u64 size)
				611	{
				612	struct address_space *mapping = inode->i_mapping;
				613	struct page *page;
				614	unsigned long index;
				615	unsigned int offset;
				616	struct ocfs2_journal_handle *handle = NULL;
				617	int ret;
				618
				619	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
				620	/* ugh. in prepare/commit_write, if from==to==start of block, we
				621	** skip the prepare. make sure we never send an offset for the start
				622	** of a block
				623	*/
				624	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
				625	offset++;
				626	}
				627	index = size >> PAGE_CACHE_SHIFT;
				628
				629	page = grab_cache_page(mapping, index);
				630	if (!page) {
				631	ret = -ENOMEM;
				632	mlog_errno(ret);
				633	goto out;
				634	}
				635
				636	ret = ocfs2_prepare_write(NULL, page, offset, offset);
				637	if (ret < 0) {
				638	mlog_errno(ret);
				639	goto out_unlock;
				640	}
				641
				642	if (ocfs2_should_order_data(inode)) {
				643	handle = ocfs2_start_walk_page_trans(inode, page, offset,
				644	offset);
				645	if (IS_ERR(handle)) {
				646	ret = PTR_ERR(handle);
				647	handle = NULL;
				648	goto out_unlock;
				649	}
				650	}
				651
				652	/* must not update i_size! */
				653	ret = block_commit_write(page, offset, offset);
				654	if (ret < 0)
				655	mlog_errno(ret);
				656	else
				657	ret = 0;
				658
				659	if (handle)
				660	ocfs2_commit_trans(handle);
				661	out_unlock:
				662	unlock_page(page);
				663	page_cache_release(page);
				664	out:
				665	return ret;
				666	}
				667
				668	static int ocfs2_zero_extend(struct inode *inode,
				669	u64 zero_to_size)
				670	{
				671	int ret = 0;
				672	u64 start_off;
				673	struct super_block *sb = inode->i_sb;
				674
				675	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
				676	while (start_off < zero_to_size) {
				677	ret = ocfs2_write_zero_page(inode, start_off);
				678	if (ret < 0) {
				679	mlog_errno(ret);
				680	goto out;
				681	}
				682
				683	start_off += sb->s_blocksize;
				684	}
				685
				686	out:
				687	return ret;
				688	}
				689
				690	static int ocfs2_extend_file(struct inode *inode,
				691	struct buffer_head *di_bh,
				692	u64 new_i_size)
				693	{
				694	int ret = 0;
				695	u32 clusters_to_add;
				696
				697	/* setattr sometimes calls us like this. */
				698	if (new_i_size == 0)
				699	goto out;
				700
				701	if (i_size_read(inode) == new_i_size)
				702	goto out;
				703	BUG_ON(new_i_size < i_size_read(inode));
				704
				705	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
				706	OCFS2_I(inode)->ip_clusters;
				707
				708	if (clusters_to_add) {
				709	ret = ocfs2_extend_allocation(inode, clusters_to_add);
				710	if (ret < 0) {
				711	mlog_errno(ret);
				712	goto out;
				713	}
				714
				715	ret = ocfs2_zero_extend(inode, new_i_size);
				716	if (ret < 0) {
				717	mlog_errno(ret);
				718	goto out;
				719	}
				720	}
				721
				722	/* No allocation required, we just use this helper to
				723	* do a trivial update of i_size. */
				724	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
				725	if (ret < 0) {
				726	mlog_errno(ret);
				727	goto out;
				728	}
				729
				730	out:
				731	return ret;
				732	}
				733
				734	int ocfs2_setattr(struct dentry dentry, struct iattr attr)
				735	{
				736	int status = 0, size_change;
				737	struct inode *inode = dentry->d_inode;
				738	struct super_block *sb = inode->i_sb;
				739	struct ocfs2_super *osb = OCFS2_SB(sb);
				740	struct buffer_head *bh = NULL;
				741	struct ocfs2_journal_handle *handle = NULL;
				742
				743	mlog_entry("(0x%p, '%.*s')\n", dentry,
				744	dentry->d_name.len, dentry->d_name.name);
				745
				746	if (attr->ia_valid & ATTR_MODE)
				747	mlog(0, "mode change: %d\n", attr->ia_mode);
				748	if (attr->ia_valid & ATTR_UID)
				749	mlog(0, "uid change: %d\n", attr->ia_uid);
				750	if (attr->ia_valid & ATTR_GID)
				751	mlog(0, "gid change: %d\n", attr->ia_gid);
				752	if (attr->ia_valid & ATTR_SIZE)
				753	mlog(0, "size change...\n");
				754	if (attr->ia_valid & (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME))
				755	mlog(0, "time change...\n");
				756
				757	#define OCFS2_VALID_ATTRS (ATTR_ATIME \| ATTR_MTIME \| ATTR_CTIME \| ATTR_SIZE \
				758	\| ATTR_GID \| ATTR_UID \| ATTR_MODE)
				759	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
				760	mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
				761	return 0;
				762	}
				763
				764	status = inode_change_ok(inode, attr);
				765	if (status)
				766	return status;
				767
				768	size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
				769	if (size_change) {
				770	status = ocfs2_rw_lock(inode, 1);
				771	if (status < 0) {
				772	mlog_errno(status);
				773	goto bail;
				774	}
				775	}
				776
				777	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
				778	if (status < 0) {
				779	if (status != -ENOENT)
				780	mlog_errno(status);
				781	goto bail_unlock_rw;
				782	}
				783
				784	if (size_change && attr->ia_size != i_size_read(inode)) {
				785	if (i_size_read(inode) > attr->ia_size)
				786	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
				787	else
				788	status = ocfs2_extend_file(inode, bh, attr->ia_size);
				789	if (status < 0) {
				790	if (status != -ENOSPC)
				791	mlog_errno(status);
				792	status = -ENOSPC;
				793	goto bail_unlock;
				794	}
				795	}
				796
				797	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				798	if (IS_ERR(handle)) {
				799	status = PTR_ERR(handle);
				800	mlog_errno(status);
				801	goto bail_unlock;
				802	}
				803
				804	status = inode_setattr(inode, attr);
				805	if (status < 0) {
				806	mlog_errno(status);
				807	goto bail_commit;
				808	}
				809
				810	status = ocfs2_mark_inode_dirty(handle, inode, bh);
				811	if (status < 0)
				812	mlog_errno(status);
				813
				814	bail_commit:
				815	ocfs2_commit_trans(handle);
				816	bail_unlock:
				817	ocfs2_meta_unlock(inode, 1);
				818	bail_unlock_rw:
				819	if (size_change)
				820	ocfs2_rw_unlock(inode, 1);
				821	bail:
				822	if (bh)
				823	brelse(bh);
				824
				825	mlog_exit(status);
				826	return status;
				827	}
				828
				829	int ocfs2_getattr(struct vfsmount *mnt,
				830	struct dentry *dentry,
				831	struct kstat *stat)
				832	{
				833	struct inode *inode = dentry->d_inode;
				834	struct super_block *sb = dentry->d_inode->i_sb;
				835	struct ocfs2_super *osb = sb->s_fs_info;
				836	int err;
				837
				838	mlog_entry_void();
				839
				840	err = ocfs2_inode_revalidate(dentry);
				841	if (err) {
				842	if (err != -ENOENT)
				843	mlog_errno(err);
				844	goto bail;
				845	}
				846
				847	generic_fillattr(inode, stat);
				848
				849	/* We set the blksize from the cluster size for performance */
				850	stat->blksize = osb->s_clustersize;
				851
				852	bail:
				853	mlog_exit(err);
				854
				855	return err;
				856	}
				857
				858	static int ocfs2_write_remove_suid(struct inode *inode)
				859	{
				860	int ret;
				861	struct buffer_head *bh = NULL;
				862	struct ocfs2_inode_info *oi = OCFS2_I(inode);
				863	struct ocfs2_journal_handle *handle;
				864	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				865	struct ocfs2_dinode *di;
				866
				867	mlog_entry("(Inode %"MLFu64", mode 0%o)\n", oi->ip_blkno,
				868	inode->i_mode);
				869
				870	handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
				871	if (handle == NULL) {
				872	ret = -ENOMEM;
				873	mlog_errno(ret);
				874	goto out;
				875	}
				876
				877	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
				878	if (ret < 0) {
				879	mlog_errno(ret);
				880	goto out_trans;
				881	}
				882
				883	ret = ocfs2_journal_access(handle, inode, bh,
				884	OCFS2_JOURNAL_ACCESS_WRITE);
				885	if (ret < 0) {
				886	mlog_errno(ret);
				887	goto out_bh;
				888	}
				889
				890	inode->i_mode &= ~S_ISUID;
				891	if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
				892	inode->i_mode &= ~S_ISGID;
				893
				894	di = (struct ocfs2_dinode *) bh->b_data;
				895	di->i_mode = cpu_to_le16(inode->i_mode);
				896
				897	ret = ocfs2_journal_dirty(handle, bh);
				898	if (ret < 0)
				899	mlog_errno(ret);
				900	out_bh:
				901	brelse(bh);
				902	out_trans:
				903	ocfs2_commit_trans(handle);
				904	out:
				905	mlog_exit(ret);
				906	return ret;
				907	}
				908
				909	static inline int ocfs2_write_should_remove_suid(struct inode *inode)
				910	{
				911	mode_t mode = inode->i_mode;
				912
				913	if (!capable(CAP_FSETID)) {
				914	if (unlikely(mode & S_ISUID))
				915	return 1;
				916
				917	if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
				918	return 1;
				919	}
				920	return 0;
				921	}
				922
				923	static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
				924	const char __user *buf,
				925	size_t count,
				926	loff_t pos)
				927	{
				928	struct iovec local_iov = { .iov_base = (void __user *)buf,
				929	.iov_len = count };
				930	int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0;
				931	u32 clusters;
				932	struct file *filp = iocb->ki_filp;
				933	struct inode *inode = filp->f_dentry->d_inode;
				934	loff_t newsize, saved_pos;
				935	#ifdef OCFS2_ORACORE_WORKAROUNDS
				936	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				937	#endif
				938
				939	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
				940	(unsigned int)count,
				941	filp->f_dentry->d_name.len,
				942	filp->f_dentry->d_name.name);
				943
				944	/* happy write of zero bytes */
				945	if (count == 0)
				946	return 0;
				947
				948	if (!inode) {
				949	mlog(0, "bad inode\n");
				950	return -EIO;
				951	}
				952
				953	#ifdef OCFS2_ORACORE_WORKAROUNDS
				954	/* ugh, work around some applications which open everything O_DIRECT +
				955	* O_APPEND and really don't mean to use O_DIRECT. */
				956	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
				957	(filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
				958	filp->f_flags &= ~O_DIRECT;
				959	#endif
				960
				961	down(&inode->i_sem);
				962	/* to match setattr's i_sem -> i_alloc_sem -> rw_lock ordering */
				963	if (filp->f_flags & O_DIRECT) {
				964	have_alloc_sem = 1;
				965	down_read(&inode->i_alloc_sem);
				966	}
				967
				968	/* concurrent O_DIRECT writes are allowed */
				969	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
				970	ret = ocfs2_rw_lock(inode, rw_level);
				971	if (ret < 0) {
				972	rw_level = -1;
				973	mlog_errno(ret);
				974	goto out;
				975	}
				976
				977	/*
				978	* We sample i_size under a read level meta lock to see if our write
				979	* is extending the file, if it is we back off and get a write level
				980	* meta lock.
				981	*/
				982	meta_level = (filp->f_flags & O_APPEND) ? 1 : 0;
				983	for(;;) {
				984	ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level);
				985	if (ret < 0) {
				986	meta_level = -1;
				987	mlog_errno(ret);
				988	goto out;
				989	}
				990
				991	/* Clear suid / sgid if necessary. We do this here
				992	* instead of later in the write path because
				993	* remove_suid() calls ->setattr without any hint that
				994	* we may have already done our cluster locking. Since
				995	* ocfs2_setattr() must take cluster locks to
				996	* proceeed, this will lead us to recursively lock the
				997	* inode. There's also the dinode i_size state which
				998	* can be lost via setattr during extending writes (we
				999	* set inode->i_size at the end of a write. */
				1000	if (ocfs2_write_should_remove_suid(inode)) {
				1001	if (meta_level == 0) {
				1002	ocfs2_meta_unlock(inode, meta_level);
				1003	meta_level = 1;
				1004	continue;
				1005	}
				1006
				1007	ret = ocfs2_write_remove_suid(inode);
				1008	if (ret < 0) {
				1009	mlog_errno(ret);
				1010	goto out;
				1011	}
				1012	}
				1013
				1014	/* work on a copy of ppos until we're sure that we won't have
				1015	* to recalculate it due to relocking. */
				1016	if (filp->f_flags & O_APPEND) {
				1017	saved_pos = i_size_read(inode);
				1018	mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
				1019	} else {
				1020	saved_pos = iocb->ki_pos;
				1021	}
				1022	newsize = count + saved_pos;
				1023
				1024	mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
				1025	saved_pos, newsize, i_size_read(inode));
				1026
				1027	/* No need for a higher level metadata lock if we're
				1028	* never going past i_size. */
				1029	if (newsize <= i_size_read(inode))
				1030	break;
				1031
				1032	if (meta_level == 0) {
				1033	ocfs2_meta_unlock(inode, meta_level);
				1034	meta_level = 1;
				1035	continue;
				1036	}
				1037
				1038	spin_lock(&OCFS2_I(inode)->ip_lock);
				1039	clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
				1040	OCFS2_I(inode)->ip_clusters;
				1041	spin_unlock(&OCFS2_I(inode)->ip_lock);
				1042
				1043	mlog(0, "Writing at EOF, may need more allocation: "
				1044	"i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
				1045	i_size_read(inode), newsize, clusters);
				1046
				1047	/* We only want to continue the rest of this loop if
				1048	* our extend will actually require more
				1049	* allocation. */
				1050	if (!clusters)
				1051	break;
				1052
				1053	ret = ocfs2_extend_allocation(inode, clusters);
				1054	if (ret < 0) {
				1055	if (ret != -ENOSPC)
				1056	mlog_errno(ret);
				1057	goto out;
				1058	}
				1059
				1060	/* Fill any holes which would've been created by this
				1061	* write. If we're O_APPEND, this will wind up
				1062	* (correctly) being a noop. */
				1063	ret = ocfs2_zero_extend(inode, (u64) newsize - count);
				1064	if (ret < 0) {
				1065	mlog_errno(ret);
				1066	goto out;
				1067	}
				1068	break;
				1069	}
				1070
				1071	/* ok, we're done with i_size and alloc work */
				1072	iocb->ki_pos = saved_pos;
				1073	ocfs2_meta_unlock(inode, meta_level);
				1074	meta_level = -1;
				1075
				1076	/* communicate with ocfs2_dio_end_io */
				1077	ocfs2_iocb_set_rw_locked(iocb);
				1078
				1079	#ifdef OCFS2_ORACORE_WORKAROUNDS
				1080	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
				1081	filp->f_flags & O_DIRECT) {
				1082	unsigned int saved_flags = filp->f_flags;
				1083	int sector_size = 1 << osb->s_sectsize_bits;
				1084
				1085	if ((saved_pos & (sector_size - 1)) \|\|
				1086	(count & (sector_size - 1)) \|\|
				1087	((unsigned long)buf & (sector_size - 1))) {
				1088	filp->f_flags \|= O_SYNC;
				1089	filp->f_flags &= ~O_DIRECT;
				1090	}
				1091
				1092	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
				1093	&iocb->ki_pos);
				1094
				1095	filp->f_flags = saved_flags;
				1096	} else
				1097	#endif
				1098	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
				1099	&iocb->ki_pos);
				1100
				1101	/* buffered aio wouldn't have proper lock coverage today */
				1102	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1103
				1104	/*
				1105	* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
				1106	* function pointer which is called when o_direct io completes so that
				1107	* it can unlock our rw lock. (it's the clustered equivalent of
				1108	* i_alloc_sem; protects truncate from racing with pending ios).
				1109	* Unfortunately there are error cases which call end_io and others
				1110	* that don't. so we don't have to unlock the rw_lock if either an
				1111	* async dio is going to do it in the future or an end_io after an
				1112	* error has already done it.
				1113	*/
				1114	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1115	rw_level = -1;
				1116	have_alloc_sem = 0;
				1117	}
				1118
				1119	out:
				1120	if (meta_level != -1)
				1121	ocfs2_meta_unlock(inode, meta_level);
				1122	if (have_alloc_sem)
				1123	up_read(&inode->i_alloc_sem);
				1124	if (rw_level != -1)
				1125	ocfs2_rw_unlock(inode, rw_level);
				1126	up(&inode->i_sem);
				1127
				1128	mlog_exit(ret);
				1129	return ret;
				1130	}
				1131
				1132	static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
				1133	char __user *buf,
				1134	size_t count,
				1135	loff_t pos)
				1136	{
				1137	int ret = 0, rw_level = -1, have_alloc_sem = 0;
				1138	struct file *filp = iocb->ki_filp;
				1139	struct inode *inode = filp->f_dentry->d_inode;
				1140	#ifdef OCFS2_ORACORE_WORKAROUNDS
				1141	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
				1142	#endif
				1143
				1144	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
				1145	(unsigned int)count,
				1146	filp->f_dentry->d_name.len,
				1147	filp->f_dentry->d_name.name);
				1148
				1149	if (!inode) {
				1150	ret = -EINVAL;
				1151	mlog_errno(ret);
				1152	goto bail;
				1153	}
				1154
				1155	#ifdef OCFS2_ORACORE_WORKAROUNDS
				1156	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
				1157	if (filp->f_flags & O_DIRECT) {
				1158	int sector_size = 1 << osb->s_sectsize_bits;
				1159
				1160	if ((pos & (sector_size - 1)) \|\|
				1161	(count & (sector_size - 1)) \|\|
				1162	((unsigned long)buf & (sector_size - 1)) \|\|
				1163	(i_size_read(inode) & (sector_size -1))) {
				1164	filp->f_flags &= ~O_DIRECT;
				1165	}
				1166	}
				1167	}
				1168	#endif
				1169
				1170	/*
				1171	* buffered reads protect themselves in ->readpage(). O_DIRECT reads
				1172	* need locks to protect pending reads from racing with truncate.
				1173	*/
				1174	if (filp->f_flags & O_DIRECT) {
				1175	down_read(&inode->i_alloc_sem);
				1176	have_alloc_sem = 1;
				1177
				1178	ret = ocfs2_rw_lock(inode, 0);
				1179	if (ret < 0) {
				1180	mlog_errno(ret);
				1181	goto bail;
				1182	}
				1183	rw_level = 0;
				1184	/* communicate with ocfs2_dio_end_io */
				1185	ocfs2_iocb_set_rw_locked(iocb);
				1186	}
				1187
				1188	ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos);
				1189	if (ret == -EINVAL)
				1190	mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
				1191
				1192	/* buffered aio wouldn't have proper lock coverage today */
				1193	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
				1194
				1195	/* see ocfs2_file_aio_write */
				1196	if (ret == -EIOCBQUEUED \|\| !ocfs2_iocb_is_rw_locked(iocb)) {
				1197	rw_level = -1;
				1198	have_alloc_sem = 0;
				1199	}
				1200
				1201	bail:
				1202	if (have_alloc_sem)
				1203	up_read(&inode->i_alloc_sem);
				1204	if (rw_level != -1)
				1205	ocfs2_rw_unlock(inode, rw_level);
				1206	mlog_exit(ret);
				1207
				1208	return ret;
				1209	}
				1210
				1211	struct inode_operations ocfs2_file_iops = {
				1212	.setattr = ocfs2_setattr,
				1213	.getattr = ocfs2_getattr,
				1214	};
				1215
				1216	struct inode_operations ocfs2_special_file_iops = {
				1217	.setattr = ocfs2_setattr,
				1218	.getattr = ocfs2_getattr,
				1219	};
				1220
				1221	struct file_operations ocfs2_fops = {
				1222	.read = do_sync_read,
				1223	.write = do_sync_write,
				1224	.sendfile = generic_file_sendfile,
				1225	.mmap = ocfs2_mmap,
				1226	.fsync = ocfs2_sync_file,
				1227	.release = ocfs2_file_release,
				1228	.open = ocfs2_file_open,
				1229	.aio_read = ocfs2_file_aio_read,
				1230	.aio_write = ocfs2_file_aio_write,
				1231	};
				1232
				1233	struct file_operations ocfs2_dops = {
				1234	.read = generic_read_dir,
				1235	.readdir = ocfs2_readdir,
				1236	.fsync = ocfs2_sync_file,
				1237	};