Blame - fs/buffer.c - SHIFTPHONES/kernel/shift/mainline

blob: 08e422d5699674c246bf614d62fe826b6b0aaf43 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/buffer.c
				3	*
				4	* Copyright (C) 1991, 1992, 2002 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
				9	*
				10	* Removed a lot of unnecessary code and simplified things now that
				11	* the buffer cache isn't our primary cache - Andrew Tridgell 12/96
				12	*
				13	* Speed up hash, lru, and free list operations. Use gfp() for allocating
				14	* hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
				15	*
				16	* Added 32k buffer block sizes - these are required older ARM systems. - RMK
				17	*
				18	* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
				19	*/
				20
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	#include <linux/kernel.h>
				22	#include <linux/syscalls.h>
				23	#include <linux/fs.h>
				24	#include <linux/mm.h>
				25	#include <linux/percpu.h>
				26	#include <linux/slab.h>
Randy Dunlap	16f7e0f	2006-01-11 12:17:46 -0800	[diff] [blame]	27	#include <linux/capability.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	28	#include <linux/blkdev.h>
				29	#include <linux/file.h>
				30	#include <linux/quotaops.h>
				31	#include <linux/highmem.h>
				32	#include <linux/module.h>
				33	#include <linux/writeback.h>
				34	#include <linux/hash.h>
				35	#include <linux/suspend.h>
				36	#include <linux/buffer_head.h>
Andrew Morton	55e829a	2006-12-10 02:19:27 -0800	[diff] [blame]	37	#include <linux/task_io_accounting_ops.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	38	#include <linux/bio.h>
				39	#include <linux/notifier.h>
				40	#include <linux/cpu.h>
				41	#include <linux/bitops.h>
				42	#include <linux/mpage.h>
Ingo Molnar	fb1c8f9	2005-09-10 00:25:56 -0700	[diff] [blame]	43	#include <linux/bit_spinlock.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	44
				45	static int fsync_buffers_list(spinlock_t lock, struct list_head list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	46
				47	#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
				48
				49	inline void
				50	init_buffer(struct buffer_head bh, bh_end_io_t handler, void *private)
				51	{
				52	bh->b_end_io = handler;
				53	bh->b_private = private;
				54	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	55	EXPORT_SYMBOL(init_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	56
				57	static int sync_buffer(void *word)
				58	{
				59	struct block_device *bd;
				60	struct buffer_head *bh
				61	= container_of(word, struct buffer_head, b_state);
				62
				63	smp_mb();
				64	bd = bh->b_bdev;
				65	if (bd)
				66	blk_run_address_space(bd->bd_inode->i_mapping);
				67	io_schedule();
				68	return 0;
				69	}
				70
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	71	void __lock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72	{
				73	wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
				74	TASK_UNINTERRUPTIBLE);
				75	}
				76	EXPORT_SYMBOL(__lock_buffer);
				77
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	78	void unlock_buffer(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	79	{
Nick Piggin	51b07fc	2008-10-18 20:27:00 -0700	[diff] [blame]	80	clear_bit_unlock(BH_Lock, &bh->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	81	smp_mb__after_clear_bit();
				82	wake_up_bit(&bh->b_state, BH_Lock);
				83	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	84	EXPORT_SYMBOL(unlock_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	85
				86	/*
				87	* Block until a buffer comes unlocked. This doesn't stop it
				88	* from becoming locked again - you have to lock it yourself
				89	* if you want to preserve its state.
				90	*/
				91	void __wait_on_buffer(struct buffer_head * bh)
				92	{
				93	wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
				94	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	95	EXPORT_SYMBOL(__wait_on_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	96
				97	static void
				98	__clear_page_buffers(struct page *page)
				99	{
				100	ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	101	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	102	page_cache_release(page);
				103	}
				104
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	105
				106	static int quiet_error(struct buffer_head *bh)
				107	{
				108	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
				109	return 0;
				110	return 1;
				111	}
				112
				113
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	114	static void buffer_io_error(struct buffer_head *bh)
				115	{
				116	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	117	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
				118	bdevname(bh->b_bdev, b),
				119	(unsigned long long)bh->b_blocknr);
				120	}
				121
				122	/*
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	123	* End-of-IO handler helper function which does not touch the bh after
				124	* unlocking it.
				125	* Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
				126	* a race there is benign: unlock_buffer() only use the bh's address for
				127	* hashing after unlocking the buffer, so it doesn't actually touch the bh
				128	* itself.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	129	*/
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	130	static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	131	{
				132	if (uptodate) {
				133	set_buffer_uptodate(bh);
				134	} else {
				135	/* This happens, due to failed READA attempts. */
				136	clear_buffer_uptodate(bh);
				137	}
				138	unlock_buffer(bh);
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	139	}
				140
				141	/*
				142	* Default synchronous end-of-IO handler.. Just mark it up-to-date and
				143	* unlock the buffer. This is what ll_rw_block uses too.
				144	*/
				145	void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
				146	{
				147	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	148	put_bh(bh);
				149	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	150	EXPORT_SYMBOL(end_buffer_read_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	151
				152	void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				153	{
				154	char b[BDEVNAME_SIZE];
				155
				156	if (uptodate) {
				157	set_buffer_uptodate(bh);
				158	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	159	if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	160	buffer_io_error(bh);
				161	printk(KERN_WARNING "lost page write due to "
				162	"I/O error on %s\n",
				163	bdevname(bh->b_bdev, b));
				164	}
				165	set_buffer_write_io_error(bh);
				166	clear_buffer_uptodate(bh);
				167	}
				168	unlock_buffer(bh);
				169	put_bh(bh);
				170	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	171	EXPORT_SYMBOL(end_buffer_write_sync);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	172
				173	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	174	* Various filesystems appear to want __find_get_block to be non-blocking.
				175	* But it's the page lock which protects the buffers. To get around this,
				176	* we get exclusion from try_to_free_buffers with the blockdev mapping's
				177	* private_lock.
				178	*
				179	* Hack idea: for the blockdev mapping, i_bufferlist_lock contention
				180	* may be quite high. This code could TryLock the page, and if that
				181	* succeeds, there is no need to take private_lock. (But if
				182	* private_lock is contended then so is mapping->tree_lock).
				183	*/
				184	static struct buffer_head *
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	185	__find_get_block_slow(struct block_device *bdev, sector_t block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	186	{
				187	struct inode *bd_inode = bdev->bd_inode;
				188	struct address_space *bd_mapping = bd_inode->i_mapping;
				189	struct buffer_head *ret = NULL;
				190	pgoff_t index;
				191	struct buffer_head *bh;
				192	struct buffer_head *head;
				193	struct page *page;
				194	int all_mapped = 1;
				195
				196	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
				197	page = find_get_page(bd_mapping, index);
				198	if (!page)
				199	goto out;
				200
				201	spin_lock(&bd_mapping->private_lock);
				202	if (!page_has_buffers(page))
				203	goto out_unlock;
				204	head = page_buffers(page);
				205	bh = head;
				206	do {
Nikanth Karthikesan	97f76d3	2009-04-02 16:56:46 -0700	[diff] [blame]	207	if (!buffer_mapped(bh))
				208	all_mapped = 0;
				209	else if (bh->b_blocknr == block) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	210	ret = bh;
				211	get_bh(bh);
				212	goto out_unlock;
				213	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	214	bh = bh->b_this_page;
				215	} while (bh != head);
				216
				217	/* we might be here because some of the buffers on this page are
				218	* not mapped. This is due to various races between
				219	* file io on the block device and getblk. It gets dealt with
				220	* elsewhere, don't buffer_error if we had some unmapped buffers
				221	*/
				222	if (all_mapped) {
				223	printk("__find_get_block_slow() failed. "
				224	"block=%llu, b_blocknr=%llu\n",
Badari Pulavarty	205f87f	2006-03-26 01:38:00 -0800	[diff] [blame]	225	(unsigned long long)block,
				226	(unsigned long long)bh->b_blocknr);
				227	printk("b_state=0x%08lx, b_size=%zu\n",
				228	bh->b_state, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	229	printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
				230	}
				231	out_unlock:
				232	spin_unlock(&bd_mapping->private_lock);
				233	page_cache_release(page);
				234	out:
				235	return ret;
				236	}
				237
				238	/* If invalidate_buffers() will trash dirty buffers, it means some kind
				239	of fs corruption is going on. Trashing dirty data always imply losing
				240	information that was supposed to be just stored on the physical layer
				241	by the user.
				242
				243	Thus invalidate_buffers in general usage is not allwowed to trash
				244	dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
				245	be preserved. These buffers are simply skipped.
				246
				247	We also skip buffers which are still in use. For example this can
				248	happen if a userspace program is reading the block device.
				249
				250	NOTE: In the case where the user removed a removable-media-disk even if
				251	there's still dirty data not synced on disk (due a bug in the device driver
				252	or due an error of the user), by not destroying the dirty buffers we could
				253	generate corruption also on the next media inserted, thus a parameter is
				254	necessary to handle this case in the most safe way possible (trying
				255	to not corrupt also the new disk inserted with the data belonging to
				256	the old now corrupted disk). Also for the ramdisk the natural thing
				257	to do in order to release the ramdisk memory is to destroy dirty buffers.
				258
				259	These are two special cases. Normal usage imply the device driver
				260	to issue a sync on the device (without waiting I/O completion) and
				261	then an invalidate_buffers call that doesn't trash dirty buffers.
				262
				263	For handling cache coherency with the blkdev pagecache the 'update' case
				264	is been introduced. It is needed to re-read from disk any pinned
				265	buffer. NOTE: re-reading from disk is destructive so we can do it only
				266	when we assume nobody is changing the buffercache under our I/O and when
				267	we think the disk contains more recent information than the buffercache.
				268	The update == 1 pass marks the buffers we need to update, the update == 2
				269	pass does the actual I/O. */
Peter Zijlstra	f98393a	2007-05-06 14:49:54 -0700	[diff] [blame]	270	void invalidate_bdev(struct block_device *bdev)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	271	{
Andrew Morton	0e1dfc6	2006-07-30 03:03:28 -0700	[diff] [blame]	272	struct address_space *mapping = bdev->bd_inode->i_mapping;
				273
				274	if (mapping->nrpages == 0)
				275	return;
				276
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	277	invalidate_bh_lrus();
Tejun Heo	fa4b907	2010-05-15 20:09:27 +0200	[diff] [blame^]	278	lru_add_drain_all(); /* make sure all lru add caches are flushed */
Andrew Morton	fc0ecff	2007-02-10 01:45:39 -0800	[diff] [blame]	279	invalidate_mapping_pages(mapping, 0, -1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	280	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	281	EXPORT_SYMBOL(invalidate_bdev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	282
				283	/*
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	284	* Kick the writeback threads then try to free up some ZONE_NORMAL memory.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	285	*/
				286	static void free_more_memory(void)
				287	{
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	288	struct zone *zone;
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	289	int nid;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	290
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	291	wakeup_flusher_threads(1024);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	292	yield();
				293
Mel Gorman	0e88460	2008-04-28 02:12:14 -0700	[diff] [blame]	294	for_each_online_node(nid) {
Mel Gorman	19770b3	2008-04-28 02:12:18 -0700	[diff] [blame]	295	(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
				296	gfp_zone(GFP_NOFS), NULL,
				297	&zone);
				298	if (zone)
Mel Gorman	54a6eb5	2008-04-28 02:12:16 -0700	[diff] [blame]	299	try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
KAMEZAWA Hiroyuki	327c0e9	2009-03-31 15:23:31 -0700	[diff] [blame]	300	GFP_NOFS, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	301	}
				302	}
				303
				304	/*
				305	* I/O completion handler for block_read_full_page() - pages
				306	* which come unlocked at the end of I/O.
				307	*/
				308	static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
				309	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	310	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	311	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	312	struct buffer_head *tmp;
				313	struct page *page;
				314	int page_uptodate = 1;
				315
				316	BUG_ON(!buffer_async_read(bh));
				317
				318	page = bh->b_page;
				319	if (uptodate) {
				320	set_buffer_uptodate(bh);
				321	} else {
				322	clear_buffer_uptodate(bh);
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	323	if (!quiet_error(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	324	buffer_io_error(bh);
				325	SetPageError(page);
				326	}
				327
				328	/*
				329	* Be _very_ careful from here on. Bad things can happen if
				330	* two buffer heads end IO at almost the same time and both
				331	* decide that the page is now completely done.
				332	*/
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	333	first = page_buffers(page);
				334	local_irq_save(flags);
				335	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	336	clear_buffer_async_read(bh);
				337	unlock_buffer(bh);
				338	tmp = bh;
				339	do {
				340	if (!buffer_uptodate(tmp))
				341	page_uptodate = 0;
				342	if (buffer_async_read(tmp)) {
				343	BUG_ON(!buffer_locked(tmp));
				344	goto still_busy;
				345	}
				346	tmp = tmp->b_this_page;
				347	} while (tmp != bh);
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	348	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				349	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	350
				351	/*
				352	* If none of the buffers had errors and they are all
				353	* uptodate then we can set the page uptodate.
				354	*/
				355	if (page_uptodate && !PageError(page))
				356	SetPageUptodate(page);
				357	unlock_page(page);
				358	return;
				359
				360	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	361	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				362	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	363	return;
				364	}
				365
				366	/*
				367	* Completion handler for block_write_full_page() - pages which are unlocked
				368	* during I/O, and which have PageWriteback cleared upon I/O completion.
				369	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	370	void end_buffer_async_write(struct buffer_head *bh, int uptodate)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	371	{
				372	char b[BDEVNAME_SIZE];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	373	unsigned long flags;
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	374	struct buffer_head *first;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	375	struct buffer_head *tmp;
				376	struct page *page;
				377
				378	BUG_ON(!buffer_async_write(bh));
				379
				380	page = bh->b_page;
				381	if (uptodate) {
				382	set_buffer_uptodate(bh);
				383	} else {
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	384	if (!quiet_error(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	385	buffer_io_error(bh);
				386	printk(KERN_WARNING "lost page write due to "
				387	"I/O error on %s\n",
				388	bdevname(bh->b_bdev, b));
				389	}
				390	set_bit(AS_EIO, &page->mapping->flags);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	391	set_buffer_write_io_error(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	392	clear_buffer_uptodate(bh);
				393	SetPageError(page);
				394	}
				395
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	396	first = page_buffers(page);
				397	local_irq_save(flags);
				398	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
				399
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	400	clear_buffer_async_write(bh);
				401	unlock_buffer(bh);
				402	tmp = bh->b_this_page;
				403	while (tmp != bh) {
				404	if (buffer_async_write(tmp)) {
				405	BUG_ON(!buffer_locked(tmp));
				406	goto still_busy;
				407	}
				408	tmp = tmp->b_this_page;
				409	}
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	410	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				411	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	412	end_page_writeback(page);
				413	return;
				414
				415	still_busy:
Nick Piggin	a397220	2005-07-07 17:56:56 -0700	[diff] [blame]	416	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
				417	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	418	return;
				419	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	420	EXPORT_SYMBOL(end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	421
				422	/*
				423	* If a page's buffers are under async readin (end_buffer_async_read
				424	* completion) then there is a possibility that another thread of
				425	* control could lock one of the buffers after it has completed
				426	* but while some of the other buffers have not completed. This
				427	* locked buffer would confuse end_buffer_async_read() into not unlocking
				428	* the page. So the absence of BH_Async_Read tells end_buffer_async_read()
				429	* that this buffer is not under async I/O.
				430	*
				431	* The page comes unlocked when it has no locked buffer_async buffers
				432	* left.
				433	*
				434	* PageLocked prevents anyone starting new async I/O reads any of
				435	* the buffers.
				436	*
				437	* PageWriteback is used to prevent simultaneous writeout of the same
				438	* page.
				439	*
				440	* PageLocked prevents anyone from starting writeback of a page which is
				441	* under read I/O (PageWriteback is only ever set against a locked page).
				442	*/
				443	static void mark_buffer_async_read(struct buffer_head *bh)
				444	{
				445	bh->b_end_io = end_buffer_async_read;
				446	set_buffer_async_read(bh);
				447	}
				448
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	449	static void mark_buffer_async_write_endio(struct buffer_head *bh,
				450	bh_end_io_t *handler)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	451	{
				452	bh->b_end_io = handler;
				453	set_buffer_async_write(bh);
				454	}
				455
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	456	void mark_buffer_async_write(struct buffer_head *bh)
				457	{
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	458	mark_buffer_async_write_endio(bh, end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	459	}
				460	EXPORT_SYMBOL(mark_buffer_async_write);
				461
				462
				463	/*
				464	* fs/buffer.c contains helper functions for buffer-backed address space's
				465	* fsync functions. A common requirement for buffer-based filesystems is
				466	* that certain data from the backing blockdev needs to be written out for
				467	* a successful fsync(). For example, ext2 indirect blocks need to be
				468	* written back and waited upon before fsync() returns.
				469	*
				470	* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
				471	* inode_has_buffers() and invalidate_inode_buffers() are provided for the
				472	* management of a list of dependent buffers at ->i_mapping->private_list.
				473	*
				474	* Locking is a little subtle: try_to_free_buffers() will remove buffers
				475	* from their controlling inode's queue when they are being freed. But
				476	* try_to_free_buffers() will be operating against the blockdev mapping
				477	* at the time, not against the S_ISREG file which depends on those buffers.
				478	* So the locking for private_list is via the private_lock in the address_space
				479	* which backs the buffers. Which is different from the address_space
				480	* against which the buffers are listed. So for a particular address_space,
				481	* mapping->private_lock does not protect mapping->private_list! In fact,
				482	* mapping->private_list will always be protected by the backing blockdev's
				483	* ->private_lock.
				484	*
				485	* Which introduces a requirement: all buffers on an address_space's
				486	* ->private_list must be from the same address_space: the blockdev's.
				487	*
				488	* address_spaces which do not place buffers at ->private_list via these
				489	* utility functions are free to use private_lock and private_list for
				490	* whatever they want. The only requirement is that list_empty(private_list)
				491	* be true at clear_inode() time.
				492	*
				493	* FIXME: clear_inode should not call invalidate_inode_buffers(). The
				494	* filesystems should do that. invalidate_inode_buffers() should just go
				495	* BUG_ON(!list_empty).
				496	*
				497	* FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
				498	* take an address_space, not an inode. And it should be called
				499	* mark_buffer_dirty_fsync() to clearly define why those buffers are being
				500	* queued up.
				501	*
				502	* FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
				503	* list if it is already on a list. Because if the buffer is on a list,
				504	* it must already be on the right one. If not, the filesystem is being
				505	* silly. This will save a ton of locking. But first we have to ensure
				506	* that buffers are taken off the old inode's list when they are freed
				507	* (presumably in truncate). That requires careful auditing of all
				508	* filesystems (do it inside bforget()). It could also be done by bringing
				509	* b_inode back.
				510	*/
				511
				512	/*
				513	* The buffer's backing address_space's private_lock must be held
				514	*/
Thomas Petazzoni	dbacefc	2008-07-29 22:33:47 -0700	[diff] [blame]	515	static void __remove_assoc_queue(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	516	{
				517	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	518	WARN_ON(!bh->b_assoc_map);
				519	if (buffer_write_io_error(bh))
				520	set_bit(AS_EIO, &bh->b_assoc_map->flags);
				521	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	522	}
				523
				524	int inode_has_buffers(struct inode *inode)
				525	{
				526	return !list_empty(&inode->i_data.private_list);
				527	}
				528
				529	/*
				530	* osync is designed to support O_SYNC io. It waits synchronously for
				531	* all already-submitted IO to complete, but does not queue any new
				532	* writes to the disk.
				533	*
				534	* To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
				535	* you dirty the buffers, and then use osync_inode_buffers to wait for
				536	* completion. Any other dirty buffers which are not yet queued for
				537	* write will not be flushed to disk by the osync.
				538	*/
				539	static int osync_buffers_list(spinlock_t lock, struct list_head list)
				540	{
				541	struct buffer_head *bh;
				542	struct list_head *p;
				543	int err = 0;
				544
				545	spin_lock(lock);
				546	repeat:
				547	list_for_each_prev(p, list) {
				548	bh = BH_ENTRY(p);
				549	if (buffer_locked(bh)) {
				550	get_bh(bh);
				551	spin_unlock(lock);
				552	wait_on_buffer(bh);
				553	if (!buffer_uptodate(bh))
				554	err = -EIO;
				555	brelse(bh);
				556	spin_lock(lock);
				557	goto repeat;
				558	}
				559	}
				560	spin_unlock(lock);
				561	return err;
				562	}
				563
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	564	static void do_thaw_all(struct work_struct *work)
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	565	{
				566	struct super_block *sb;
				567	char b[BDEVNAME_SIZE];
				568
				569	spin_lock(&sb_lock);
				570	restart:
				571	list_for_each_entry(sb, &super_blocks, s_list) {
				572	sb->s_count++;
				573	spin_unlock(&sb_lock);
				574	down_read(&sb->s_umount);
				575	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
				576	printk(KERN_WARNING "Emergency Thaw on %s\n",
				577	bdevname(sb->s_bdev, b));
				578	up_read(&sb->s_umount);
				579	spin_lock(&sb_lock);
				580	if (__put_super_and_need_restart(sb))
				581	goto restart;
				582	}
				583	spin_unlock(&sb_lock);
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	584	kfree(work);
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	585	printk(KERN_WARNING "Emergency Thaw complete\n");
				586	}
				587
				588	/**
				589	* emergency_thaw_all -- forcibly thaw every frozen filesystem
				590	*
				591	* Used for emergency unfreeze of all filesystems via SysRq
				592	*/
				593	void emergency_thaw_all(void)
				594	{
Jens Axboe	053c525	2009-04-08 13:44:08 +0200	[diff] [blame]	595	struct work_struct *work;
				596
				597	work = kmalloc(sizeof(*work), GFP_ATOMIC);
				598	if (work) {
				599	INIT_WORK(work, do_thaw_all);
				600	schedule_work(work);
				601	}
Eric Sandeen	c2d7543	2009-03-31 15:23:46 -0700	[diff] [blame]	602	}
				603
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	604	/**
Randy Dunlap	78a4a50	2008-02-29 22:02:31 -0800	[diff] [blame]	605	* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	606	* @mapping: the mapping which wants those buffers written
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	607	*
				608	* Starts I/O against the buffers at mapping->private_list, and waits upon
				609	* that I/O.
				610	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	611	* Basically, this is a convenience function for fsync().
				612	* @mapping is a file or directory which needs those buffers to be written for
				613	* a successful fsync().
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	614	*/
				615	int sync_mapping_buffers(struct address_space *mapping)
				616	{
				617	struct address_space *buffer_mapping = mapping->assoc_mapping;
				618
				619	if (buffer_mapping == NULL \|\| list_empty(&mapping->private_list))
				620	return 0;
				621
				622	return fsync_buffers_list(&buffer_mapping->private_lock,
				623	&mapping->private_list);
				624	}
				625	EXPORT_SYMBOL(sync_mapping_buffers);
				626
				627	/*
				628	* Called when we've recently written block `bblock', and it is known that
				629	* `bblock' was for a buffer_boundary() buffer. This means that the block at
				630	* `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
				631	* dirty, schedule it for IO. So that indirects merge nicely with their data.
				632	*/
				633	void write_boundary_block(struct block_device *bdev,
				634	sector_t bblock, unsigned blocksize)
				635	{
				636	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
				637	if (bh) {
				638	if (buffer_dirty(bh))
				639	ll_rw_block(WRITE, 1, &bh);
				640	put_bh(bh);
				641	}
				642	}
				643
				644	void mark_buffer_dirty_inode(struct buffer_head bh, struct inode inode)
				645	{
				646	struct address_space *mapping = inode->i_mapping;
				647	struct address_space *buffer_mapping = bh->b_page->mapping;
				648
				649	mark_buffer_dirty(bh);
				650	if (!mapping->assoc_mapping) {
				651	mapping->assoc_mapping = buffer_mapping;
				652	} else {
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	653	BUG_ON(mapping->assoc_mapping != buffer_mapping);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	654	}
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	655	if (!bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	656	spin_lock(&buffer_mapping->private_lock);
				657	list_move_tail(&bh->b_assoc_buffers,
				658	&mapping->private_list);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	659	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	660	spin_unlock(&buffer_mapping->private_lock);
				661	}
				662	}
				663	EXPORT_SYMBOL(mark_buffer_dirty_inode);
				664
				665	/*
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	666	* Mark the page dirty, and set it dirty in the radix tree, and mark the inode
				667	* dirty.
				668	*
				669	* If warn is true, then emit a warning if the page is not uptodate and has
				670	* not been truncated.
				671	*/
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	672	static void __set_page_dirty(struct page *page,
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	673	struct address_space *mapping, int warn)
				674	{
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	675	spin_lock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	676	if (page->mapping) { /* Race with truncate? */
				677	WARN_ON_ONCE(warn && !PageUptodate(page));
Edward Shishkin	e3a7cca	2009-03-31 15:19:39 -0700	[diff] [blame]	678	account_page_dirtied(page, mapping);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	679	radix_tree_tag_set(&mapping->page_tree,
				680	page_index(page), PAGECACHE_TAG_DIRTY);
				681	}
Nick Piggin	19fd623	2008-07-25 19:45:32 -0700	[diff] [blame]	682	spin_unlock_irq(&mapping->tree_lock);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	683	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	684	}
				685
				686	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	687	* Add a page to the dirty page list.
				688	*
				689	* It is a sad fact of life that this function is called from several places
				690	* deeply under spinlocking. It may not sleep.
				691	*
				692	* If the page has buffers, the uptodate buffers are set dirty, to preserve
				693	* dirty-state coherency between the page and the buffers. It the page does
				694	* not have buffers then when they are later attached they will all be set
				695	* dirty.
				696	*
				697	* The buffers are dirtied before the page is dirtied. There's a small race
				698	* window in which a writepage caller may see the page cleanness but not the
				699	* buffer dirtiness. That's fine. If this code were to set the page dirty
				700	* before the buffers, a concurrent writepage caller could clear the page dirty
				701	* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
				702	* page on the dirty page list.
				703	*
				704	* We use private_lock to lock against try_to_free_buffers while using the
				705	* page's buffer list. Also use this to protect against clean buffers being
				706	* added to the page after it was set dirty.
				707	*
				708	* FIXME: may need to call ->reservepage here as well. That's rather up to the
				709	* address_space though.
				710	*/
				711	int __set_page_dirty_buffers(struct page *page)
				712	{
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	713	int newly_dirty;
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	714	struct address_space *mapping = page_mapping(page);
Nick Piggin	ebf7a22	2006-10-10 04:36:54 +0200	[diff] [blame]	715
				716	if (unlikely(!mapping))
				717	return !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	718
				719	spin_lock(&mapping->private_lock);
				720	if (page_has_buffers(page)) {
				721	struct buffer_head *head = page_buffers(page);
				722	struct buffer_head *bh = head;
				723
				724	do {
				725	set_buffer_dirty(bh);
				726	bh = bh->b_this_page;
				727	} while (bh != head);
				728	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	729	newly_dirty = !TestSetPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	730	spin_unlock(&mapping->private_lock);
				731
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	732	if (newly_dirty)
				733	__set_page_dirty(page, mapping, 1);
				734	return newly_dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	735	}
				736	EXPORT_SYMBOL(__set_page_dirty_buffers);
				737
				738	/*
				739	* Write out and wait upon a list of buffers.
				740	*
				741	* We have conflicting pressures: we want to make sure that all
				742	* initially dirty buffers get waited on, but that any subsequently
				743	* dirtied buffers don't. After all, we don't want fsync to last
				744	* forever if somebody is actively writing to the file.
				745	*
				746	* Do this in two main stages: first we copy dirty buffers to a
				747	* temporary inode list, queueing the writes as we go. Then we clean
				748	* up, waiting for those writes to complete.
				749	*
				750	* During this second stage, any subsequent updates to the file may end
				751	* up refiling the buffer on the original inode's dirty list again, so
				752	* there is a chance we will end up with a buffer queued for write but
				753	* not yet completed on that list. So, as a final cleanup we go through
				754	* the osync code to catch these locked, dirty buffers without requeuing
				755	* any newly dirty buffers for write.
				756	*/
				757	static int fsync_buffers_list(spinlock_t lock, struct list_head list)
				758	{
				759	struct buffer_head *bh;
				760	struct list_head tmp;
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	761	struct address_space mapping, prev_mapping = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	762	int err = 0, err2;
				763
				764	INIT_LIST_HEAD(&tmp);
				765
				766	spin_lock(lock);
				767	while (!list_empty(list)) {
				768	bh = BH_ENTRY(list->next);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	769	mapping = bh->b_assoc_map;
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	770	__remove_assoc_queue(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	771	/* Avoid race with mark_buffer_dirty_inode() which does
				772	* a lockless check and we rely on seeing the dirty bit */
				773	smp_mb();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	774	if (buffer_dirty(bh) \|\| buffer_locked(bh)) {
				775	list_add(&bh->b_assoc_buffers, &tmp);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	776	bh->b_assoc_map = mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	777	if (buffer_dirty(bh)) {
				778	get_bh(bh);
				779	spin_unlock(lock);
				780	/*
				781	* Ensure any pending I/O completes so that
				782	* ll_rw_block() actually writes the current
				783	* contents - it is a noop if I/O is still in
				784	* flight on potentially older contents.
				785	*/
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	786	ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
				787
				788	/*
				789	* Kick off IO for the previous mapping. Note
				790	* that we will not run the very last mapping,
				791	* wait_on_buffer() will do that for us
				792	* through sync_buffer().
				793	*/
				794	if (prev_mapping && prev_mapping != mapping)
				795	blk_run_address_space(prev_mapping);
				796	prev_mapping = mapping;
				797
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	798	brelse(bh);
				799	spin_lock(lock);
				800	}
				801	}
				802	}
				803
				804	while (!list_empty(&tmp)) {
				805	bh = BH_ENTRY(tmp.prev);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	806	get_bh(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	807	mapping = bh->b_assoc_map;
				808	__remove_assoc_queue(bh);
				809	/* Avoid race with mark_buffer_dirty_inode() which does
				810	* a lockless check and we rely on seeing the dirty bit */
				811	smp_mb();
				812	if (buffer_dirty(bh)) {
				813	list_add(&bh->b_assoc_buffers,
Jan Kara	e389229	2008-03-04 14:28:33 -0800	[diff] [blame]	814	&mapping->private_list);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	815	bh->b_assoc_map = mapping;
				816	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	817	spin_unlock(lock);
				818	wait_on_buffer(bh);
				819	if (!buffer_uptodate(bh))
				820	err = -EIO;
				821	brelse(bh);
				822	spin_lock(lock);
				823	}
				824
				825	spin_unlock(lock);
				826	err2 = osync_buffers_list(lock, list);
				827	if (err)
				828	return err;
				829	else
				830	return err2;
				831	}
				832
				833	/*
				834	* Invalidate any and all dirty buffers on a given inode. We are
				835	* probably unmounting the fs, but that doesn't mean we have already
				836	* done a sync(). Just drop the buffers from the inode list.
				837	*
				838	* NOTE: we take the inode's blockdev's mapping's private_lock. Which
				839	* assumes that all the buffers are against the blockdev. Not true
				840	* for reiserfs.
				841	*/
				842	void invalidate_inode_buffers(struct inode *inode)
				843	{
				844	if (inode_has_buffers(inode)) {
				845	struct address_space *mapping = &inode->i_data;
				846	struct list_head *list = &mapping->private_list;
				847	struct address_space *buffer_mapping = mapping->assoc_mapping;
				848
				849	spin_lock(&buffer_mapping->private_lock);
				850	while (!list_empty(list))
				851	__remove_assoc_queue(BH_ENTRY(list->next));
				852	spin_unlock(&buffer_mapping->private_lock);
				853	}
				854	}
Jan Kara	52b19ac	2008-09-23 18:24:08 +0200	[diff] [blame]	855	EXPORT_SYMBOL(invalidate_inode_buffers);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	856
				857	/*
				858	* Remove any clean buffers from the inode's buffer list. This is called
				859	* when we're trying to free the inode itself. Those buffers can pin it.
				860	*
				861	* Returns true if all buffers were removed.
				862	*/
				863	int remove_inode_buffers(struct inode *inode)
				864	{
				865	int ret = 1;
				866
				867	if (inode_has_buffers(inode)) {
				868	struct address_space *mapping = &inode->i_data;
				869	struct list_head *list = &mapping->private_list;
				870	struct address_space *buffer_mapping = mapping->assoc_mapping;
				871
				872	spin_lock(&buffer_mapping->private_lock);
				873	while (!list_empty(list)) {
				874	struct buffer_head *bh = BH_ENTRY(list->next);
				875	if (buffer_dirty(bh)) {
				876	ret = 0;
				877	break;
				878	}
				879	__remove_assoc_queue(bh);
				880	}
				881	spin_unlock(&buffer_mapping->private_lock);
				882	}
				883	return ret;
				884	}
				885
				886	/*
				887	* Create the appropriate buffers when given a page for data area and
				888	* the size of each buffer.. Use the bh->b_this_page linked list to
				889	* follow the buffers created. Return NULL if unable to create more
				890	* buffers.
				891	*
				892	* The retry flag is used to differentiate async IO (paging, swapping)
				893	* which may not fail from ordinary buffer allocations.
				894	*/
				895	struct buffer_head alloc_page_buffers(struct page page, unsigned long size,
				896	int retry)
				897	{
				898	struct buffer_head bh, head;
				899	long offset;
				900
				901	try_again:
				902	head = NULL;
				903	offset = PAGE_SIZE;
				904	while ((offset -= size) >= 0) {
				905	bh = alloc_buffer_head(GFP_NOFS);
				906	if (!bh)
				907	goto no_grow;
				908
				909	bh->b_bdev = NULL;
				910	bh->b_this_page = head;
				911	bh->b_blocknr = -1;
				912	head = bh;
				913
				914	bh->b_state = 0;
				915	atomic_set(&bh->b_count, 0);
Chris Mason	fc5cd58	2006-02-01 03:06:48 -0800	[diff] [blame]	916	bh->b_private = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	917	bh->b_size = size;
				918
				919	/* Link the buffer to its page */
				920	set_bh_page(bh, page, offset);
				921
Nathan Scott	01ffe33	2006-01-17 09:02:07 +1100	[diff] [blame]	922	init_buffer(bh, NULL, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	923	}
				924	return head;
				925	/*
				926	* In case anything failed, we just free everything we got.
				927	*/
				928	no_grow:
				929	if (head) {
				930	do {
				931	bh = head;
				932	head = head->b_this_page;
				933	free_buffer_head(bh);
				934	} while (head);
				935	}
				936
				937	/*
				938	* Return failure for non-async IO requests. Async IO requests
				939	* are not allowed to fail, so we have to wait until buffer heads
				940	* become available. But we don't want tasks sleeping with
				941	* partially complete buffers, so all were released above.
				942	*/
				943	if (!retry)
				944	return NULL;
				945
				946	/* We're _really_ low on memory. Now we just
				947	* wait for old buffer heads to become free due to
				948	* finishing IO. Since this is an async request and
				949	* the reserve list is empty, we're sure there are
				950	* async buffer heads in use.
				951	*/
				952	free_more_memory();
				953	goto try_again;
				954	}
				955	EXPORT_SYMBOL_GPL(alloc_page_buffers);
				956
				957	static inline void
				958	link_dev_buffers(struct page page, struct buffer_head head)
				959	{
				960	struct buffer_head bh, tail;
				961
				962	bh = head;
				963	do {
				964	tail = bh;
				965	bh = bh->b_this_page;
				966	} while (bh);
				967	tail->b_this_page = head;
				968	attach_page_buffers(page, head);
				969	}
				970
				971	/*
				972	* Initialise the state of a blockdev page's buffers.
				973	*/
				974	static void
				975	init_page_buffers(struct page page, struct block_device bdev,
				976	sector_t block, int size)
				977	{
				978	struct buffer_head *head = page_buffers(page);
				979	struct buffer_head *bh = head;
				980	int uptodate = PageUptodate(page);
				981
				982	do {
				983	if (!buffer_mapped(bh)) {
				984	init_buffer(bh, NULL, NULL);
				985	bh->b_bdev = bdev;
				986	bh->b_blocknr = block;
				987	if (uptodate)
				988	set_buffer_uptodate(bh);
				989	set_buffer_mapped(bh);
				990	}
				991	block++;
				992	bh = bh->b_this_page;
				993	} while (bh != head);
				994	}
				995
				996	/*
				997	* Create the page-cache page that contains the requested block.
				998	*
				999	* This is user purely for blockdev mappings.
				1000	*/
				1001	static struct page *
				1002	grow_dev_page(struct block_device *bdev, sector_t block,
				1003	pgoff_t index, int size)
				1004	{
				1005	struct inode *inode = bdev->bd_inode;
				1006	struct page *page;
				1007	struct buffer_head *bh;
				1008
Christoph Lameter	ea12589	2007-05-16 22:11:21 -0700	[diff] [blame]	1009	page = find_or_create_page(inode->i_mapping, index,
Mel Gorman	769848c	2007-07-17 04:03:05 -0700	[diff] [blame]	1010	(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)\|__GFP_MOVABLE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1011	if (!page)
				1012	return NULL;
				1013
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1014	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1015
				1016	if (page_has_buffers(page)) {
				1017	bh = page_buffers(page);
				1018	if (bh->b_size == size) {
				1019	init_page_buffers(page, bdev, block, size);
				1020	return page;
				1021	}
				1022	if (!try_to_free_buffers(page))
				1023	goto failed;
				1024	}
				1025
				1026	/*
				1027	* Allocate some buffers for this page
				1028	*/
				1029	bh = alloc_page_buffers(page, size, 0);
				1030	if (!bh)
				1031	goto failed;
				1032
				1033	/*
				1034	* Link the page to the buffers and initialise them. Take the
				1035	* lock to be atomic wrt __find_get_block(), which does not
				1036	* run under the page lock.
				1037	*/
				1038	spin_lock(&inode->i_mapping->private_lock);
				1039	link_dev_buffers(page, bh);
				1040	init_page_buffers(page, bdev, block, size);
				1041	spin_unlock(&inode->i_mapping->private_lock);
				1042	return page;
				1043
				1044	failed:
				1045	BUG();
				1046	unlock_page(page);
				1047	page_cache_release(page);
				1048	return NULL;
				1049	}
				1050
				1051	/*
				1052	* Create buffers for the specified block device block's page. If
				1053	* that page was dirty, the buffers are set dirty also.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1054	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1055	static int
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1056	grow_buffers(struct block_device *bdev, sector_t block, int size)
				1057	{
				1058	struct page *page;
				1059	pgoff_t index;
				1060	int sizebits;
				1061
				1062	sizebits = -1;
				1063	do {
				1064	sizebits++;
				1065	} while ((size << sizebits) < PAGE_SIZE);
				1066
				1067	index = block >> sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1068
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1069	/*
				1070	* Check for a block which wants to lie outside our maximum possible
				1071	* pagecache index. (this comparison is done using sector_t types).
				1072	*/
				1073	if (unlikely(index != block >> sizebits)) {
				1074	char b[BDEVNAME_SIZE];
				1075
				1076	printk(KERN_ERR "%s: requested out-of-range block %llu for "
				1077	"device %s\n",
Harvey Harrison	8e24eea	2008-04-30 00:55:09 -0700	[diff] [blame]	1078	__func__, (unsigned long long)block,
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1079	bdevname(bdev, b));
				1080	return -EIO;
				1081	}
				1082	block = index << sizebits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1083	/* Create a page with the proper size buffers.. */
				1084	page = grow_dev_page(bdev, block, index, size);
				1085	if (!page)
				1086	return 0;
				1087	unlock_page(page);
				1088	page_cache_release(page);
				1089	return 1;
				1090	}
				1091
Adrian Bunk	75c96f8	2005-05-05 16:16:09 -0700	[diff] [blame]	1092	static struct buffer_head *
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1093	__getblk_slow(struct block_device *bdev, sector_t block, int size)
				1094	{
				1095	/* Size must be multiple of hard sectorsize */
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1096	if (unlikely(size & (bdev_logical_block_size(bdev)-1) \|\|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1097	(size < 512 \|\| size > PAGE_SIZE))) {
				1098	printk(KERN_ERR "getblk(): invalid block size %d requested\n",
				1099	size);
Martin K. Petersen	e1defc4	2009-05-22 17:17:49 -0400	[diff] [blame]	1100	printk(KERN_ERR "logical block size: %d\n",
				1101	bdev_logical_block_size(bdev));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1102
				1103	dump_stack();
				1104	return NULL;
				1105	}
				1106
				1107	for (;;) {
				1108	struct buffer_head * bh;
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1109	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1110
				1111	bh = __find_get_block(bdev, block, size);
				1112	if (bh)
				1113	return bh;
				1114
Andrew Morton	e565793	2006-10-11 01:21:46 -0700	[diff] [blame]	1115	ret = grow_buffers(bdev, block, size);
				1116	if (ret < 0)
				1117	return NULL;
				1118	if (ret == 0)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1119	free_more_memory();
				1120	}
				1121	}
				1122
				1123	/*
				1124	* The relationship between dirty buffers and dirty pages:
				1125	*
				1126	* Whenever a page has any dirty buffers, the page's dirty bit is set, and
				1127	* the page is tagged dirty in its radix tree.
				1128	*
				1129	* At all times, the dirtiness of the buffers represents the dirtiness of
				1130	* subsections of the page. If the page has buffers, the page dirty bit is
				1131	* merely a hint about the true dirty state.
				1132	*
				1133	* When a page is set dirty in its entirety, all its buffers are marked dirty
				1134	* (if the page has buffers).
				1135	*
				1136	* When a buffer is marked dirty, its page is dirtied, but the page's other
				1137	* buffers are not.
				1138	*
				1139	* Also. When blockdev buffers are explicitly read with bread(), they
				1140	* individually become uptodate. But their backing page remains not
				1141	* uptodate - even if all of its buffers are uptodate. A subsequent
				1142	* block_read_full_page() against that page will discover all the uptodate
				1143	* buffers, will set the page uptodate and will perform no I/O.
				1144	*/
				1145
				1146	/**
				1147	* mark_buffer_dirty - mark a buffer_head as needing writeout
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1148	* @bh: the buffer_head to mark dirty
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1149	*
				1150	* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
				1151	* backing page dirty, then tag the page as dirty in its address_space's radix
				1152	* tree and then attach the address_space's inode to its superblock's dirty
				1153	* inode list.
				1154	*
				1155	* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
				1156	* mapping->tree_lock and the global inode_lock.
				1157	*/
Harvey Harrison	fc9b52c	2008-02-08 04:19:52 -0800	[diff] [blame]	1158	void mark_buffer_dirty(struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1159	{
Nick Piggin	787d221	2007-07-17 04:03:34 -0700	[diff] [blame]	1160	WARN_ON_ONCE(!buffer_uptodate(bh));
Linus Torvalds	1be62dc	2008-04-04 14:38:17 -0700	[diff] [blame]	1161
				1162	/*
				1163	* Very carefully optimize the it-is-already-dirty case.
				1164	*
				1165	* Don't let the final "is it dirty" escape to before we
				1166	* perhaps modified the buffer.
				1167	*/
				1168	if (buffer_dirty(bh)) {
				1169	smp_mb();
				1170	if (buffer_dirty(bh))
				1171	return;
				1172	}
				1173
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1174	if (!test_set_buffer_dirty(bh)) {
				1175	struct page *page = bh->b_page;
Linus Torvalds	8e9d78e	2009-08-21 17:40:08 -0700	[diff] [blame]	1176	if (!TestSetPageDirty(page)) {
				1177	struct address_space *mapping = page_mapping(page);
				1178	if (mapping)
				1179	__set_page_dirty(page, mapping, 0);
				1180	}
Linus Torvalds	a8e7d49	2009-03-19 11:32:05 -0700	[diff] [blame]	1181	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1182	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1183	EXPORT_SYMBOL(mark_buffer_dirty);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1184
				1185	/*
				1186	* Decrement a buffer_head's reference count. If all buffers against a page
				1187	* have zero reference count, are clean and unlocked, and if the page is clean
				1188	* and unlocked then try_to_free_buffers() may strip the buffers from the page
				1189	* in preparation for freeing it (sometimes, rarely, buffers are removed from
				1190	* a page but it ends up not being freed, and buffers may later be reattached).
				1191	*/
				1192	void __brelse(struct buffer_head * buf)
				1193	{
				1194	if (atomic_read(&buf->b_count)) {
				1195	put_bh(buf);
				1196	return;
				1197	}
Arjan van de Ven	5c752ad	2008-07-25 19:45:40 -0700	[diff] [blame]	1198	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1199	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1200	EXPORT_SYMBOL(__brelse);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1201
				1202	/*
				1203	* bforget() is like brelse(), except it discards any
				1204	* potentially dirty data.
				1205	*/
				1206	void __bforget(struct buffer_head *bh)
				1207	{
				1208	clear_buffer_dirty(bh);
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	1209	if (bh->b_assoc_map) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1210	struct address_space *buffer_mapping = bh->b_page->mapping;
				1211
				1212	spin_lock(&buffer_mapping->private_lock);
				1213	list_del_init(&bh->b_assoc_buffers);
Jan Kara	58ff407	2006-10-17 00:10:19 -0700	[diff] [blame]	1214	bh->b_assoc_map = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1215	spin_unlock(&buffer_mapping->private_lock);
				1216	}
				1217	__brelse(bh);
				1218	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	1219	EXPORT_SYMBOL(__bforget);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1220
				1221	static struct buffer_head __bread_slow(struct buffer_head bh)
				1222	{
				1223	lock_buffer(bh);
				1224	if (buffer_uptodate(bh)) {
				1225	unlock_buffer(bh);
				1226	return bh;
				1227	} else {
				1228	get_bh(bh);
				1229	bh->b_end_io = end_buffer_read_sync;
				1230	submit_bh(READ, bh);
				1231	wait_on_buffer(bh);
				1232	if (buffer_uptodate(bh))
				1233	return bh;
				1234	}
				1235	brelse(bh);
				1236	return NULL;
				1237	}
				1238
/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE	8

struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* One LRU per CPU, with every slot initially NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * On SMP builds the LRU "lock" disables local interrupts; otherwise it only
 * disables preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
				1268
/*
 * BUG if called with interrupts disabled.  The check is compiled in only
 * when irqs_disabled is defined as a preprocessor macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
	BUG_ON(irqs_disabled());
#endif
}
				1275
				1276	/*
				1277	* The LRU management algorithm is dopey-but-simple. Sorry.
				1278	*/
				1279	static void bh_lru_install(struct buffer_head *bh)
				1280	{
				1281	struct buffer_head *evictee = NULL;
				1282	struct bh_lru *lru;
				1283
				1284	check_irqs_on();
				1285	bh_lru_lock();
				1286	lru = &__get_cpu_var(bh_lrus);
				1287	if (lru->bhs[0] != bh) {
				1288	struct buffer_head *bhs[BH_LRU_SIZE];
				1289	int in;
				1290	int out = 0;
				1291
				1292	get_bh(bh);
				1293	bhs[out++] = bh;
				1294	for (in = 0; in < BH_LRU_SIZE; in++) {
				1295	struct buffer_head *bh2 = lru->bhs[in];
				1296
				1297	if (bh2 == bh) {
				1298	__brelse(bh2);
				1299	} else {
				1300	if (out >= BH_LRU_SIZE) {
				1301	BUG_ON(evictee != NULL);
				1302	evictee = bh2;
				1303	} else {
				1304	bhs[out++] = bh2;
				1305	}
				1306	}
				1307	}
				1308	while (out < BH_LRU_SIZE)
				1309	bhs[out++] = NULL;
				1310	memcpy(lru->bhs, bhs, sizeof(bhs));
				1311	}
				1312	bh_lru_unlock();
				1313
				1314	if (evictee)
				1315	__brelse(evictee);
				1316	}
				1317
				1318	/*
				1319	* Look up the bh in this cpu's LRU. If it's there, move it to the head.
				1320	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1321	static struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1322	lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1323	{
				1324	struct buffer_head *ret = NULL;
				1325	struct bh_lru *lru;
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1326	unsigned int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1327
				1328	check_irqs_on();
				1329	bh_lru_lock();
				1330	lru = &__get_cpu_var(bh_lrus);
				1331	for (i = 0; i < BH_LRU_SIZE; i++) {
				1332	struct buffer_head *bh = lru->bhs[i];
				1333
				1334	if (bh && bh->b_bdev == bdev &&
				1335	bh->b_blocknr == block && bh->b_size == size) {
				1336	if (i) {
				1337	while (i) {
				1338	lru->bhs[i] = lru->bhs[i - 1];
				1339	i--;
				1340	}
				1341	lru->bhs[0] = bh;
				1342	}
				1343	get_bh(bh);
				1344	ret = bh;
				1345	break;
				1346	}
				1347	}
				1348	bh_lru_unlock();
				1349	return ret;
				1350	}
				1351
				1352	/*
				1353	* Perform a pagecache lookup for the matching buffer. If it's there, refresh
				1354	* it in the LRU and mark it as accessed. If it is not present then return
				1355	* NULL
				1356	*/
				1357	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1358	__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1359	{
				1360	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
				1361
				1362	if (bh == NULL) {
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1363	bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1364	if (bh)
				1365	bh_lru_install(bh);
				1366	}
				1367	if (bh)
				1368	touch_buffer(bh);
				1369	return bh;
				1370	}
				1371	EXPORT_SYMBOL(__find_get_block);
				1372
				1373	/*
				1374	* __getblk will locate (and, if necessary, create) the buffer_head
				1375	* which corresponds to the passed block_device, block and size. The
				1376	* returned buffer has its reference count incremented.
				1377	*
				1378	* __getblk() cannot fail - it just keeps trying. If you pass it an
				1379	* illegal block number, __getblk() will happily return a buffer_head
				1380	* which represents the non-existent block. Very weird.
				1381	*
				1382	* __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
				1383	* attempt is failing. FIXME, perhaps?
				1384	*/
				1385	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1386	__getblk(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1387	{
				1388	struct buffer_head *bh = __find_get_block(bdev, block, size);
				1389
				1390	might_sleep();
				1391	if (bh == NULL)
				1392	bh = __getblk_slow(bdev, block, size);
				1393	return bh;
				1394	}
				1395	EXPORT_SYMBOL(__getblk);
				1396
				1397	/*
				1398	* Do async read-ahead on a buffer..
				1399	*/
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1400	void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1401	{
				1402	struct buffer_head *bh = __getblk(bdev, block, size);
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1403	if (likely(bh)) {
				1404	ll_rw_block(READA, 1, &bh);
				1405	brelse(bh);
				1406	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1407	}
				1408	EXPORT_SYMBOL(__breadahead);
				1409
				1410	/**
				1411	* __bread() - reads a specified block and returns the bh
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	1412	* @bdev: the block_device to read from
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1413	* @block: number of block
				1414	* @size: size (in bytes) to read
				1415	*
				1416	* Reads a specified block, and returns buffer head that contains it.
				1417	* It returns NULL if the block was unreadable.
				1418	*/
				1419	struct buffer_head *
Tomasz Kvarsin	3991d3b	2007-02-12 00:52:14 -0800	[diff] [blame]	1420	__bread(struct block_device *bdev, sector_t block, unsigned size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1421	{
				1422	struct buffer_head *bh = __getblk(bdev, block, size);
				1423
Andrew Morton	a3e713b	2005-10-30 15:03:15 -0800	[diff] [blame]	1424	if (likely(bh) && !buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1425	bh = __bread_slow(bh);
				1426	return bh;
				1427	}
				1428	EXPORT_SYMBOL(__bread);
				1429
				1430	/*
				1431	* invalidate_bh_lrus() is called rarely - but not only at unmount.
				1432	* This doesn't race because it runs in each cpu either in irq
				1433	* or with preempt disabled.
				1434	*/
				1435	static void invalidate_bh_lru(void *arg)
				1436	{
				1437	struct bh_lru *b = &get_cpu_var(bh_lrus);
				1438	int i;
				1439
				1440	for (i = 0; i < BH_LRU_SIZE; i++) {
				1441	brelse(b->bhs[i]);
				1442	b->bhs[i] = NULL;
				1443	}
				1444	put_cpu_var(bh_lrus);
				1445	}
				1446
/*
 * Empty the buffer LRU of every CPU by running invalidate_bh_lru() on each.
 * NOTE(review): the final argument 1 presumably asks on_each_cpu() to wait
 * for completion - confirm against its definition.
 */
void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1452
				1453	void set_bh_page(struct buffer_head *bh,
				1454	struct page *page, unsigned long offset)
				1455	{
				1456	bh->b_page = page;
Eric Sesterhenn	e827f92	2006-03-26 18:24:46 +0200	[diff] [blame]	1457	BUG_ON(offset >= PAGE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1458	if (PageHighMem(page))
				1459	/*
				1460	* This catches illegal uses and preserves the offset:
				1461	*/
				1462	bh->b_data = (char *)(0 + offset);
				1463	else
				1464	bh->b_data = page_address(page) + offset;
				1465	}
				1466	EXPORT_SYMBOL(set_bh_page);
				1467
				1468	/*
				1469	* Called when truncating a buffer on a page completely.
				1470	*/
Arjan van de Ven	858119e	2006-01-14 13:20:43 -0800	[diff] [blame]	1471	static void discard_buffer(struct buffer_head * bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1472	{
				1473	lock_buffer(bh);
				1474	clear_buffer_dirty(bh);
				1475	bh->b_bdev = NULL;
				1476	clear_buffer_mapped(bh);
				1477	clear_buffer_req(bh);
				1478	clear_buffer_new(bh);
				1479	clear_buffer_delay(bh);
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1480	clear_buffer_unwritten(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1481	unlock_buffer(bh);
				1482	}
				1483
				1484	/**
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1485	* block_invalidatepage - invalidate part of all of a buffer-backed page
				1486	*
				1487	* @page: the page which is affected
				1488	* @offset: the index of the truncation point
				1489	*
				1490	* block_invalidatepage() is called when all or part of the page has become
				1491	* invalidatedby a truncate operation.
				1492	*
				1493	* block_invalidatepage() does not have to release all buffers, but it must
				1494	* ensure that no dirty buffer is left outside @offset and that no I/O
				1495	* is underway against any of the blocks which are outside the truncation
				1496	* point. Because the caller is about to free (and possibly reuse) those
				1497	* blocks on-disk.
				1498	*/
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1499	void block_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1500	{
				1501	struct buffer_head head, bh, *next;
				1502	unsigned int curr_off = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1503
				1504	BUG_ON(!PageLocked(page));
				1505	if (!page_has_buffers(page))
				1506	goto out;
				1507
				1508	head = page_buffers(page);
				1509	bh = head;
				1510	do {
				1511	unsigned int next_off = curr_off + bh->b_size;
				1512	next = bh->b_this_page;
				1513
				1514	/*
				1515	* is this block fully invalidated?
				1516	*/
				1517	if (offset <= curr_off)
				1518	discard_buffer(bh);
				1519	curr_off = next_off;
				1520	bh = next;
				1521	} while (bh != head);
				1522
				1523	/*
				1524	* We release buffers only if the entire page is being invalidated.
				1525	* The get_block cached value has been unconditionally invalidated,
				1526	* so real IO is not possible anymore.
				1527	*/
				1528	if (offset == 0)
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1529	try_to_release_page(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1530	out:
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1531	return;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1532	}
				1533	EXPORT_SYMBOL(block_invalidatepage);
				1534
				1535	/*
				1536	* We attach and possibly dirty the buffers atomically wrt
				1537	* __set_page_dirty_buffers() via private_lock. try_to_free_buffers
				1538	* is already excluded via the page lock.
				1539	*/
				1540	void create_empty_buffers(struct page *page,
				1541	unsigned long blocksize, unsigned long b_state)
				1542	{
				1543	struct buffer_head bh, head, *tail;
				1544
				1545	head = alloc_page_buffers(page, blocksize, 1);
				1546	bh = head;
				1547	do {
				1548	bh->b_state \|= b_state;
				1549	tail = bh;
				1550	bh = bh->b_this_page;
				1551	} while (bh);
				1552	tail->b_this_page = head;
				1553
				1554	spin_lock(&page->mapping->private_lock);
				1555	if (PageUptodate(page) \|\| PageDirty(page)) {
				1556	bh = head;
				1557	do {
				1558	if (PageDirty(page))
				1559	set_buffer_dirty(bh);
				1560	if (PageUptodate(page))
				1561	set_buffer_uptodate(bh);
				1562	bh = bh->b_this_page;
				1563	} while (bh != head);
				1564	}
				1565	attach_page_buffers(page, head);
				1566	spin_unlock(&page->mapping->private_lock);
				1567	}
				1568	EXPORT_SYMBOL(create_empty_buffers);
				1569
				1570	/*
				1571	* We are taking a block for data and we don't want any output from any
				1572	* buffer-cache aliases starting from return from that function and
				1573	* until the moment when something will explicitly mark the buffer
				1574	* dirty (hopefully that will not happen until we will free that block ;-)
				1575	* We don't even need to mark it not-uptodate - nobody can expect
				1576	* anything from a newly allocated buffer anyway. We used to used
				1577	* unmap_buffer() for such invalidation, but that was wrong. We definitely
				1578	* don't want to mark the alias unmapped, for example - it would confuse
				1579	* anyone who might pick it with bread() afterwards...
				1580	*
				1581	* Also.. Note that bforget() doesn't lock the buffer. So there can
				1582	* be writeout I/O going on against recently-freed buffers. We don't
				1583	* wait on that I/O in bforget() - it's more efficient to wait on the I/O
				1584	* only if we really need to. That happens here.
				1585	*/
				1586	void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
				1587	{
				1588	struct buffer_head *old_bh;
				1589
				1590	might_sleep();
				1591
Coywolf Qi Hunt	385fd4c	2005-11-07 00:59:39 -0800	[diff] [blame]	1592	old_bh = __find_get_block_slow(bdev, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1593	if (old_bh) {
				1594	clear_buffer_dirty(old_bh);
				1595	wait_on_buffer(old_bh);
				1596	clear_buffer_req(old_bh);
				1597	__brelse(old_bh);
				1598	}
				1599	}
				1600	EXPORT_SYMBOL(unmap_underlying_metadata);
				1601
				1602	/*
				1603	* NOTE! All mapped/uptodate combinations are valid:
				1604	*
				1605	* Mapped Uptodate Meaning
				1606	*
				1607	* No No "unknown" - must do get_block()
				1608	* No Yes "hole" - zero-filled
				1609	* Yes No "allocated" - allocated on disk, not read in
				1610	* Yes Yes "valid" - allocated and up-to-date in memory.
				1611	*
				1612	* "Dirty" is valid only with the last case (mapped+uptodate).
				1613	*/
				1614
				1615	/*
				1616	* While block_write_full_page is writing back the dirty buffers under
				1617	* the page lock, whoever dirtied the buffers may decide to clean them
				1618	* again at any time. We handle that by only looking at the buffer
				1619	* state inside lock_buffer().
				1620	*
				1621	* If block_write_full_page() is called for regular writeback
				1622	* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
				1623	* locked buffer. This only can happen if someone has written the buffer
				1624	* directly, with submit_bh(). At the address_space level PageWriteback
				1625	* prevents this contention from occurring.
Theodore Ts'o	6e34eedd	2009-04-07 18:12:43 -0400	[diff] [blame]	1626	*
				1627	* If block_write_full_page() is called with wbc->sync_mode ==
				1628	* WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
				1629	* causes the writes to be flagged as synchronous writes, but the
				1630	* block device queue will NOT be unplugged, since usually many pages
				1631	* will be pushed to the out before the higher-level caller actually
				1632	* waits for the writes to be completed. The various wait functions,
				1633	* such as wait_on_writeback_range() will ultimately call sync_page()
				1634	* which will ultimately call blk_run_backing_dev(), which will end up
				1635	* unplugging the device queue.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1636	*/
				1637	static int __block_write_full_page(struct inode inode, struct page page,
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1638	get_block_t get_block, struct writeback_control wbc,
				1639	bh_end_io_t *handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1640	{
				1641	int err;
				1642	sector_t block;
				1643	sector_t last_block;
Andrew Morton	f0fbd5f	2005-05-05 16:15:48 -0700	[diff] [blame]	1644	struct buffer_head bh, head;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1645	const unsigned blocksize = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1646	int nr_underway = 0;
Theodore Ts'o	6e34eedd	2009-04-07 18:12:43 -0400	[diff] [blame]	1647	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
				1648	WRITE_SYNC_PLUG : WRITE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1649
				1650	BUG_ON(!PageLocked(page));
				1651
				1652	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
				1653
				1654	if (!page_has_buffers(page)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1655	create_empty_buffers(page, blocksize,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1656	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1657	}
				1658
				1659	/*
				1660	* Be very careful. We have no exclusion from __set_page_dirty_buffers
				1661	* here, and the (potentially unmapped) buffers may become dirty at
				1662	* any time. If a buffer becomes dirty here after we've inspected it
				1663	* then we just miss that fact, and the page stays dirty.
				1664	*
				1665	* Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
				1666	* handle that here by just cleaning them.
				1667	*/
				1668
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	1669	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1670	head = page_buffers(page);
				1671	bh = head;
				1672
				1673	/*
				1674	* Get all the dirty buffers mapped to disk addresses and
				1675	* handle any aliases from the underlying blockdev's mapping.
				1676	*/
				1677	do {
				1678	if (block > last_block) {
				1679	/*
				1680	* mapped buffers outside i_size will occur, because
				1681	* this page can be outside i_size when there is a
				1682	* truncate in progress.
				1683	*/
				1684	/*
				1685	* The buffer was zeroed by block_write_full_page()
				1686	*/
				1687	clear_buffer_dirty(bh);
				1688	set_buffer_uptodate(bh);
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1689	} else if ((!buffer_mapped(bh) \|\| buffer_delay(bh)) &&
				1690	buffer_dirty(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1691	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1692	err = get_block(inode, block, bh, 1);
				1693	if (err)
				1694	goto recover;
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1695	clear_buffer_delay(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1696	if (buffer_new(bh)) {
				1697	/* blockdev mappings never come here */
				1698	clear_buffer_new(bh);
				1699	unmap_underlying_metadata(bh->b_bdev,
				1700	bh->b_blocknr);
				1701	}
				1702	}
				1703	bh = bh->b_this_page;
				1704	block++;
				1705	} while (bh != head);
				1706
				1707	do {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1708	if (!buffer_mapped(bh))
				1709	continue;
				1710	/*
				1711	* If it's a fully non-blocking write attempt and we cannot
				1712	* lock the buffer then redirty the page. Note that this can
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	1713	* potentially cause a busy-wait loop from writeback threads
				1714	* and kswapd activity, but those code paths have their own
				1715	* higher-level throttling.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1716	*/
				1717	if (wbc->sync_mode != WB_SYNC_NONE \|\| !wbc->nonblocking) {
				1718	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	1719	} else if (!trylock_buffer(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1720	redirty_page_for_writepage(wbc, page);
				1721	continue;
				1722	}
				1723	if (test_clear_buffer_dirty(bh)) {
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1724	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1725	} else {
				1726	unlock_buffer(bh);
				1727	}
				1728	} while ((bh = bh->b_this_page) != head);
				1729
				1730	/*
				1731	* The page and its buffers are protected by PageWriteback(), so we can
				1732	* drop the bh refcounts early.
				1733	*/
				1734	BUG_ON(PageWriteback(page));
				1735	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1736
				1737	do {
				1738	struct buffer_head *next = bh->b_this_page;
				1739	if (buffer_async_write(bh)) {
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1740	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1741	nr_underway++;
				1742	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1743	bh = next;
				1744	} while (bh != head);
Andrew Morton	05937ba	2005-05-05 16:15:47 -0700	[diff] [blame]	1745	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1746
				1747	err = 0;
				1748	done:
				1749	if (nr_underway == 0) {
				1750	/*
				1751	* The page was marked dirty, but the buffers were
				1752	* clean. Someone wrote them back by hand with
				1753	* ll_rw_block/submit_bh. A rare case.
				1754	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1755	end_page_writeback(page);
Nick Piggin	3d67f2d	2007-05-06 14:49:05 -0700	[diff] [blame]	1756
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	/*
				1758	* The page and buffer_heads can be released at any time from
				1759	* here on.
				1760	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1761	}
				1762	return err;
				1763
				1764	recover:
				1765	/*
				1766	* ENOSPC, or some other error. We may already have added some
				1767	* blocks to the file, so we need to write these out to avoid
				1768	* exposing stale data.
				1769	* The page is currently locked and not marked for writeback
				1770	*/
				1771	bh = head;
				1772	/* Recovery: lock and submit the mapped buffers */
				1773	do {
Alex Tomas	29a814d	2008-07-11 19:27:31 -0400	[diff] [blame]	1774	if (buffer_mapped(bh) && buffer_dirty(bh) &&
				1775	!buffer_delay(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1776	lock_buffer(bh);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	1777	mark_buffer_async_write_endio(bh, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1778	} else {
				1779	/*
				1780	* The buffer may have been set dirty during
				1781	* attachment to a dirty page.
				1782	*/
				1783	clear_buffer_dirty(bh);
				1784	}
				1785	} while ((bh = bh->b_this_page) != head);
				1786	SetPageError(page);
				1787	BUG_ON(PageWriteback(page));
Andrew Morton	7e4c369	2007-05-08 00:23:27 -0700	[diff] [blame]	1788	mapping_set_error(page->mapping, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1789	set_page_writeback(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1790	do {
				1791	struct buffer_head *next = bh->b_this_page;
				1792	if (buffer_async_write(bh)) {
				1793	clear_buffer_dirty(bh);
Theodore Ts'o	a64c861	2009-03-27 22:14:10 -0400	[diff] [blame]	1794	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1795	nr_underway++;
				1796	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1797	bh = next;
				1798	} while (bh != head);
Nick Piggin	ffda9d3	2007-02-20 13:57:54 -0800	[diff] [blame]	1799	unlock_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1800	goto done;
				1801	}
				1802
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1803	/*
				1804	* If a page has any new buffers, zero them out here, and mark them uptodate
				1805	* and dirty so they'll be written out (in order to prevent uninitialised
				1806	* block data from leaking). And clear the new bit.
				1807	*/
				1808	void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
				1809	{
				1810	unsigned int block_start, block_end;
				1811	struct buffer_head head, bh;
				1812
				1813	BUG_ON(!PageLocked(page));
				1814	if (!page_has_buffers(page))
				1815	return;
				1816
				1817	bh = head = page_buffers(page);
				1818	block_start = 0;
				1819	do {
				1820	block_end = block_start + bh->b_size;
				1821
				1822	if (buffer_new(bh)) {
				1823	if (block_end > from && block_start < to) {
				1824	if (!PageUptodate(page)) {
				1825	unsigned start, size;
				1826
				1827	start = max(from, block_start);
				1828	size = min(to, block_end) - start;
				1829
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1830	zero_user(page, start, size);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1831	set_buffer_uptodate(bh);
				1832	}
				1833
				1834	clear_buffer_new(bh);
				1835	mark_buffer_dirty(bh);
				1836	}
				1837	}
				1838
				1839	block_start = block_end;
				1840	bh = bh->b_this_page;
				1841	} while (bh != head);
				1842	}
				1843	EXPORT_SYMBOL(page_zero_new_buffers);
				1844
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1845	static int __block_prepare_write(struct inode inode, struct page page,
				1846	unsigned from, unsigned to, get_block_t *get_block)
				1847	{
				1848	unsigned block_start, block_end;
				1849	sector_t block;
				1850	int err = 0;
				1851	unsigned blocksize, bbits;
				1852	struct buffer_head bh, head, wait[2], *wait_bh=wait;
				1853
				1854	BUG_ON(!PageLocked(page));
				1855	BUG_ON(from > PAGE_CACHE_SIZE);
				1856	BUG_ON(to > PAGE_CACHE_SIZE);
				1857	BUG_ON(from > to);
				1858
				1859	blocksize = 1 << inode->i_blkbits;
				1860	if (!page_has_buffers(page))
				1861	create_empty_buffers(page, blocksize, 0);
				1862	head = page_buffers(page);
				1863
				1864	bbits = inode->i_blkbits;
				1865	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
				1866
				1867	for(bh = head, block_start = 0; bh != head \|\| !block_start;
				1868	block++, block_start=block_end, bh = bh->b_this_page) {
				1869	block_end = block_start + blocksize;
				1870	if (block_end <= from \|\| block_start >= to) {
				1871	if (PageUptodate(page)) {
				1872	if (!buffer_uptodate(bh))
				1873	set_buffer_uptodate(bh);
				1874	}
				1875	continue;
				1876	}
				1877	if (buffer_new(bh))
				1878	clear_buffer_new(bh);
				1879	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	1880	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1881	err = get_block(inode, block, bh, 1);
				1882	if (err)
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1883	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1884	if (buffer_new(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1885	unmap_underlying_metadata(bh->b_bdev,
				1886	bh->b_blocknr);
				1887	if (PageUptodate(page)) {
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1888	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1889	set_buffer_uptodate(bh);
Nick Piggin	637aff4	2007-10-16 01:25:00 -0700	[diff] [blame]	1890	mark_buffer_dirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1891	continue;
				1892	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	1893	if (block_end > to \|\| block_start < from)
				1894	zero_user_segments(page,
				1895	to, block_end,
				1896	block_start, from);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1897	continue;
				1898	}
				1899	}
				1900	if (PageUptodate(page)) {
				1901	if (!buffer_uptodate(bh))
				1902	set_buffer_uptodate(bh);
				1903	continue;
				1904	}
				1905	if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	1906	!buffer_unwritten(bh) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1907	(block_start < from \|\| block_end > to)) {
				1908	ll_rw_block(READ, 1, &bh);
				1909	*wait_bh++=bh;
				1910	}
				1911	}
				1912	/*
				1913	* If we issued read requests - let them complete.
				1914	*/
				1915	while(wait_bh > wait) {
				1916	wait_on_buffer(*--wait_bh);
				1917	if (!buffer_uptodate(*wait_bh))
Nick Piggin	f3ddbdc	2005-05-05 16:15:45 -0700	[diff] [blame]	1918	err = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1919	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1920	if (unlikely(err))
				1921	page_zero_new_buffers(page, from, to);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1922	return err;
				1923	}
				1924
				1925	static int __block_commit_write(struct inode inode, struct page page,
				1926	unsigned from, unsigned to)
				1927	{
				1928	unsigned block_start, block_end;
				1929	int partial = 0;
				1930	unsigned blocksize;
				1931	struct buffer_head bh, head;
				1932
				1933	blocksize = 1 << inode->i_blkbits;
				1934
				1935	for(bh = head = page_buffers(page), block_start = 0;
				1936	bh != head \|\| !block_start;
				1937	block_start=block_end, bh = bh->b_this_page) {
				1938	block_end = block_start + blocksize;
				1939	if (block_end <= from \|\| block_start >= to) {
				1940	if (!buffer_uptodate(bh))
				1941	partial = 1;
				1942	} else {
				1943	set_buffer_uptodate(bh);
				1944	mark_buffer_dirty(bh);
				1945	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1946	clear_buffer_new(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1947	}
				1948
				1949	/*
				1950	* If this is a partial write which happened to make all buffers
				1951	* uptodate then we can optimize away a bogus readpage() for
				1952	* the next read(). Here we 'discover' whether the page went
				1953	* uptodate as a result of this (potentially partial) write.
				1954	*/
				1955	if (!partial)
				1956	SetPageUptodate(page);
				1957	return 0;
				1958	}
				1959
				1960	/*
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1961	* block_write_begin takes care of the basic task of block allocation and
				1962	* bringing partial write blocks uptodate first.
				1963	*
				1964	* If *pagep is not NULL, then block_write_begin uses the locked page
				1965	* at *pagep rather than allocating its own. In this case, the page will
				1966	* not be unlocked or deallocated on failure.
				1967	*/
				1968	int block_write_begin(struct file file, struct address_space mapping,
				1969	loff_t pos, unsigned len, unsigned flags,
				1970	struct page pagep, void fsdata,
				1971	get_block_t *get_block)
				1972	{
				1973	struct inode *inode = mapping->host;
				1974	int status = 0;
				1975	struct page *page;
				1976	pgoff_t index;
				1977	unsigned start, end;
				1978	int ownpage = 0;
				1979
				1980	index = pos >> PAGE_CACHE_SHIFT;
				1981	start = pos & (PAGE_CACHE_SIZE - 1);
				1982	end = start + len;
				1983
				1984	page = *pagep;
				1985	if (page == NULL) {
				1986	ownpage = 1;
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	1987	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	1988	if (!page) {
				1989	status = -ENOMEM;
				1990	goto out;
				1991	}
				1992	*pagep = page;
				1993	} else
				1994	BUG_ON(!PageLocked(page));
				1995
				1996	status = __block_prepare_write(inode, page, start, end, get_block);
				1997	if (unlikely(status)) {
				1998	ClearPageUptodate(page);
				1999
				2000	if (ownpage) {
				2001	unlock_page(page);
				2002	page_cache_release(page);
				2003	*pagep = NULL;
				2004
				2005	/*
				2006	* prepare_write() may have instantiated a few blocks
				2007	* outside i_size. Trim these off again. Don't need
				2008	* i_size_read because we hold i_mutex.
				2009	*/
				2010	if (pos + len > inode->i_size)
				2011	vmtruncate(inode, inode->i_size);
				2012	}
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2013	}
				2014
				2015	out:
				2016	return status;
				2017	}
				2018	EXPORT_SYMBOL(block_write_begin);
				2019
				2020	int block_write_end(struct file file, struct address_space mapping,
				2021	loff_t pos, unsigned len, unsigned copied,
				2022	struct page page, void fsdata)
				2023	{
				2024	struct inode *inode = mapping->host;
				2025	unsigned start;
				2026
				2027	start = pos & (PAGE_CACHE_SIZE - 1);
				2028
				2029	if (unlikely(copied < len)) {
				2030	/*
				2031	* The buffers that were written will now be uptodate, so we
				2032	* don't have to worry about a readpage reading them and
				2033	* overwriting a partial write. However if we have encountered
				2034	* a short write and only partially written into a buffer, it
				2035	* will not be marked uptodate, so a readpage might come in and
				2036	* destroy our partial write.
				2037	*
				2038	* Do the simplest thing, and just treat any short write to a
				2039	* non uptodate page as a zero-length write, and force the
				2040	* caller to redo the whole thing.
				2041	*/
				2042	if (!PageUptodate(page))
				2043	copied = 0;
				2044
				2045	page_zero_new_buffers(page, start+copied, start+len);
				2046	}
				2047	flush_dcache_page(page);
				2048
				2049	/* This could be a short (even 0-length) commit */
				2050	__block_commit_write(inode, page, start, start+copied);
				2051
				2052	return copied;
				2053	}
				2054	EXPORT_SYMBOL(block_write_end);
				2055
				2056	int generic_write_end(struct file file, struct address_space mapping,
				2057	loff_t pos, unsigned len, unsigned copied,
				2058	struct page page, void fsdata)
				2059	{
				2060	struct inode *inode = mapping->host;
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2061	int i_size_changed = 0;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2062
				2063	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
				2064
				2065	/*
				2066	* No need to use i_size_read() here, the i_size
				2067	* cannot change under us because we hold i_mutex.
				2068	*
				2069	* But it's important to update i_size while still holding page lock:
				2070	* page writeout could otherwise come in and zero beyond i_size.
				2071	*/
				2072	if (pos+copied > inode->i_size) {
				2073	i_size_write(inode, pos+copied);
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2074	i_size_changed = 1;
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2075	}
				2076
				2077	unlock_page(page);
				2078	page_cache_release(page);
				2079
Jan Kara	c7d206b	2008-07-11 19:27:31 -0400	[diff] [blame]	2080	/*
				2081	* Don't mark the inode dirty under page lock. First, it unnecessarily
				2082	* makes the holding time of page lock longer. Second, it forces lock
				2083	* ordering of page lock and transaction start for journaling
				2084	* filesystems.
				2085	*/
				2086	if (i_size_changed)
				2087	mark_inode_dirty(inode);
				2088
Nick Piggin	afddba4	2007-10-16 01:25:01 -0700	[diff] [blame]	2089	return copied;
				2090	}
				2091	EXPORT_SYMBOL(generic_write_end);
				2092
				2093	/*
Hisashi Hifumi	8ab22b9	2008-07-28 15:46:36 -0700	[diff] [blame]	2094	* block_is_partially_uptodate checks whether buffers within a page are
				2095	* uptodate or not.
				2096	*
				2097	* Returns true if all buffers which correspond to a file portion
				2098	* we want to read are uptodate.
				2099	*/
				2100	int block_is_partially_uptodate(struct page page, read_descriptor_t desc,
				2101	unsigned long from)
				2102	{
				2103	struct inode *inode = page->mapping->host;
				2104	unsigned block_start, block_end, blocksize;
				2105	unsigned to;
				2106	struct buffer_head bh, head;
				2107	int ret = 1;
				2108
				2109	if (!page_has_buffers(page))
				2110	return 0;
				2111
				2112	blocksize = 1 << inode->i_blkbits;
				2113	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
				2114	to = from + to;
				2115	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
				2116	return 0;
				2117
				2118	head = page_buffers(page);
				2119	bh = head;
				2120	block_start = 0;
				2121	do {
				2122	block_end = block_start + blocksize;
				2123	if (block_end > from && block_start < to) {
				2124	if (!buffer_uptodate(bh)) {
				2125	ret = 0;
				2126	break;
				2127	}
				2128	if (block_end >= to)
				2129	break;
				2130	}
				2131	block_start = block_end;
				2132	bh = bh->b_this_page;
				2133	} while (bh != head);
				2134
				2135	return ret;
				2136	}
				2137	EXPORT_SYMBOL(block_is_partially_uptodate);
				2138
				2139	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2140	* Generic "read page" function for block devices that have the normal
				2141	* get_block functionality. This is most of the block device filesystems.
				2142	* Reads the page asynchronously --- the unlock_buffer() and
				2143	* set/clear_buffer_uptodate() functions propagate buffer state into the
				2144	* page struct once IO has completed.
				2145	*/
				2146	int block_read_full_page(struct page page, get_block_t get_block)
				2147	{
				2148	struct inode *inode = page->mapping->host;
				2149	sector_t iblock, lblock;
				2150	struct buffer_head bh, head, *arr[MAX_BUF_PER_PAGE];
				2151	unsigned int blocksize;
				2152	int nr, i;
				2153	int fully_mapped = 1;
				2154
Matt Mackall	cd7619d	2005-05-01 08:59:01 -0700	[diff] [blame]	2155	BUG_ON(!PageLocked(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2156	blocksize = 1 << inode->i_blkbits;
				2157	if (!page_has_buffers(page))
				2158	create_empty_buffers(page, blocksize, 0);
				2159	head = page_buffers(page);
				2160
				2161	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2162	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
				2163	bh = head;
				2164	nr = 0;
				2165	i = 0;
				2166
				2167	do {
				2168	if (buffer_uptodate(bh))
				2169	continue;
				2170
				2171	if (!buffer_mapped(bh)) {
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2172	int err = 0;
				2173
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2174	fully_mapped = 0;
				2175	if (iblock < lblock) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2176	WARN_ON(bh->b_size != blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2177	err = get_block(inode, iblock, bh, 0);
				2178	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2179	SetPageError(page);
				2180	}
				2181	if (!buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2182	zero_user(page, i * blocksize, blocksize);
Andrew Morton	c64610b	2005-05-16 21:53:49 -0700	[diff] [blame]	2183	if (!err)
				2184	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2185	continue;
				2186	}
				2187	/*
				2188	* get_block() might have updated the buffer
				2189	* synchronously
				2190	*/
				2191	if (buffer_uptodate(bh))
				2192	continue;
				2193	}
				2194	arr[nr++] = bh;
				2195	} while (i++, iblock++, (bh = bh->b_this_page) != head);
				2196
				2197	if (fully_mapped)
				2198	SetPageMappedToDisk(page);
				2199
				2200	if (!nr) {
				2201	/*
				2202	* All buffers are uptodate - we can set the page uptodate
				2203	* as well. But not if get_block() returned an error.
				2204	*/
				2205	if (!PageError(page))
				2206	SetPageUptodate(page);
				2207	unlock_page(page);
				2208	return 0;
				2209	}
				2210
				2211	/* Stage two: lock the buffers */
				2212	for (i = 0; i < nr; i++) {
				2213	bh = arr[i];
				2214	lock_buffer(bh);
				2215	mark_buffer_async_read(bh);
				2216	}
				2217
				2218	/*
				2219	* Stage 3: start the IO. Check for uptodateness
				2220	* inside the buffer lock in case another process reading
				2221	* the underlying blockdev brought it uptodate (the sct fix).
				2222	*/
				2223	for (i = 0; i < nr; i++) {
				2224	bh = arr[i];
				2225	if (buffer_uptodate(bh))
				2226	end_buffer_async_read(bh, 1);
				2227	else
				2228	submit_bh(READ, bh);
				2229	}
				2230	return 0;
				2231	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2232	EXPORT_SYMBOL(block_read_full_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2233
				2234	/* utility function for filesystems that need to do work on expanding
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2235	* truncates. Uses filesystem pagecache writes to allow the filesystem to
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2236	* deal with the hole.
				2237	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2238	int generic_cont_expand_simple(struct inode *inode, loff_t size)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2239	{
				2240	struct address_space *mapping = inode->i_mapping;
				2241	struct page *page;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2242	void *fsdata;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2243	int err;
				2244
npiggin@suse.de	c08d3b0	2009-08-21 02:35:06 +1000	[diff] [blame]	2245	err = inode_newsize_ok(inode, size);
				2246	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2247	goto out;
				2248
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2249	err = pagecache_write_begin(NULL, mapping, size, 0,
				2250	AOP_FLAG_UNINTERRUPTIBLE\|AOP_FLAG_CONT_EXPAND,
				2251	&page, &fsdata);
				2252	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2253	goto out;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2254
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2255	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
				2256	BUG_ON(err > 0);
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2257
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2258	out:
				2259	return err;
				2260	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2261	EXPORT_SYMBOL(generic_cont_expand_simple);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2262
Adrian Bunk	f1e3af7	2008-04-29 00:59:01 -0700	[diff] [blame]	2263	static int cont_expand_zero(struct file file, struct address_space mapping,
				2264	loff_t pos, loff_t *bytes)
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2265	{
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2266	struct inode *inode = mapping->host;
				2267	unsigned blocksize = 1 << inode->i_blkbits;
				2268	struct page *page;
				2269	void *fsdata;
				2270	pgoff_t index, curidx;
				2271	loff_t curpos;
				2272	unsigned zerofrom, offset, len;
				2273	int err = 0;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2274
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2275	index = pos >> PAGE_CACHE_SHIFT;
				2276	offset = pos & ~PAGE_CACHE_MASK;
				2277
				2278	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
				2279	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2280	if (zerofrom & (blocksize-1)) {
				2281	*bytes \|= (blocksize-1);
				2282	(*bytes)++;
				2283	}
				2284	len = PAGE_CACHE_SIZE - zerofrom;
				2285
				2286	err = pagecache_write_begin(file, mapping, curpos, len,
				2287	AOP_FLAG_UNINTERRUPTIBLE,
				2288	&page, &fsdata);
				2289	if (err)
				2290	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2291	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2292	err = pagecache_write_end(file, mapping, curpos, len, len,
				2293	page, fsdata);
				2294	if (err < 0)
				2295	goto out;
				2296	BUG_ON(err != len);
				2297	err = 0;
OGAWA Hirofumi	061e974	2008-04-28 02:16:28 -0700	[diff] [blame]	2298
				2299	balance_dirty_pages_ratelimited(mapping);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2300	}
				2301
				2302	/* page covers the boundary, find the boundary offset */
				2303	if (index == curidx) {
				2304	zerofrom = curpos & ~PAGE_CACHE_MASK;
				2305	/* if we will expand the thing last block will be filled */
				2306	if (offset <= zerofrom) {
				2307	goto out;
				2308	}
				2309	if (zerofrom & (blocksize-1)) {
				2310	*bytes \|= (blocksize-1);
				2311	(*bytes)++;
				2312	}
				2313	len = offset - zerofrom;
				2314
				2315	err = pagecache_write_begin(file, mapping, curpos, len,
				2316	AOP_FLAG_UNINTERRUPTIBLE,
				2317	&page, &fsdata);
				2318	if (err)
				2319	goto out;
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2320	zero_user(page, zerofrom, len);
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2321	err = pagecache_write_end(file, mapping, curpos, len, len,
				2322	page, fsdata);
				2323	if (err < 0)
				2324	goto out;
				2325	BUG_ON(err != len);
				2326	err = 0;
				2327	}
				2328	out:
				2329	return err;
OGAWA Hirofumi	05eb0b5	2006-01-08 01:02:13 -0800	[diff] [blame]	2330	}
				2331
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2332	/*
				2333	* For moronic filesystems that do not allow holes in file.
				2334	* We may have to extend the file.
				2335	*/
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2336	int cont_write_begin(struct file file, struct address_space mapping,
				2337	loff_t pos, unsigned len, unsigned flags,
				2338	struct page pagep, void fsdata,
				2339	get_block_t get_block, loff_t bytes)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2340	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2341	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2342	unsigned blocksize = 1 << inode->i_blkbits;
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2343	unsigned zerofrom;
				2344	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2345
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2346	err = cont_expand_zero(file, mapping, pos, bytes);
				2347	if (err)
				2348	goto out;
				2349
				2350	zerofrom = *bytes & ~PAGE_CACHE_MASK;
				2351	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
				2352	*bytes \|= (blocksize-1);
				2353	(*bytes)++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2354	}
				2355
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2356	*pagep = NULL;
				2357	err = block_write_begin(file, mapping, pos, len,
				2358	flags, pagep, fsdata, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2359	out:
Nick Piggin	89e1078	2007-10-16 01:25:07 -0700	[diff] [blame]	2360	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2361	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2362	EXPORT_SYMBOL(cont_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2363
				2364	int block_prepare_write(struct page *page, unsigned from, unsigned to,
				2365	get_block_t *get_block)
				2366	{
				2367	struct inode *inode = page->mapping->host;
				2368	int err = __block_prepare_write(inode, page, from, to, get_block);
				2369	if (err)
				2370	ClearPageUptodate(page);
				2371	return err;
				2372	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2373	EXPORT_SYMBOL(block_prepare_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2374
				2375	int block_commit_write(struct page *page, unsigned from, unsigned to)
				2376	{
				2377	struct inode *inode = page->mapping->host;
				2378	__block_commit_write(inode,page,from,to);
				2379	return 0;
				2380	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2381	EXPORT_SYMBOL(block_commit_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2382
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2383	/*
				2384	* block_page_mkwrite() is not allowed to change the file size as it gets
				2385	* called from a page fault handler when a page is first dirtied. Hence we must
				2386	* be careful to check for EOF conditions here. We set the page up correctly
				2387	* for a written page which means we get ENOSPC checking when writing into
				2388	* holes and correct delalloc and unwritten extent mapping on filesystems that
				2389	* support these features.
				2390	*
				2391	* We are not allowed to take the i_mutex here so we have to play games to
				2392	* protect against truncate races as the page could now be beyond EOF. Because
				2393	* vmtruncate() writes the inode size before removing pages, once we have the
				2394	* page lock we can determine safely if the page is beyond EOF. If it is not
				2395	* beyond EOF, then the page is guaranteed safe against truncation until we
				2396	* unlock the page.
				2397	*/
				2398	int
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2399	block_page_mkwrite(struct vm_area_struct vma, struct vm_fault vmf,
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2400	get_block_t get_block)
				2401	{
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2402	struct page *page = vmf->page;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2403	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
				2404	unsigned long end;
				2405	loff_t size;
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2406	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2407
				2408	lock_page(page);
				2409	size = i_size_read(inode);
				2410	if ((page->mapping != inode->i_mapping) \|\|
Nick Piggin	1833633	2007-07-20 00:31:45 -0700	[diff] [blame]	2411	(page_offset(page) > size)) {
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2412	/* page got truncated out from underneath us */
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2413	unlock_page(page);
				2414	goto out;
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2415	}
				2416
				2417	/* page is wholly or partially inside EOF */
				2418	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
				2419	end = size & ~PAGE_CACHE_MASK;
				2420	else
				2421	end = PAGE_CACHE_SIZE;
				2422
				2423	ret = block_prepare_write(page, 0, end, get_block);
				2424	if (!ret)
				2425	ret = block_commit_write(page, 0, end);
				2426
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2427	if (unlikely(ret)) {
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2428	unlock_page(page);
Nick Piggin	56a76f8	2009-03-31 15:23:23 -0700	[diff] [blame]	2429	if (ret == -ENOMEM)
				2430	ret = VM_FAULT_OOM;
				2431	else /* -ENOSPC, -EIO, etc */
				2432	ret = VM_FAULT_SIGBUS;
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2433	} else
				2434	ret = VM_FAULT_LOCKED;
Nick Piggin	c2ec175	2009-03-31 15:23:21 -0700	[diff] [blame]	2435
Nick Piggin	b827e49	2009-04-30 15:08:16 -0700	[diff] [blame]	2436	out:
David Chinner	5417169	2007-07-19 17:39:55 +1000	[diff] [blame]	2437	return ret;
				2438	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2439	EXPORT_SYMBOL(block_page_mkwrite);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2440
				2441	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2442	* nobh_write_begin()'s prereads are special: the buffer_heads are freed
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2443	* immediately, while under the page lock. So it needs a special end_io
				2444	* handler which does not touch the bh after unlocking it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2445	*/
				2446	static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
				2447	{
Dmitry Monakhov	68671f3	2007-10-16 01:24:47 -0700	[diff] [blame]	2448	__end_buffer_read_notouch(bh, uptodate);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2449	}
				2450
				2451	/*
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2452	* Attach the singly-linked list of buffers created by nobh_write_begin, to
				2453	* the page (converting it to circular linked list and taking care of page
				2454	* dirty races).
				2455	*/
				2456	static void attach_nobh_buffers(struct page page, struct buffer_head head)
				2457	{
				2458	struct buffer_head *bh;
				2459
				2460	BUG_ON(!PageLocked(page));
				2461
				2462	spin_lock(&page->mapping->private_lock);
				2463	bh = head;
				2464	do {
				2465	if (PageDirty(page))
				2466	set_buffer_dirty(bh);
				2467	if (!bh->b_this_page)
				2468	bh->b_this_page = head;
				2469	bh = bh->b_this_page;
				2470	} while (bh != head);
				2471	attach_page_buffers(page, head);
				2472	spin_unlock(&page->mapping->private_lock);
				2473	}
				2474
				2475	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2476	* On entry, the page is fully not uptodate.
				2477	* On exit the page is fully uptodate in the areas outside (from,to)
				2478	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2479	int nobh_write_begin(struct file file, struct address_space mapping,
				2480	loff_t pos, unsigned len, unsigned flags,
				2481	struct page pagep, void fsdata,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2482	get_block_t *get_block)
				2483	{
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2484	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2485	const unsigned blkbits = inode->i_blkbits;
				2486	const unsigned blocksize = 1 << blkbits;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2487	struct buffer_head head, bh;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2488	struct page *page;
				2489	pgoff_t index;
				2490	unsigned from, to;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2491	unsigned block_in_page;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2492	unsigned block_start, block_end;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2493	sector_t block_in_file;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2494	int nr_reads = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2495	int ret = 0;
				2496	int is_mapped_to_disk = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2497
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2498	index = pos >> PAGE_CACHE_SHIFT;
				2499	from = pos & (PAGE_CACHE_SIZE - 1);
				2500	to = from + len;
				2501
Nick Piggin	54566b2	2009-01-04 12:00:53 -0800	[diff] [blame]	2502	page = grab_cache_page_write_begin(mapping, index, flags);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2503	if (!page)
				2504	return -ENOMEM;
				2505	*pagep = page;
				2506	*fsdata = NULL;
				2507
				2508	if (page_has_buffers(page)) {
				2509	unlock_page(page);
				2510	page_cache_release(page);
				2511	*pagep = NULL;
				2512	return block_write_begin(file, mapping, pos, len, flags, pagep,
				2513	fsdata, get_block);
				2514	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2515
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2516	if (PageMappedToDisk(page))
				2517	return 0;
				2518
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2519	/*
				2520	* Allocate buffers so that we can keep track of state, and potentially
				2521	* attach them to the page if an error occurs. In the common case of
				2522	* no error, they will just be freed again without ever being attached
				2523	* to the page (which is all OK, because we're under the page lock).
				2524	*
				2525	* Be careful: the buffer linked list is a NULL terminated one, rather
				2526	* than the circular one we're used to.
				2527	*/
				2528	head = alloc_page_buffers(page, blocksize, 0);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2529	if (!head) {
				2530	ret = -ENOMEM;
				2531	goto out_release;
				2532	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2533
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2534	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2535
				2536	/*
				2537	* We loop across all blocks in the page, whether or not they are
				2538	* part of the affected region. This is so we can discover if the
				2539	* page is fully mapped-to-disk.
				2540	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2541	for (block_start = 0, block_in_page = 0, bh = head;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2542	block_start < PAGE_CACHE_SIZE;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2543	block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2544	int create;
				2545
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2546	block_end = block_start + blocksize;
				2547	bh->b_state = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2548	create = 1;
				2549	if (block_start >= to)
				2550	create = 0;
				2551	ret = get_block(inode, block_in_file + block_in_page,
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2552	bh, create);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2553	if (ret)
				2554	goto failed;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2555	if (!buffer_mapped(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2556	is_mapped_to_disk = 0;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2557	if (buffer_new(bh))
				2558	unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
				2559	if (PageUptodate(page)) {
				2560	set_buffer_uptodate(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2561	continue;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2562	}
				2563	if (buffer_new(bh) \|\| !buffer_mapped(bh)) {
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2564	zero_user_segments(page, block_start, from,
				2565	to, block_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2566	continue;
				2567	}
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2568	if (buffer_uptodate(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2569	continue; /* reiserfs does this */
				2570	if (block_start < from \|\| block_end > to) {
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2571	lock_buffer(bh);
				2572	bh->b_end_io = end_buffer_read_nobh;
				2573	submit_bh(READ, bh);
				2574	nr_reads++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2575	}
				2576	}
				2577
				2578	if (nr_reads) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2579	/*
				2580	* The page is locked, so these buffers are protected from
				2581	* any VM or truncate activity. Hence we don't need to care
				2582	* for the buffer_head refcounts.
				2583	*/
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2584	for (bh = head; bh; bh = bh->b_this_page) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2585	wait_on_buffer(bh);
				2586	if (!buffer_uptodate(bh))
				2587	ret = -EIO;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2588	}
				2589	if (ret)
				2590	goto failed;
				2591	}
				2592
				2593	if (is_mapped_to_disk)
				2594	SetPageMappedToDisk(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2595
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2596	fsdata = head; / to be released by nobh_write_end */
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2597
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2598	return 0;
				2599
				2600	failed:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2601	BUG_ON(!ret);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2602	/*
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2603	* Error recovery is a bit difficult. We need to zero out blocks that
				2604	* were newly allocated, and dirty them to ensure they get written out.
				2605	* Buffers need to be attached to the page at this point, otherwise
				2606	* the handling of potential IO errors during writeout would be hard
				2607	* (could try doing synchronous writeout, but what if that fails too?)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2608	*/
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2609	attach_nobh_buffers(page, head);
				2610	page_zero_new_buffers(page, from, to);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2611
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2612	out_release:
				2613	unlock_page(page);
				2614	page_cache_release(page);
				2615	*pagep = NULL;
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2616
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2617	if (pos + len > inode->i_size)
				2618	vmtruncate(inode, inode->i_size);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2619
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2620	return ret;
				2621	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2622	EXPORT_SYMBOL(nobh_write_begin);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2623
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2624	int nobh_write_end(struct file file, struct address_space mapping,
				2625	loff_t pos, unsigned len, unsigned copied,
				2626	struct page page, void fsdata)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2627	{
				2628	struct inode *inode = page->mapping->host;
Nick Piggin	efdc313	2007-10-21 06:57:41 +0200	[diff] [blame]	2629	struct buffer_head *head = fsdata;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2630	struct buffer_head *bh;
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2631	BUG_ON(fsdata != NULL && page_has_buffers(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2632
Dave Kleikamp	d4cf109	2009-02-06 14:59:26 -0600	[diff] [blame]	2633	if (unlikely(copied < len) && head)
Dmitri Monakhov	5b41e74	2008-03-28 14:15:52 -0700	[diff] [blame]	2634	attach_nobh_buffers(page, head);
				2635	if (page_has_buffers(page))
				2636	return generic_write_end(file, mapping, pos, len,
				2637	copied, page, fsdata);
Nick Piggin	a4b0672	2007-10-16 01:24:48 -0700	[diff] [blame]	2638
Nick Piggin	22c8ca7	2007-02-20 13:58:09 -0800	[diff] [blame]	2639	SetPageUptodate(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2640	set_page_dirty(page);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2641	if (pos+copied > inode->i_size) {
				2642	i_size_write(inode, pos+copied);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2643	mark_inode_dirty(inode);
				2644	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2645
				2646	unlock_page(page);
				2647	page_cache_release(page);
				2648
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2649	while (head) {
				2650	bh = head;
				2651	head = head->b_this_page;
				2652	free_buffer_head(bh);
				2653	}
				2654
				2655	return copied;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2656	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2657	EXPORT_SYMBOL(nobh_write_end);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2658
				2659	/*
				2660	* nobh_writepage() - based on block_full_write_page() except
				2661	* that it tries to operate without attaching bufferheads to
				2662	* the page.
				2663	*/
				2664	int nobh_writepage(struct page page, get_block_t get_block,
				2665	struct writeback_control *wbc)
				2666	{
				2667	struct inode * const inode = page->mapping->host;
				2668	loff_t i_size = i_size_read(inode);
				2669	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2670	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2671	int ret;
				2672
				2673	/* Is the page fully inside i_size? */
				2674	if (page->index < end_index)
				2675	goto out;
				2676
				2677	/* Is the page fully outside i_size? (truncate in progress) */
				2678	offset = i_size & (PAGE_CACHE_SIZE-1);
				2679	if (page->index >= end_index+1 \|\| !offset) {
				2680	/*
				2681	* The page may have dirty, unmapped buffers. For example,
				2682	* they may have been added in ext3_writepage(). Make them
				2683	* freeable here, so the page does not leak.
				2684	*/
				2685	#if 0
				2686	/* Not really sure about this - do we need this ? */
				2687	if (page->mapping->a_ops->invalidatepage)
				2688	page->mapping->a_ops->invalidatepage(page, offset);
				2689	#endif
				2690	unlock_page(page);
				2691	return 0; /* don't care */
				2692	}
				2693
				2694	/*
				2695	* The page straddles i_size. It must be zeroed out on each and every
				2696	* writepage invocation because it may be mmapped. "A file is mapped
				2697	* in multiples of the page size. For a file that is not a multiple of
				2698	* the page size, the remaining memory is zeroed when mapped, and
				2699	* writes to that region are not written out to the file."
				2700	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2701	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2702	out:
				2703	ret = mpage_writepage(page, get_block, wbc);
				2704	if (ret == -EAGAIN)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2705	ret = __block_write_full_page(inode, page, get_block, wbc,
				2706	end_buffer_async_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2707	return ret;
				2708	}
				2709	EXPORT_SYMBOL(nobh_writepage);
				2710
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2711	int nobh_truncate_page(struct address_space *mapping,
				2712	loff_t from, get_block_t *get_block)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2713	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2714	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2715	unsigned offset = from & (PAGE_CACHE_SIZE-1);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2716	unsigned blocksize;
				2717	sector_t iblock;
				2718	unsigned length, pos;
				2719	struct inode *inode = mapping->host;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2720	struct page *page;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2721	struct buffer_head map_bh;
				2722	int err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2723
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2724	blocksize = 1 << inode->i_blkbits;
				2725	length = offset & (blocksize - 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2726
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2727	/* Block boundary? Nothing to do */
				2728	if (!length)
				2729	return 0;
				2730
				2731	length = blocksize - length;
				2732	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
				2733
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2734	page = grab_cache_page(mapping, index);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2735	err = -ENOMEM;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2736	if (!page)
				2737	goto out;
				2738
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2739	if (page_has_buffers(page)) {
				2740	has_buffers:
				2741	unlock_page(page);
				2742	page_cache_release(page);
				2743	return block_truncate_page(mapping, from, get_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2744	}
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2745
				2746	/* Find the buffer that contains "offset" */
				2747	pos = blocksize;
				2748	while (offset >= pos) {
				2749	iblock++;
				2750	pos += blocksize;
				2751	}
				2752
Theodore Ts'o	460bcf5	2009-05-12 07:37:56 -0400	[diff] [blame]	2753	map_bh.b_size = blocksize;
				2754	map_bh.b_state = 0;
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2755	err = get_block(inode, iblock, &map_bh, 0);
				2756	if (err)
				2757	goto unlock;
				2758	/* unmapped? It's a hole - nothing to do */
				2759	if (!buffer_mapped(&map_bh))
				2760	goto unlock;
				2761
				2762	/* Ok, it's mapped. Make sure it's up-to-date */
				2763	if (!PageUptodate(page)) {
				2764	err = mapping->a_ops->readpage(NULL, page);
				2765	if (err) {
				2766	page_cache_release(page);
				2767	goto out;
				2768	}
				2769	lock_page(page);
				2770	if (!PageUptodate(page)) {
				2771	err = -EIO;
				2772	goto unlock;
				2773	}
				2774	if (page_has_buffers(page))
				2775	goto has_buffers;
				2776	}
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2777	zero_user(page, offset, length);
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2778	set_page_dirty(page);
				2779	err = 0;
				2780
				2781	unlock:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2782	unlock_page(page);
				2783	page_cache_release(page);
				2784	out:
Nick Piggin	03158cd	2007-10-16 01:25:25 -0700	[diff] [blame]	2785	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2786	}
				2787	EXPORT_SYMBOL(nobh_truncate_page);
				2788
				2789	int block_truncate_page(struct address_space *mapping,
				2790	loff_t from, get_block_t *get_block)
				2791	{
				2792	pgoff_t index = from >> PAGE_CACHE_SHIFT;
				2793	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				2794	unsigned blocksize;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2795	sector_t iblock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2796	unsigned length, pos;
				2797	struct inode *inode = mapping->host;
				2798	struct page *page;
				2799	struct buffer_head *bh;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2800	int err;
				2801
				2802	blocksize = 1 << inode->i_blkbits;
				2803	length = offset & (blocksize - 1);
				2804
				2805	/* Block boundary? Nothing to do */
				2806	if (!length)
				2807	return 0;
				2808
				2809	length = blocksize - length;
Andrew Morton	54b21a7	2006-01-08 01:03:05 -0800	[diff] [blame]	2810	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2811
				2812	page = grab_cache_page(mapping, index);
				2813	err = -ENOMEM;
				2814	if (!page)
				2815	goto out;
				2816
				2817	if (!page_has_buffers(page))
				2818	create_empty_buffers(page, blocksize, 0);
				2819
				2820	/* Find the buffer that contains "offset" */
				2821	bh = page_buffers(page);
				2822	pos = blocksize;
				2823	while (offset >= pos) {
				2824	bh = bh->b_this_page;
				2825	iblock++;
				2826	pos += blocksize;
				2827	}
				2828
				2829	err = 0;
				2830	if (!buffer_mapped(bh)) {
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2831	WARN_ON(bh->b_size != blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2832	err = get_block(inode, iblock, bh, 0);
				2833	if (err)
				2834	goto unlock;
				2835	/* unmapped? It's a hole - nothing to do */
				2836	if (!buffer_mapped(bh))
				2837	goto unlock;
				2838	}
				2839
				2840	/* Ok, it's mapped. Make sure it's up-to-date */
				2841	if (PageUptodate(page))
				2842	set_buffer_uptodate(bh);
				2843
David Chinner	33a266d	2007-02-12 00:51:41 -0800	[diff] [blame]	2844	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2845	err = -EIO;
				2846	ll_rw_block(READ, 1, &bh);
				2847	wait_on_buffer(bh);
				2848	/* Uhhuh. Read error. Complain and punt. */
				2849	if (!buffer_uptodate(bh))
				2850	goto unlock;
				2851	}
				2852
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2853	zero_user(page, offset, length);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2854	mark_buffer_dirty(bh);
				2855	err = 0;
				2856
				2857	unlock:
				2858	unlock_page(page);
				2859	page_cache_release(page);
				2860	out:
				2861	return err;
				2862	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2863	EXPORT_SYMBOL(block_truncate_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2864
				2865	/*
				2866	* The generic ->writepage function for buffer-backed address_spaces
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2867	* this form passes in the end_io handler used to finish the IO.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2868	*/
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2869	int block_write_full_page_endio(struct page page, get_block_t get_block,
				2870	struct writeback_control wbc, bh_end_io_t handler)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2871	{
				2872	struct inode * const inode = page->mapping->host;
				2873	loff_t i_size = i_size_read(inode);
				2874	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
				2875	unsigned offset;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2876
				2877	/* Is the page fully inside i_size? */
				2878	if (page->index < end_index)
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2879	return __block_write_full_page(inode, page, get_block, wbc,
				2880	handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2881
				2882	/* Is the page fully outside i_size? (truncate in progress) */
				2883	offset = i_size & (PAGE_CACHE_SIZE-1);
				2884	if (page->index >= end_index+1 \|\| !offset) {
				2885	/*
				2886	* The page may have dirty, unmapped buffers. For example,
				2887	* they may have been added in ext3_writepage(). Make them
				2888	* freeable here, so the page does not leak.
				2889	*/
Jan Kara	aaa4059	2005-10-30 15:00:16 -0800	[diff] [blame]	2890	do_invalidatepage(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2891	unlock_page(page);
				2892	return 0; /* don't care */
				2893	}
				2894
				2895	/*
				2896	* The page straddles i_size. It must be zeroed out on each and every
Adam Buchbinder	2a61aa4	2009-12-11 16:35:40 -0500	[diff] [blame]	2897	* writepage invocation because it may be mmapped. "A file is mapped
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2898	* in multiples of the page size. For a file that is not a multiple of
				2899	* the page size, the remaining memory is zeroed when mapped, and
				2900	* writes to that region are not written out to the file."
				2901	*/
Christoph Lameter	eebd2aa	2008-02-04 22:28:29 -0800	[diff] [blame]	2902	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2903	return __block_write_full_page(inode, page, get_block, wbc, handler);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2904	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2905	EXPORT_SYMBOL(block_write_full_page_endio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2906
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2907	/*
				2908	* The generic ->writepage function for buffer-backed address_spaces
				2909	*/
				2910	int block_write_full_page(struct page page, get_block_t get_block,
				2911	struct writeback_control *wbc)
				2912	{
				2913	return block_write_full_page_endio(page, get_block, wbc,
				2914	end_buffer_async_write);
				2915	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2916	EXPORT_SYMBOL(block_write_full_page);
Chris Mason	35c80d5	2009-04-15 13:22:38 -0400	[diff] [blame]	2917
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2918	sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
				2919	get_block_t *get_block)
				2920	{
				2921	struct buffer_head tmp;
				2922	struct inode *inode = mapping->host;
				2923	tmp.b_state = 0;
				2924	tmp.b_blocknr = 0;
Badari Pulavarty	b0cf232	2006-03-26 01:38:00 -0800	[diff] [blame]	2925	tmp.b_size = 1 << inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2926	get_block(inode, block, &tmp, 0);
				2927	return tmp.b_blocknr;
				2928	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2929	EXPORT_SYMBOL(generic_block_bmap);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2930
NeilBrown	6712ecf	2007-09-27 12:47:43 +0200	[diff] [blame]	2931	static void end_bio_bh_io_sync(struct bio *bio, int err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2932	{
				2933	struct buffer_head *bh = bio->bi_private;
				2934
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2935	if (err == -EOPNOTSUPP) {
				2936	set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
				2937	set_bit(BH_Eopnotsupp, &bh->b_state);
				2938	}
				2939
Keith Mannthey	08bafc0	2008-11-25 10:24:35 +0100	[diff] [blame]	2940	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
				2941	set_bit(BH_Quiet, &bh->b_state);
				2942
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2943	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
				2944	bio_put(bio);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2945	}
				2946
				2947	int submit_bh(int rw, struct buffer_head * bh)
				2948	{
				2949	struct bio *bio;
				2950	int ret = 0;
				2951
				2952	BUG_ON(!buffer_locked(bh));
				2953	BUG_ON(!buffer_mapped(bh));
				2954	BUG_ON(!bh->b_end_io);
Aneesh Kumar K.V	8fb0e34	2009-05-12 16:22:37 -0400	[diff] [blame]	2955	BUG_ON(buffer_delay(bh));
				2956	BUG_ON(buffer_unwritten(bh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2957
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2958	/*
				2959	* Mask in barrier bit for a write (could be either a WRITE or a
				2960	* WRITE_SYNC
				2961	*/
				2962	if (buffer_ordered(bh) && (rw & WRITE))
				2963	rw \|= WRITE_BARRIER;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2964
				2965	/*
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2966	* Only clear out a write error when rewriting
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2967	*/
Jens Axboe	48fd4f9	2008-08-22 10:00:36 +0200	[diff] [blame]	2968	if (test_set_buffer_req(bh) && (rw & WRITE))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2969	clear_buffer_write_io_error(bh);
				2970
				2971	/*
				2972	* from here on down, it's all bio -- do the initial mapping,
				2973	* submit_bio -> generic_make_request may further map this bio around
				2974	*/
				2975	bio = bio_alloc(GFP_NOIO, 1);
				2976
				2977	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
				2978	bio->bi_bdev = bh->b_bdev;
				2979	bio->bi_io_vec[0].bv_page = bh->b_page;
				2980	bio->bi_io_vec[0].bv_len = bh->b_size;
				2981	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
				2982
				2983	bio->bi_vcnt = 1;
				2984	bio->bi_idx = 0;
				2985	bio->bi_size = bh->b_size;
				2986
				2987	bio->bi_end_io = end_bio_bh_io_sync;
				2988	bio->bi_private = bh;
				2989
				2990	bio_get(bio);
				2991	submit_bio(rw, bio);
				2992
				2993	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2994	ret = -EOPNOTSUPP;
				2995
				2996	bio_put(bio);
				2997	return ret;
				2998	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	2999	EXPORT_SYMBOL(submit_bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3000
				3001	/**
				3002	* ll_rw_block: low-level access to block devices (DEPRECATED)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3003	* @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3004	* @nr: number of &struct buffer_heads in the array
				3005	* @bhs: array of pointers to &struct buffer_head
				3006	*
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3007	* ll_rw_block() takes an array of pointers to &struct buffer_heads, and
				3008	* requests an I/O operation on them, either a %READ or a %WRITE. The third
				3009	* %SWRITE is like %WRITE only we make sure that the current data in buffers
				3010	* are sent to disk. The fourth %READA option is described in the documentation
				3011	* for generic_make_request() which ll_rw_block() calls.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3012	*
				3013	* This function drops any buffer that it cannot get a lock on (with the
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3014	* BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
				3015	* clean when doing a write request, and any buffer that appears to be
				3016	* up-to-date when doing read request. Further it marks as clean buffers that
				3017	* are processed for writing (the buffer cache won't assume that they are
				3018	* actually clean until the buffer gets unlocked).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3019	*
				3020	* ll_rw_block sets b_end_io to simple completion handler that marks
				3021	* the buffer up-to-date (if approriate), unlocks the buffer and wakes
				3022	* any waiters.
				3023	*
				3024	* All of the buffers must be for the same device, and must also be a
				3025	* multiple of the current approved size for the device.
				3026	*/
				3027	void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
				3028	{
				3029	int i;
				3030
				3031	for (i = 0; i < nr; i++) {
				3032	struct buffer_head *bh = bhs[i];
				3033
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3034	if (rw == SWRITE \|\| rw == SWRITE_SYNC \|\| rw == SWRITE_SYNC_PLUG)
Jan Kara	a766223	2005-09-06 15:19:10 -0700	[diff] [blame]	3035	lock_buffer(bh);
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	3036	else if (!trylock_buffer(bh))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3037	continue;
				3038
Jens Axboe	9cf6b72	2009-04-06 14:48:03 +0200	[diff] [blame]	3039	if (rw == WRITE \|\| rw == SWRITE \|\| rw == SWRITE_SYNC \|\|
				3040	rw == SWRITE_SYNC_PLUG) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3041	if (test_clear_buffer_dirty(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3042	bh->b_end_io = end_buffer_write_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3043	get_bh(bh);
Jens Axboe	18ce375	2008-07-01 09:07:34 +0200	[diff] [blame]	3044	if (rw == SWRITE_SYNC)
				3045	submit_bh(WRITE_SYNC, bh);
				3046	else
				3047	submit_bh(WRITE, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3048	continue;
				3049	}
				3050	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3051	if (!buffer_uptodate(bh)) {
akpm@osdl.org	76c3073	2005-04-16 15:24:07 -0700	[diff] [blame]	3052	bh->b_end_io = end_buffer_read_sync;
OGAWA Hirofumi	e60e5c5	2006-02-03 03:04:43 -0800	[diff] [blame]	3053	get_bh(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3054	submit_bh(rw, bh);
				3055	continue;
				3056	}
				3057	}
				3058	unlock_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3059	}
				3060	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3061	EXPORT_SYMBOL(ll_rw_block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3062
				3063	/*
				3064	* For a data-integrity writeout, we need to wait upon any in-progress I/O
				3065	* and then start new I/O and then wait upon it. The caller must have a ref on
				3066	* the buffer_head.
				3067	*/
				3068	int sync_dirty_buffer(struct buffer_head *bh)
				3069	{
				3070	int ret = 0;
				3071
				3072	WARN_ON(atomic_read(&bh->b_count) < 1);
				3073	lock_buffer(bh);
				3074	if (test_clear_buffer_dirty(bh)) {
				3075	get_bh(bh);
				3076	bh->b_end_io = end_buffer_write_sync;
Jens Axboe	1aa2a7c	2009-04-06 14:48:08 +0200	[diff] [blame]	3077	ret = submit_bh(WRITE_SYNC, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3078	wait_on_buffer(bh);
				3079	if (buffer_eopnotsupp(bh)) {
				3080	clear_buffer_eopnotsupp(bh);
				3081	ret = -EOPNOTSUPP;
				3082	}
				3083	if (!ret && !buffer_uptodate(bh))
				3084	ret = -EIO;
				3085	} else {
				3086	unlock_buffer(bh);
				3087	}
				3088	return ret;
				3089	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3090	EXPORT_SYMBOL(sync_dirty_buffer);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3091
				3092	/*
				3093	* try_to_free_buffers() checks if all the buffers on this particular page
				3094	* are unused, and releases them if so.
				3095	*
				3096	* Exclusion against try_to_free_buffers may be obtained by either
				3097	* locking the page or by holding its mapping's private_lock.
				3098	*
				3099	* If the page is dirty but all the buffers are clean then we need to
				3100	* be sure to mark the page clean as well. This is because the page
				3101	* may be against a block device, and a later reattachment of buffers
				3102	* to a dirty page will set all buffers dirty. Which would corrupt
				3103	* filesystem data on the same device.
				3104	*
				3105	* The same applies to regular filesystem pages: if all the buffers are
				3106	* clean then we set the page clean and proceed. To do that, we require
				3107	* total exclusion from __set_page_dirty_buffers(). That is obtained with
				3108	* private_lock.
				3109	*
				3110	* try_to_free_buffers() is non-blocking.
				3111	*/
				3112	static inline int buffer_busy(struct buffer_head *bh)
				3113	{
				3114	return atomic_read(&bh->b_count) \|
				3115	(bh->b_state & ((1 << BH_Dirty) \| (1 << BH_Lock)));
				3116	}
				3117
				3118	static int
				3119	drop_buffers(struct page page, struct buffer_head *buffers_to_free)
				3120	{
				3121	struct buffer_head *head = page_buffers(page);
				3122	struct buffer_head *bh;
				3123
				3124	bh = head;
				3125	do {
akpm@osdl.org	de7d5a3	2005-05-01 08:58:39 -0700	[diff] [blame]	3126	if (buffer_write_io_error(bh) && page->mapping)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3127	set_bit(AS_EIO, &page->mapping->flags);
				3128	if (buffer_busy(bh))
				3129	goto failed;
				3130	bh = bh->b_this_page;
				3131	} while (bh != head);
				3132
				3133	do {
				3134	struct buffer_head *next = bh->b_this_page;
				3135
Jan Kara	535ee2f	2008-02-08 04:21:59 -0800	[diff] [blame]	3136	if (bh->b_assoc_map)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3137	__remove_assoc_queue(bh);
				3138	bh = next;
				3139	} while (bh != head);
				3140	*buffers_to_free = head;
				3141	__clear_page_buffers(page);
				3142	return 1;
				3143	failed:
				3144	return 0;
				3145	}
				3146
				3147	int try_to_free_buffers(struct page *page)
				3148	{
				3149	struct address_space * const mapping = page->mapping;
				3150	struct buffer_head *buffers_to_free = NULL;
				3151	int ret = 0;
				3152
				3153	BUG_ON(!PageLocked(page));
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3154	if (PageWriteback(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3155	return 0;
				3156
				3157	if (mapping == NULL) { /* can this still happen? */
				3158	ret = drop_buffers(page, &buffers_to_free);
				3159	goto out;
				3160	}
				3161
				3162	spin_lock(&mapping->private_lock);
				3163	ret = drop_buffers(page, &buffers_to_free);
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3164
				3165	/*
				3166	* If the filesystem writes its buffers by hand (eg ext3)
				3167	* then we can have clean buffers against a dirty page. We
				3168	* clean the page here; otherwise the VM will never notice
				3169	* that the filesystem did any IO at all.
				3170	*
				3171	* Also, during truncate, discard_buffer will have marked all
				3172	* the page's buffers clean. We discover that here and clean
				3173	* the page also.
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3174	*
				3175	* private_lock must be held over this entire operation in order
				3176	* to synchronise against __set_page_dirty_buffers and prevent the
				3177	* dirty bit from being lost.
Linus Torvalds	ecdfc97	2007-01-26 12:47:06 -0800	[diff] [blame]	3178	*/
				3179	if (ret)
				3180	cancel_dirty_page(page, PAGE_CACHE_SIZE);
Nick Piggin	87df724	2007-01-30 14:36:27 +1100	[diff] [blame]	3181	spin_unlock(&mapping->private_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3182	out:
				3183	if (buffers_to_free) {
				3184	struct buffer_head *bh = buffers_to_free;
				3185
				3186	do {
				3187	struct buffer_head *next = bh->b_this_page;
				3188	free_buffer_head(bh);
				3189	bh = next;
				3190	} while (bh != buffers_to_free);
				3191	}
				3192	return ret;
				3193	}
				3194	EXPORT_SYMBOL(try_to_free_buffers);
				3195
NeilBrown	3978d71	2006-03-26 01:37:17 -0800	[diff] [blame]	3196	void block_sync_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3197	{
				3198	struct address_space *mapping;
				3199
				3200	smp_mb();
				3201	mapping = page_mapping(page);
				3202	if (mapping)
				3203	blk_run_backing_dev(mapping->backing_dev_info, page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3204	}
H Hartley Sweeten	1fe72ea	2009-09-22 16:43:51 -0700	[diff] [blame]	3205	EXPORT_SYMBOL(block_sync_page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3206
				3207	/*
				3208	* There are no bdflush tunables left. But distributions are
				3209	* still running obsolete flush daemons, so we terminate them here.
				3210	*
				3211	* Use of bdflush() is deprecated and will be removed in a future kernel.
Jens Axboe	5b0830c	2009-09-23 19:37:09 +0200	[diff] [blame]	3212	* The `flush-X' kernel threads fully replace bdflush daemons and this call.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3213	*/
Heiko Carstens	bdc480e	2009-01-14 14:14:12 +0100	[diff] [blame]	3214	SYSCALL_DEFINE2(bdflush, int, func, long, data)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3215	{
				3216	static int msg_count;
				3217
				3218	if (!capable(CAP_SYS_ADMIN))
				3219	return -EPERM;
				3220
				3221	if (msg_count < 5) {
				3222	msg_count++;
				3223	printk(KERN_INFO
				3224	"warning: process `%s' used the obsolete bdflush"
				3225	" system call\n", current->comm);
				3226	printk(KERN_INFO "Fix your initscripts?\n");
				3227	}
				3228
				3229	if (func == 1)
				3230	do_exit(0);
				3231	return 0;
				3232	}
				3233
				3234	/*
				3235	* Buffer-head allocation
				3236	*/
Christoph Lameter	e18b890	2006-12-06 20:33:20 -0800	[diff] [blame]	3237	static struct kmem_cache *bh_cachep;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3238
				3239	/*
				3240	* Once the number of bh's in the machine exceeds this level, we start
				3241	* stripping them in writeback.
				3242	*/
				3243	static int max_buffer_heads;
				3244
				3245	int buffer_heads_over_limit;
				3246
				3247	struct bh_accounting {
				3248	int nr; /* Number of live bh's */
				3249	int ratelimit; /* Limit cacheline bouncing */
				3250	};
				3251
				3252	static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
				3253
				3254	static void recalc_bh_state(void)
				3255	{
				3256	int i;
				3257	int tot = 0;
				3258
				3259	if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
				3260	return;
				3261	__get_cpu_var(bh_accounting).ratelimit = 0;
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3262	for_each_online_cpu(i)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3263	tot += per_cpu(bh_accounting, i).nr;
				3264	buffer_heads_over_limit = (tot > max_buffer_heads);
				3265	}
				3266
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	3267	struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3268	{
Richard Kennedy	019b4d1	2010-03-10 15:20:33 -0800	[diff] [blame]	3269	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3270	if (ret) {
Christoph Lameter	a35afb8	2007-05-16 22:10:57 -0700	[diff] [blame]	3271	INIT_LIST_HEAD(&ret->b_assoc_buffers);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3272	get_cpu_var(bh_accounting).nr++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3273	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3274	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3275	}
				3276	return ret;
				3277	}
				3278	EXPORT_SYMBOL(alloc_buffer_head);
				3279
				3280	void free_buffer_head(struct buffer_head *bh)
				3281	{
				3282	BUG_ON(!list_empty(&bh->b_assoc_buffers));
				3283	kmem_cache_free(bh_cachep, bh);
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3284	get_cpu_var(bh_accounting).nr--;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3285	recalc_bh_state();
Coywolf Qi Hunt	736c7b8	2005-09-06 15:18:17 -0700	[diff] [blame]	3286	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3287	}
				3288	EXPORT_SYMBOL(free_buffer_head);
				3289
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3290	static void buffer_exit_cpu(int cpu)
				3291	{
				3292	int i;
				3293	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
				3294
				3295	for (i = 0; i < BH_LRU_SIZE; i++) {
				3296	brelse(b->bhs[i]);
				3297	b->bhs[i] = NULL;
				3298	}
Eric Dumazet	8a14342	2006-03-24 03:18:10 -0800	[diff] [blame]	3299	get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
				3300	per_cpu(bh_accounting, cpu).nr = 0;
				3301	put_cpu_var(bh_accounting);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3302	}
				3303
				3304	static int buffer_cpu_notify(struct notifier_block *self,
				3305	unsigned long action, void *hcpu)
				3306	{
Rafael J. Wysocki	8bb7844	2007-05-09 02:35:10 -0700	[diff] [blame]	3307	if (action == CPU_DEAD \|\| action == CPU_DEAD_FROZEN)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3308	buffer_exit_cpu((unsigned long)hcpu);
				3309	return NOTIFY_OK;
				3310	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3311
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: already up to date, no lock needed */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Re-check under the lock: it may have become up to date meanwhile */
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 1;
	}

	/* Not up to date: return with the buffer still locked */
	return 0;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
				3330
				3331	/**
Randy Dunlap	a6b9191	2008-03-19 17:01:00 -0700	[diff] [blame]	3332	* bh_submit_read - Submit a locked buffer for reading
Aneesh Kumar K.V	389d1b0	2008-01-28 23:58:26 -0500	[diff] [blame]	3333	* @bh: struct buffer_head
				3334	*
				3335	* Returns zero on success and -EIO on error.
				3336	*/
				3337	int bh_submit_read(struct buffer_head *bh)
				3338	{
				3339	BUG_ON(!buffer_locked(bh));
				3340
				3341	if (buffer_uptodate(bh)) {
				3342	unlock_buffer(bh);
				3343	return 0;
				3344	}
				3345
				3346	get_bh(bh);
				3347	bh->b_end_io = end_buffer_read_sync;
				3348	submit_bh(READ, bh);
				3349	wait_on_buffer(bh);
				3350	if (buffer_uptodate(bh))
				3351	return 0;
				3352	return -EIO;
				3353	}
				3354	EXPORT_SYMBOL(bh_submit_read);
				3355
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3356	void __init buffer_init(void)
				3357	{
				3358	int nrpages;
				3359
Christoph Lameter	b98938c	2008-02-04 22:28:36 -0800	[diff] [blame]	3360	bh_cachep = kmem_cache_create("buffer_head",
				3361	sizeof(struct buffer_head), 0,
				3362	(SLAB_RECLAIM_ACCOUNT\|SLAB_PANIC\|
				3363	SLAB_MEM_SPREAD),
Richard Kennedy	019b4d1	2010-03-10 15:20:33 -0800	[diff] [blame]	3364	NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3365
				3366	/*
				3367	* Limit the bh occupancy to 10% of ZONE_NORMAL
				3368	*/
				3369	nrpages = (nr_free_buffer_pages() * 10) / 100;
				3370	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
				3371	hotcpu_notifier(buffer_cpu_notify, 0);
				3372	}