/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes. ie: data writeback. Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002    Andrew Morton
 *              Split out of fs/inode.c
 *              Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
        struct list_head list;
        struct list_head wait_list;
        struct rcu_head rcu_head;

        unsigned long seen;
        atomic_t pending;

        struct super_block *sb;
        unsigned long nr_pages;
        enum writeback_sync_modes sync_mode;

        unsigned long state;
};

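/*
 * A bdi_work is queued on bdi->work_list and seen by every writeback
 * thread of that bdi: bdi_queue_work() stamps ->seen with the bdi's
 * thread mask and sets ->pending to the thread count, each thread that
 * picks the item up clears its bit in ->seen, wb_clear_pending() drops
 * one ->pending reference, and the final reference unlinks and
 * completes the item. The WS_* state bits below implement the
 * handshake that lets a caller wait on an on-stack work item.
 */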
enum {
        WS_USED_B = 0,
        WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
        return test_bit(WS_ONSTACK_B, &work->state);
}

static inline void bdi_work_init(struct bdi_work *work,
                                 struct writeback_control *wbc)
{
        INIT_RCU_HEAD(&work->rcu_head);
        work->sb = wbc->sb;
        work->nr_pages = wbc->nr_to_write;
        work->sync_mode = wbc->sync_mode;
        work->state = WS_USED;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
        return !list_empty(&bdi->work_list);
}

static void bdi_work_clear(struct bdi_work *work)
{
        clear_bit(WS_USED_B, &work->state);
        smp_mb__after_clear_bit();
        wake_up_bit(&work->state, WS_USED_B);
}

static void bdi_work_free(struct rcu_head *head)
{
        struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

        if (!bdi_work_on_stack(work))
                kfree(work);
        else
                bdi_work_clear(work);
}

static void wb_work_complete(struct bdi_work *work)
{
        const enum writeback_sync_modes sync_mode = work->sync_mode;

        /*
         * For allocated work, we can clear the done/seen bit right here.
         * For on-stack work, we need to postpone both the clear and free
         * to after the RCU grace period, since the stack could be invalidated
         * as soon as bdi_work_clear() has done the wakeup.
         */
        if (!bdi_work_on_stack(work))
                bdi_work_clear(work);
        if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
                call_rcu(&work->rcu_head, bdi_work_free);
}

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
        /*
         * The caller has retrieved the work arguments from this work,
         * drop our reference. If this is the last ref, delete and free it.
         */
        if (atomic_dec_and_test(&work->pending)) {
                struct backing_dev_info *bdi = wb->bdi;

                spin_lock(&bdi->wb_lock);
                list_del_rcu(&work->list);
                spin_unlock(&bdi->wb_lock);

                wb_work_complete(work);
        }
}

static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
        if (work) {
                work->seen = bdi->wb_mask;
                BUG_ON(!work->seen);
                atomic_set(&work->pending, bdi->wb_cnt);
                BUG_ON(!bdi->wb_cnt);

                /*
                 * Make sure stores are seen before it appears on the list
                 */
                smp_mb();

                spin_lock(&bdi->wb_lock);
                list_add_tail_rcu(&work->list, &bdi->work_list);
                spin_unlock(&bdi->wb_lock);
        }

        /*
         * If the default thread isn't there, make sure we add it. When
         * it gets created and wakes up, we'll run this work.
         */
        if (unlikely(list_empty_careful(&bdi->wb_list)))
                wake_up_process(default_backing_dev_info.wb.task);
        else {
                struct bdi_writeback *wb = &bdi->wb;

                /*
                 * If we failed allocating the bdi work item, always wake
                 * up the wb thread. As a safety precaution, it will then
                 * flush out everything.
                 */
                if (!wb_has_dirty_io(wb)) {
                        if (work)
                                wb_clear_pending(wb, work);
                } else if (wb->task)
                        wake_up_process(wb->task);
        }
}

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
        wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
                    TASK_UNINTERRUPTIBLE);
}

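/*
 * Allocate a work item for async writeback. GFP_ATOMIC is used so this
 * is safe from contexts that cannot sleep; the allocation may therefore
 * fail, and callers must cope with a NULL return (bdi_queue_work()
 * does, by falling back to waking the wb thread directly).
 */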
static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
{
        struct bdi_work *work;

        work = kmalloc(sizeof(*work), GFP_ATOMIC);
        if (work)
                bdi_work_init(work, wbc);

        return work;
}

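/*
 * Kick off writeback against wbc->bdi. For WB_SYNC_NONE the work item
 * is heap-allocated and we return without waiting; for WB_SYNC_ALL the
 * item lives on the stack and we wait for the wb threads to ack it.
 * A minimal caller sketch (illustrative only, not taken from a real
 * call site):
 *
 *      struct writeback_control wbc = {
 *              .bdi            = bdi,
 *              .sync_mode      = WB_SYNC_NONE,
 *              .nr_to_write    = 1024,
 *      };
 *      bdi_start_writeback(&wbc);
 */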
void bdi_start_writeback(struct writeback_control *wbc)
{
        /*
         * WB_SYNC_NONE is opportunistic writeback. If this allocation fails,
         * bdi_queue_work() will wake up the thread and flush old data. This
         * should ensure some amount of progress in freeing memory.
         */
        if (wbc->sync_mode != WB_SYNC_ALL) {
                struct bdi_work *w = bdi_alloc_work(wbc);

                bdi_queue_work(wbc->bdi, w);
        } else {
                struct bdi_work work;

                bdi_work_init(&work, wbc);
                work.state |= WS_ONSTACK;

                bdi_queue_work(wbc->bdi, &work);
                bdi_wait_on_work_clear(&work);
        }
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list. If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;

                tail = list_entry(wb->b_dirty.next, struct inode, i_list);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
        list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

        list_move(&inode->i_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
        /*
         * Prevent speculative execution through spin_unlock(&inode_lock);
         */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
}

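/*
 * Illustration of the 32-bit problem handled below: with HZ=1000,
 * jiffies wraps roughly every 49.7 days, so an inode stamped shortly
 * before a wrap can compare as dirtied "after" a much more recent
 * timestamp even though it was dirtied long ago.
 */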
static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
        bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
        /*
         * For inodes being constantly redirtied, dirtied_when can get stuck.
         * It _appears_ to be in the future, but is actually in distant past.
         * This test is necessary to prevent such wrapped-around relative times
         * from permanently stopping the whole pdflush writeback.
         */
        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
        return ret;
}

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
                                struct list_head *dispatch_queue,
                                unsigned long *older_than_this)
{
        while (!list_empty(delaying_queue)) {
                struct inode *inode = list_entry(delaying_queue->prev,
                                                struct inode, i_list);
                if (older_than_this &&
                    inode_dirtied_after(inode, *older_than_this))
                        break;
                list_move(&inode->i_list, dispatch_queue);
        }
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
        list_splice_init(&wb->b_more_io, wb->b_io.prev);
        move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

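/*
 * Write the inode itself (as opposed to its data pages) through the
 * filesystem's ->write_inode method, if there is one and the inode
 * hasn't been marked bad.
 */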
static int write_inode(struct inode *inode, int sync)
{
        if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
                return inode->i_sb->s_op->write_inode(inode, sync);
        return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;

        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        do {
                spin_unlock(&inode_lock);
                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
                spin_lock(&inode_lock);
        } while (inode->i_state & I_SYNC);
}

/*
 * Write out an inode's dirty pages. Called under inode_lock. Either the
 * caller has ref on the inode (either via __iget or via syscall against an fd)
 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile. We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 *
 * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        int wait = wbc->sync_mode == WB_SYNC_ALL;
        unsigned dirty;
        int ret;

        if (!atomic_read(&inode->i_count))
                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
        else
                WARN_ON(inode->i_state & I_WILL_FREE);

        if (inode->i_state & I_SYNC) {
                /*
                 * If this inode is locked for writeback and we are not doing
                 * writeback-for-data-integrity, move it to b_more_io so that
                 * writeback can proceed with the other inodes on s_io.
                 *
                 * We'll have another go at writing back this inode when we
                 * completed a full scan of b_io.
                 */
                if (!wait) {
                        requeue_io(inode);
                        return 0;
                }

                /*
                 * It's a data-integrity sync. We must wait.
                 */
                inode_wait_for_writeback(inode);
        }

        BUG_ON(inode->i_state & I_SYNC);

        /* Set I_SYNC, reset I_DIRTY */
        dirty = inode->i_state & I_DIRTY;
        inode->i_state |= I_SYNC;
        inode->i_state &= ~I_DIRTY;

        spin_unlock(&inode_lock);

        ret = do_writepages(mapping, wbc);

        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                int err = write_inode(inode, wait);
                if (ret == 0)
                        ret = err;
        }

        if (wait) {
                int err = filemap_fdatawait(mapping);
                if (ret == 0)
                        ret = err;
        }

        spin_lock(&inode_lock);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
                if (!(inode->i_state & I_DIRTY) &&
                    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                        /*
                         * We didn't write back all the pages. nfs_writepages()
                         * sometimes bales out without doing anything. Redirty
                         * the inode; Move it from b_io onto b_more_io/b_dirty.
                         */
                        /*
                         * akpm: if the caller was the kupdate function we put
                         * this inode at the head of b_dirty so it gets first
                         * consideration. Otherwise, move it to the tail, for
                         * the reasons described there. I'm not really sure
                         * how much sense this makes. Presumably I had a good
                         * reason for doing it this way, and I'd rather not
                         * muck with it at present.
                         */
                        if (wbc->for_kupdate) {
                                /*
                                 * For the kupdate function we move the inode
                                 * to b_more_io so it will get more writeout as
                                 * soon as the queue becomes uncongested.
                                 */
                                inode->i_state |= I_DIRTY_PAGES;
                                if (wbc->nr_to_write <= 0) {
                                        /*
                                         * slice used up: queue for next turn
                                         */
                                        requeue_io(inode);
                                } else {
                                        /*
                                         * somehow blocked: retry later
                                         */
                                        redirty_tail(inode);
                                }
                        } else {
                                /*
                                 * Otherwise fully redirty the inode so that
                                 * other inodes on this superblock will get some
                                 * writeout. Otherwise heavy writing to one
                                 * file would indefinitely suspend writeout of
                                 * all the other files.
                                 */
                                inode->i_state |= I_DIRTY_PAGES;
                                redirty_tail(inode);
                        }
                } else if (inode->i_state & I_DIRTY) {
                        /*
                         * Someone redirtied the inode while we were writing
                         * back the pages.
                         */
                        redirty_tail(inode);
                } else if (atomic_read(&inode->i_count)) {
                        /*
                         * The inode is clean, inuse
                         */
                        list_move(&inode->i_list, &inode_in_use);
                } else {
                        /*
                         * The inode is clean, unused
                         */
                        list_move(&inode->i_list, &inode_unused);
                }
        }
        inode_sync_complete(inode);
        return ret;
}

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
                                struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        /*
         * Caller must already hold the ref for this
         */
        if (wbc->sync_mode == WB_SYNC_ALL) {
                WARN_ON(!rwsem_is_locked(&sb->s_umount));
                return 0;
        }

        spin_lock(&sb_lock);
        sb->s_count++;
        if (down_read_trylock(&sb->s_umount)) {
                if (sb->s_root) {
                        spin_unlock(&sb_lock);
                        return 0;
                }
                /*
                 * umounted, drop rwsem again and fall through to failure
                 */
                up_read(&sb->s_umount);
        }

        sb->s_count--;
        spin_unlock(&sb_lock);
        return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
                                   struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (wbc->sync_mode == WB_SYNC_ALL)
                return;

        up_read(&sb->s_umount);
        put_super(sb);
}

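/*
 * Write back the expired dirty inodes parked on wb->b_io. If wbc->sb
 * is set, inodes belonging to other superblocks are redirtied and
 * skipped. wbc->nr_to_write is decremented by the lower layers as
 * pages are submitted, and the walk stops once it reaches zero.
 */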
static void writeback_inodes_wb(struct bdi_writeback *wb,
                                struct writeback_control *wbc)
{
        struct super_block *sb = wbc->sb;
        const int is_blkdev_sb = sb_is_blkdev_sb(sb);
        const unsigned long start = jiffies;    /* livelock avoidance */

        spin_lock(&inode_lock);

        if (!wbc->for_kupdate || list_empty(&wb->b_io))
                queue_io(wb, wbc->older_than_this);

        while (!list_empty(&wb->b_io)) {
                struct inode *inode = list_entry(wb->b_io.prev,
                                                struct inode, i_list);
                long pages_skipped;

                /*
                 * super block given and doesn't match, skip this inode
                 */
                if (sb && sb != inode->i_sb) {
                        redirty_tail(inode);
                        continue;
                }

                if (!bdi_cap_writeback_dirty(wb->bdi)) {
                        redirty_tail(inode);
                        if (is_blkdev_sb) {
                                /*
                                 * Dirty memory-backed blockdev: the ramdisk
                                 * driver does this. Skip just this inode
                                 */
                                continue;
                        }
                        /*
                         * Dirty memory-backed inode against a filesystem other
                         * than the kernel-internal bdev filesystem. Skip the
                         * entire superblock.
                         */
                        break;
                }

                if (inode->i_state & (I_NEW | I_WILL_FREE)) {
                        requeue_io(inode);
                        continue;
                }

                if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
                        wbc->encountered_congestion = 1;
                        if (!is_blkdev_sb)
                                break;          /* Skip a congested fs */
                        requeue_io(inode);
                        continue;               /* Skip a congested blockdev */
                }

                /*
                 * Was this inode dirtied after sync_sb_inodes was called?
                 * This keeps sync from extra jobs and livelock.
                 */
                if (inode_dirtied_after(inode, start))
                        break;

                if (pin_sb_for_writeback(wbc, inode)) {
                        requeue_io(inode);
                        continue;
                }

                BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
                __iget(inode);
                pages_skipped = wbc->pages_skipped;
                writeback_single_inode(inode, wbc);
                unpin_sb_for_writeback(wbc, inode);
                if (wbc->pages_skipped != pages_skipped) {
                        /*
                         * writeback is not making progress due to locked
                         * buffers. Skip this inode for now.
                         */
                        redirty_tail(inode);
                }
                spin_unlock(&inode_lock);
                iput(inode);
                cond_resched();
                spin_lock(&inode_lock);
                if (wbc->nr_to_write <= 0) {
                        wbc->more_io = 1;
                        break;
                }
                if (!list_empty(&wb->b_more_io))
                        wbc->more_io = 1;
        }

        spin_unlock(&inode_lock);
        /* Leave any unwritten inodes on b_io */
}

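/*
 * Convenience wrapper: write back inodes using the default writeback
 * instance of the bdi named in @wbc.
 */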
void writeback_inodes_wbc(struct writeback_control *wbc)
{
        struct backing_dev_info *bdi = wbc->bdi;

        writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation. We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode. Also, the code reevaluates
 * the dirty limits each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES     1024

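/*
 * Returns true if the global count of dirty and unstable-NFS pages is
 * at or above the background writeback threshold.
 */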
static inline bool over_bground_thresh(void)
{
        unsigned long background_thresh, dirty_thresh;

        get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

        return (global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space. So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval. But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write. So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
                         struct super_block *sb,
                         enum writeback_sync_modes sync_mode, int for_kupdate)
{
        struct writeback_control wbc = {
                .bdi                    = wb->bdi,
                .sb                     = sb,
                .sync_mode              = sync_mode,
                .older_than_this        = NULL,
                .for_kupdate            = for_kupdate,
                .range_cyclic           = 1,
        };
        unsigned long oldest_jif;
        long wrote = 0;

        if (wbc.for_kupdate) {
                wbc.older_than_this = &oldest_jif;
                oldest_jif = jiffies -
                                msecs_to_jiffies(dirty_expire_interval * 10);
        }

        for (;;) {
                /*
                 * Don't flush anything for non-integrity writeback where
                 * no nr_pages was given
                 */
                if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
                        break;

                /*
                 * If no specific pages were given and this is just a
                 * periodic background writeout and we are below the
                 * background dirty threshold, don't do anything
                 */
                if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
                        break;

                wbc.more_io = 0;
                wbc.encountered_congestion = 0;
                wbc.nr_to_write = MAX_WRITEBACK_PAGES;
                wbc.pages_skipped = 0;
                writeback_inodes_wb(wb, &wbc);
                nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
                wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

                /*
                 * If we ran out of stuff to write, bail unless more_io got set
                 */
                if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
                        if (wbc.more_io && !wbc.for_kupdate)
                                continue;
                        break;
                }
        }

        return wrote;
}

/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
                                           struct bdi_writeback *wb)
{
        struct bdi_work *work, *ret = NULL;

        rcu_read_lock();

        list_for_each_entry_rcu(work, &bdi->work_list, list) {
                if (!test_and_clear_bit(wb->nr, &work->seen))
                        continue;

                ret = work;
                break;
        }

        rcu_read_unlock();
        return ret;
}

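/*
 * kupdated-style periodic flush: at most once per
 * dirty_writeback_interval, kick off writeback of data that has been
 * dirty longer than dirty_expire_interval (the cutoff is applied in
 * wb_writeback() via the for_kupdate path). Both sysctls are in
 * centiseconds, hence the "* 10" conversion to milliseconds.
 */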
static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        unsigned long expired;
        long nr_pages;

        expired = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, expired))
                return 0;

        wb->last_old_flush = jiffies;
        nr_pages = global_page_state(NR_FILE_DIRTY) +
                        global_page_state(NR_UNSTABLE_NFS) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);

        if (nr_pages)
                return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);

        return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
        struct backing_dev_info *bdi = wb->bdi;
        struct bdi_work *work;
        long nr_pages, wrote = 0;

        while ((work = get_next_work_item(bdi, wb)) != NULL) {
                enum writeback_sync_modes sync_mode;

                nr_pages = work->nr_pages;

                /*
                 * Override sync mode, in case we must wait for completion
                 */
                if (force_wait)
                        work->sync_mode = sync_mode = WB_SYNC_ALL;
                else
                        sync_mode = work->sync_mode;

                /*
                 * If this isn't a data integrity operation, just notify
                 * that we have seen this work and we are now starting it.
                 */
                if (sync_mode == WB_SYNC_NONE)
                        wb_clear_pending(wb, work);

                wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);

                /*
                 * This is a data integrity writeback, so only do the
                 * notification when we have completed the work.
                 */
                if (sync_mode == WB_SYNC_ALL)
                        wb_clear_pending(wb, work);
        }

        /*
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);

        return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
        unsigned long last_active = jiffies;
        unsigned long wait_jiffies = -1UL;
        long pages_written;

        while (!kthread_should_stop()) {
                pages_written = wb_do_writeback(wb, 0);

                if (pages_written)
                        last_active = jiffies;
                else if (wait_jiffies != -1UL) {
                        unsigned long max_idle;

                        /*
                         * Longest period of inactivity that we tolerate. If we
                         * see dirty data again later, the task will get
                         * recreated automatically.
                         */
                        max_idle = max(5UL * 60 * HZ, wait_jiffies);
                        if (time_after(jiffies, max_idle + last_active))
                                break;
                }

                wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(wait_jiffies);
                try_to_freeze();
        }

        return 0;
}

/*
 * Schedule writeback for all backing devices. Expensive! If this is a data
 * integrity operation, writeback will be complete when this returns. If
 * we are simply called for WB_SYNC_NONE, then writeback will merely be
 * scheduled to run.
 */
static void bdi_writeback_all(struct writeback_control *wbc)
{
        const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
        struct backing_dev_info *bdi;
        struct bdi_work *work;
        LIST_HEAD(list);

restart:
        spin_lock(&bdi_lock);

        list_for_each_entry(bdi, &bdi_list, bdi_list) {
                struct bdi_work *work;

                if (!bdi_has_dirty_io(bdi))
                        continue;

                /*
                 * If work allocation fails, do the writes inline. We drop
                 * the lock and restart the list writeout. This should be OK,
                 * since this happens rarely and because the writeout should
                 * eventually make more free memory available.
                 */
                work = bdi_alloc_work(wbc);
                if (!work) {
                        struct writeback_control __wbc;

                        /*
                         * Not a data integrity writeout, just continue
                         */
                        if (!must_wait)
                                continue;

                        spin_unlock(&bdi_lock);
                        __wbc = *wbc;
                        __wbc.bdi = bdi;
                        writeback_inodes_wbc(&__wbc);
                        goto restart;
                }
                if (must_wait)
                        list_add_tail(&work->wait_list, &list);

                bdi_queue_work(bdi, work);
        }

        spin_unlock(&bdi_lock);

        /*
         * If this is for WB_SYNC_ALL, wait for pending work to complete
         * before returning.
         */
        while (!list_empty(&list)) {
                work = list_entry(list.next, struct bdi_work, wait_list);
                list_del(&work->wait_list);
                bdi_wait_on_work_clear(work);
                call_rcu(&work->rcu_head, bdi_work_free);
        }
}

/*
 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
        struct writeback_control wbc = {
                .sync_mode      = WB_SYNC_NONE,
                .older_than_this = NULL,
                .range_cyclic   = 1,
        };

        if (nr_pages == 0)
                nr_pages = global_page_state(NR_FILE_DIRTY) +
                                global_page_state(NR_UNSTABLE_NFS);
        wbc.nr_to_write = nr_pages;
        bdi_writeback_all(&wbc);
}

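/*
 * When the block_dump sysctl is enabled, log to the kernel log which
 * task dirtied which inode.
 */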
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
        if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
                struct dentry *dentry;
                const char *name = "?";

                dentry = d_find_alias(inode);
                if (dentry) {
                        spin_lock(&dentry->d_lock);
                        name = (const char *) dentry->d_name.name;
                }
                printk(KERN_DEBUG
                       "%s(%d): dirtied inode %lu (%s) on %s\n",
                       current->comm, task_pid_nr(current), inode->i_ino,
                       name, inode->i_sb->s_id);
                if (dentry) {
                        spin_unlock(&dentry->d_lock);
                        dput(dentry);
                }
        }
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (e.g. I_DIRTY_SYNC)
 *
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;

        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
         * dirty the inode itself
         */
        if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode);
        }

        /*
         * make sure that changes are seen by all cpus before we test i_state
         * -- mikulas
         */
        smp_mb();

        /* avoid the locking if we can */
        if ((inode->i_state & flags) == flags)
                return;

        if (unlikely(block_dump))
                block_dump___mark_inode_dirty(inode);

        spin_lock(&inode_lock);
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;

                inode->i_state |= flags;

                /*
                 * If the inode is being synced, just update its dirty state.
                 * The unlocker will place the inode on the appropriate
                 * superblock list, based upon its state.
                 */
                if (inode->i_state & I_SYNC)
                        goto out;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list. Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (hlist_unhashed(&inode->i_hash))
                                goto out;
                }
                if (inode->i_state & (I_FREEING|I_CLEAR))
                        goto out;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
                        struct backing_dev_info *bdi = wb->bdi;

                        if (bdi_cap_writeback_dirty(bdi) &&
                            !test_bit(BDI_registered, &bdi->state)) {
                                WARN_ON(1);
                                printk(KERN_ERR "bdi-%s not registered\n",
                                       bdi->name);
                        }

                        inode->dirtied_when = jiffies;
                        list_move(&inode->i_list, &wb->b_dirty);
                }
        }
out:
        spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io. They are moved back onto
 * bdi->b_dirty as they are selected for writing. This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct writeback_control *wbc)
{
        struct inode *inode, *old_inode = NULL;

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));

        spin_lock(&inode_lock);

        /*
         * Data integrity sync. Must wait for all pages under writeback,
         * because there may have been pages dirtied before our sync
         * call, but which had writeout started before we write it out.
         * In which case, the inode may not be on the dirty list, but
         * we still have to wait for that writeout.
         */
        list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
                struct address_space *mapping;

                if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
                        continue;
                mapping = inode->i_mapping;
                if (mapping->nrpages == 0)
                        continue;
                __iget(inode);
                spin_unlock(&inode_lock);
                /*
                 * We hold a reference to 'inode' so it couldn't have
                 * been removed from s_inodes list while we dropped the
                 * inode_lock. We cannot iput the inode now as we can
                 * be holding the last reference and we cannot iput it
                 * under inode_lock. So we keep the reference and iput
                 * it later.
                 */
                iput(old_inode);
                old_inode = inode;

                filemap_fdatawait(mapping);

                cond_resched();

                spin_lock(&inode_lock);
        }
        spin_unlock(&inode_lock);
        iput(old_inode);
}

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO. The number of pages submitted is
 * returned.
 */
long writeback_inodes_sb(struct super_block *sb)
{
        struct writeback_control wbc = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .range_start    = 0,
                .range_end      = LLONG_MAX,
        };
        unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
        unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
        long nr_to_write;

        nr_to_write = nr_dirty + nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);

        wbc.nr_to_write = nr_to_write;
        bdi_writeback_all(&wbc);
        return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block. The number of pages synced is returned.
 */
long sync_inodes_sb(struct super_block *sb)
{
        struct writeback_control wbc = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
                .range_start    = 0,
                .range_end      = LLONG_MAX,
        };
        long nr_to_write = LONG_MAX; /* doesn't actually matter */

        wbc.nr_to_write = nr_to_write;
        bdi_writeback_all(&wbc);
        wait_sb_inodes(&wbc);
        return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
        int ret;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
                .range_start = 0,
                .range_end = LLONG_MAX,
        };

        if (!mapping_cap_writeback_dirty(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();
        spin_lock(&inode_lock);
        ret = writeback_single_inode(inode, &wbc);
        spin_unlock(&inode_lock);
        if (sync)
                inode_sync_wait(inode);
        return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
        int ret;

        spin_lock(&inode_lock);
        ret = writeback_single_inode(inode, wbc);
        spin_unlock(&inode_lock);
        return ret;
}
EXPORT_SYMBOL(sync_inode);