Blame - mm/backing-dev.c - SHIFTPHONES/kernel/common

blob: 4c9386c98ec18c518158b69905091a4715c4cc08 [file] [log] [blame]

Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	1
				2	#include <linux/wait.h>
				3	#include <linux/backing-dev.h>
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	4	#include <linux/kthread.h>
				5	#include <linux/freezer.h>
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	6	#include <linux/fs.h>
Jens Axboe	2616015	2009-03-17 09:35:06 +0100	[diff] [blame]	7	#include <linux/pagemap.h>
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	8	#include <linux/mm.h>
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	9	#include <linux/sched.h>
				10	#include <linux/module.h>
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	11	#include <linux/writeback.h>
				12	#include <linux/device.h>
Dave Chinner	455b286	2010-07-07 13:24:06 +1000	[diff] [blame]	13	#include <trace/events/writeback.h>
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	14
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	15	static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
				16
Jörn Engel	5129a46	2010-04-25 08:54:42 +0200	[diff] [blame]	17	struct backing_dev_info noop_backing_dev_info = {
				18	.name = "noop",
Jan Kara	976e48f	2010-09-21 11:48:55 +0200	[diff] [blame]	19	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
Jörn Engel	5129a46	2010-04-25 08:54:42 +0200	[diff] [blame]	20	};
Tejun Heo	a212b10	2015-05-22 17:13:33 -0400	[diff] [blame]	21	EXPORT_SYMBOL_GPL(noop_backing_dev_info);
Jörn Engel	5129a46	2010-04-25 08:54:42 +0200	[diff] [blame]	22
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	23	static struct class *bdi_class;
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	24
				25	/*
Tejun Heo	181387d	2013-04-01 19:08:06 -0700	[diff] [blame]	26	* bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	27	* locking.
				28	*/
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	29	DEFINE_SPINLOCK(bdi_lock);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	30	LIST_HEAD(bdi_list);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	31
Tejun Heo	839a8e8	2013-04-01 19:08:06 -0700	[diff] [blame]	32	/* bdi_wq serves all asynchronous writeback tasks */
				33	struct workqueue_struct *bdi_wq;
				34
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	35	#ifdef CONFIG_DEBUG_FS
				36	#include <linux/debugfs.h>
				37	#include <linux/seq_file.h>
				38
				39	static struct dentry *bdi_debug_root;
				40
				41	static void bdi_debug_init(void)
				42	{
				43	bdi_debug_root = debugfs_create_dir("bdi", NULL);
				44	}
				45
				46	static int bdi_debug_stats_show(struct seq_file m, void v)
				47	{
				48	struct backing_dev_info *bdi = m->private;
Christoph Hellwig	c1955ce	2010-06-19 23:08:06 +0200	[diff] [blame]	49	struct bdi_writeback *wb = &bdi->wb;
David Rientjes	364aeb2	2009-01-06 14:39:29 -0800	[diff] [blame]	50	unsigned long background_thresh;
				51	unsigned long dirty_thresh;
				52	unsigned long bdi_thresh;
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	53	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
Jens Axboe	f09b00d	2009-05-25 09:08:21 +0200	[diff] [blame]	54	struct inode *inode;
				55
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	56	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
Christoph Hellwig	f758eea	2011-04-21 18:19:44 -0600	[diff] [blame]	57	spin_lock(&wb->list_lock);
Nick Piggin	7ccf19a	2010-10-21 11:49:30 +1100	[diff] [blame]	58	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
Christoph Hellwig	c1955ce	2010-06-19 23:08:06 +0200	[diff] [blame]	59	nr_dirty++;
Nick Piggin	7ccf19a	2010-10-21 11:49:30 +1100	[diff] [blame]	60	list_for_each_entry(inode, &wb->b_io, i_wb_list)
Christoph Hellwig	c1955ce	2010-06-19 23:08:06 +0200	[diff] [blame]	61	nr_io++;
Nick Piggin	7ccf19a	2010-10-21 11:49:30 +1100	[diff] [blame]	62	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
Christoph Hellwig	c1955ce	2010-06-19 23:08:06 +0200	[diff] [blame]	63	nr_more_io++;
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	64	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
				65	if (inode->i_state & I_DIRTY_TIME)
				66	nr_dirty_time++;
Christoph Hellwig	f758eea	2011-04-21 18:19:44 -0600	[diff] [blame]	67	spin_unlock(&wb->list_lock);
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	68
Wu Fengguang	16c4042	2010-08-11 14:17:39 -0700	[diff] [blame]	69	global_dirty_limits(&background_thresh, &dirty_thresh);
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	70	bdi_thresh = wb_dirty_limit(wb, dirty_thresh);
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	71
				72	#define K(x) ((x) << (PAGE_SHIFT - 10))
				73	seq_printf(m,
Wu Fengguang	00821b0	2010-08-29 11:28:45 -0600	[diff] [blame]	74	"BdiWriteback: %10lu kB\n"
				75	"BdiReclaimable: %10lu kB\n"
				76	"BdiDirtyThresh: %10lu kB\n"
				77	"DirtyThresh: %10lu kB\n"
				78	"BackgroundThresh: %10lu kB\n"
Wu Fengguang	c8e28ce	2011-01-23 10:07:47 -0600	[diff] [blame]	79	"BdiDirtied: %10lu kB\n"
Wu Fengguang	00821b0	2010-08-29 11:28:45 -0600	[diff] [blame]	80	"BdiWritten: %10lu kB\n"
				81	"BdiWriteBandwidth: %10lu kBps\n"
				82	"b_dirty: %10lu\n"
				83	"b_io: %10lu\n"
				84	"b_more_io: %10lu\n"
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	85	"b_dirty_time: %10lu\n"
Wu Fengguang	00821b0	2010-08-29 11:28:45 -0600	[diff] [blame]	86	"bdi_list: %10u\n"
				87	"state: %10lx\n",
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	88	(unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
				89	(unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
Jan Kara	f7d2b1e	2010-12-08 22:44:24 -0600	[diff] [blame]	90	K(bdi_thresh),
				91	K(dirty_thresh),
				92	K(background_thresh),
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	93	(unsigned long) K(wb_stat(wb, WB_DIRTIED)),
				94	(unsigned long) K(wb_stat(wb, WB_WRITTEN)),
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	95	(unsigned long) K(wb->write_bandwidth),
Jan Kara	f7d2b1e	2010-12-08 22:44:24 -0600	[diff] [blame]	96	nr_dirty,
				97	nr_io,
				98	nr_more_io,
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	99	nr_dirty_time,
Tejun Heo	4452226	2015-05-22 17:13:26 -0400	[diff] [blame]	100	!list_empty(&bdi->bdi_list), bdi->wb.state);
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	101	#undef K
				102
				103	return 0;
				104	}
				105
				106	static int bdi_debug_stats_open(struct inode inode, struct file file)
				107	{
				108	return single_open(file, bdi_debug_stats_show, inode->i_private);
				109	}
				110
				111	static const struct file_operations bdi_debug_stats_fops = {
				112	.open = bdi_debug_stats_open,
				113	.read = seq_read,
				114	.llseek = seq_lseek,
				115	.release = single_release,
				116	};
				117
				118	static void bdi_debug_register(struct backing_dev_info bdi, const char name)
				119	{
				120	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
				121	bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
				122	bdi, &bdi_debug_stats_fops);
				123	}
				124
				125	static void bdi_debug_unregister(struct backing_dev_info *bdi)
				126	{
				127	debugfs_remove(bdi->debug_stats);
				128	debugfs_remove(bdi->debug_dir);
				129	}
				130	#else
				131	static inline void bdi_debug_init(void)
				132	{
				133	}
				134	static inline void bdi_debug_register(struct backing_dev_info *bdi,
				135	const char *name)
				136	{
				137	}
				138	static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
				139	{
				140	}
				141	#endif
				142
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	143	static ssize_t read_ahead_kb_store(struct device *dev,
				144	struct device_attribute *attr,
				145	const char *buf, size_t count)
				146	{
				147	struct backing_dev_info *bdi = dev_get_drvdata(dev);
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	148	unsigned long read_ahead_kb;
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	149	ssize_t ret;
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	150
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	151	ret = kstrtoul(buf, 10, &read_ahead_kb);
				152	if (ret < 0)
				153	return ret;
				154
				155	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);
				156
				157	return count;
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	158	}
				159
				160	#define K(pages) ((pages) << (PAGE_SHIFT - 10))
				161
				162	#define BDI_SHOW(name, expr) \
				163	static ssize_t name##_show(struct device *dev, \
				164	struct device_attribute attr, char page) \
				165	{ \
				166	struct backing_dev_info *bdi = dev_get_drvdata(dev); \
				167	\
				168	return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
Greg Kroah-Hartman	d9e1241	2013-07-24 15:05:26 -0700	[diff] [blame]	169	} \
				170	static DEVICE_ATTR_RW(name);
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	171
				172	BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
				173
Peter Zijlstra	189d3c4	2008-04-30 00:54:35 -0700	[diff] [blame]	174	static ssize_t min_ratio_store(struct device *dev,
				175	struct device_attribute attr, const char buf, size_t count)
				176	{
				177	struct backing_dev_info *bdi = dev_get_drvdata(dev);
Peter Zijlstra	189d3c4	2008-04-30 00:54:35 -0700	[diff] [blame]	178	unsigned int ratio;
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	179	ssize_t ret;
Peter Zijlstra	189d3c4	2008-04-30 00:54:35 -0700	[diff] [blame]	180
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	181	ret = kstrtouint(buf, 10, &ratio);
				182	if (ret < 0)
				183	return ret;
				184
				185	ret = bdi_set_min_ratio(bdi, ratio);
				186	if (!ret)
				187	ret = count;
				188
Peter Zijlstra	189d3c4	2008-04-30 00:54:35 -0700	[diff] [blame]	189	return ret;
				190	}
				191	BDI_SHOW(min_ratio, bdi->min_ratio)
				192
Peter Zijlstra	a42dde0	2008-04-30 00:54:36 -0700	[diff] [blame]	193	static ssize_t max_ratio_store(struct device *dev,
				194	struct device_attribute attr, const char buf, size_t count)
				195	{
				196	struct backing_dev_info *bdi = dev_get_drvdata(dev);
Peter Zijlstra	a42dde0	2008-04-30 00:54:36 -0700	[diff] [blame]	197	unsigned int ratio;
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	198	ssize_t ret;
Peter Zijlstra	a42dde0	2008-04-30 00:54:36 -0700	[diff] [blame]	199
Namjae Jeon	7034ed1	2012-08-25 16:57:27 +0800	[diff] [blame]	200	ret = kstrtouint(buf, 10, &ratio);
				201	if (ret < 0)
				202	return ret;
				203
				204	ret = bdi_set_max_ratio(bdi, ratio);
				205	if (!ret)
				206	ret = count;
				207
Peter Zijlstra	a42dde0	2008-04-30 00:54:36 -0700	[diff] [blame]	208	return ret;
				209	}
				210	BDI_SHOW(max_ratio, bdi->max_ratio)
				211
Darrick J. Wong	7d311cd	2013-02-21 16:42:48 -0800	[diff] [blame]	212	static ssize_t stable_pages_required_show(struct device *dev,
				213	struct device_attribute *attr,
				214	char *page)
				215	{
				216	struct backing_dev_info *bdi = dev_get_drvdata(dev);
				217
				218	return snprintf(page, PAGE_SIZE-1, "%d\n",
				219	bdi_cap_stable_pages_required(bdi) ? 1 : 0);
				220	}
Greg Kroah-Hartman	d9e1241	2013-07-24 15:05:26 -0700	[diff] [blame]	221	static DEVICE_ATTR_RO(stable_pages_required);
Darrick J. Wong	7d311cd	2013-02-21 16:42:48 -0800	[diff] [blame]	222
Greg Kroah-Hartman	d9e1241	2013-07-24 15:05:26 -0700	[diff] [blame]	223	static struct attribute *bdi_dev_attrs[] = {
				224	&dev_attr_read_ahead_kb.attr,
				225	&dev_attr_min_ratio.attr,
				226	&dev_attr_max_ratio.attr,
				227	&dev_attr_stable_pages_required.attr,
				228	NULL,
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	229	};
Greg Kroah-Hartman	d9e1241	2013-07-24 15:05:26 -0700	[diff] [blame]	230	ATTRIBUTE_GROUPS(bdi_dev);
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	231
				232	static __init int bdi_class_init(void)
				233	{
				234	bdi_class = class_create(THIS_MODULE, "bdi");
Anton Blanchard	1442145	2010-04-02 09:46:55 +0200	[diff] [blame]	235	if (IS_ERR(bdi_class))
				236	return PTR_ERR(bdi_class);
				237
Greg Kroah-Hartman	d9e1241	2013-07-24 15:05:26 -0700	[diff] [blame]	238	bdi_class->dev_groups = bdi_dev_groups;
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	239	bdi_debug_init();
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	240	return 0;
				241	}
Miklos Szeredi	76f1418	2008-04-30 00:54:36 -0700	[diff] [blame]	242	postcore_initcall(bdi_class_init);
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	243
Jens Axboe	2616015	2009-03-17 09:35:06 +0100	[diff] [blame]	244	static int __init default_bdi_init(void)
				245	{
				246	int err;
				247
Tejun Heo	839a8e8	2013-04-01 19:08:06 -0700	[diff] [blame]	248	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM \| WQ_FREEZABLE \|
Tejun Heo	b5c872d	2013-04-01 19:08:06 -0700	[diff] [blame]	249	WQ_UNBOUND \| WQ_SYSFS, 0);
Tejun Heo	839a8e8	2013-04-01 19:08:06 -0700	[diff] [blame]	250	if (!bdi_wq)
				251	return -ENOMEM;
				252
Jan Kara	976e48f	2010-09-21 11:48:55 +0200	[diff] [blame]	253	err = bdi_init(&noop_backing_dev_info);
Jens Axboe	2616015	2009-03-17 09:35:06 +0100	[diff] [blame]	254
				255	return err;
				256	}
				257	subsys_initcall(default_bdi_init);
				258
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	259	int bdi_has_dirty_io(struct backing_dev_info *bdi)
				260	{
				261	return wb_has_dirty_io(&bdi->wb);
				262	}
				263
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	264	/*
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	265	* This function is used when the first inode for this wb is marked dirty. It
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	266	* wakes-up the corresponding bdi thread which should then take care of the
				267	* periodic background write-out of dirty inodes. Since the write-out would
				268	* starts only 'dirty_writeback_interval' centisecs from now anyway, we just
				269	* set up a timer which wakes the bdi thread up later.
				270	*
				271	* Note, we wouldn't bother setting up the timer, but this function is on the
				272	* fast-path (used by '__mark_inode_dirty()'), so we save few context switches
				273	* by delaying the wake-up.
Derek Basehore	6ca738d	2014-04-03 14:46:22 -0700	[diff] [blame]	274	*
				275	* We have to be careful not to postpone flush work if it is scheduled for
				276	* earlier. Thus we use queue_delayed_work().
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	277	*/
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	278	void wb_wakeup_delayed(struct bdi_writeback *wb)
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	279	{
				280	unsigned long timeout;
				281
				282	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	283	spin_lock_bh(&wb->work_lock);
				284	if (test_bit(WB_registered, &wb->state))
				285	queue_delayed_work(bdi_wq, &wb->dwork, timeout);
				286	spin_unlock_bh(&wb->work_lock);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	287	}
				288
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	289	/*
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	290	* Initial write bandwidth: 100 MB/s
				291	*/
				292	#define INIT_BW (100 << (20 - PAGE_SHIFT))
				293
Tejun Heo	8395cd9	2015-05-22 17:13:34 -0400	[diff] [blame]	294	static int wb_init(struct bdi_writeback wb, struct backing_dev_info bdi,
				295	gfp_t gfp)
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	296	{
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	297	int i, err;
				298
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	299	memset(wb, 0, sizeof(*wb));
				300
				301	wb->bdi = bdi;
				302	wb->last_old_flush = jiffies;
				303	INIT_LIST_HEAD(&wb->b_dirty);
				304	INIT_LIST_HEAD(&wb->b_io);
				305	INIT_LIST_HEAD(&wb->b_more_io);
Theodore Ts'o	0ae45f6	2015-02-02 00:37:00 -0500	[diff] [blame]	306	INIT_LIST_HEAD(&wb->b_dirty_time);
Christoph Hellwig	f758eea	2011-04-21 18:19:44 -0600	[diff] [blame]	307	spin_lock_init(&wb->list_lock);
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	308
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	309	wb->bw_time_stamp = jiffies;
				310	wb->balanced_dirty_ratelimit = INIT_BW;
				311	wb->dirty_ratelimit = INIT_BW;
				312	wb->write_bandwidth = INIT_BW;
				313	wb->avg_write_bandwidth = INIT_BW;
				314
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	315	spin_lock_init(&wb->work_lock);
				316	INIT_LIST_HEAD(&wb->work_list);
				317	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
				318
Tejun Heo	8395cd9	2015-05-22 17:13:34 -0400	[diff] [blame]	319	err = fprop_local_init_percpu(&wb->completions, gfp);
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	320	if (err)
				321	return err;
				322
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	323	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
Tejun Heo	8395cd9	2015-05-22 17:13:34 -0400	[diff] [blame]	324	err = percpu_counter_init(&wb->stat[i], 0, gfp);
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	325	if (err) {
				326	while (--i)
				327	percpu_counter_destroy(&wb->stat[i]);
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	328	fprop_local_destroy_percpu(&wb->completions);
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	329	return err;
				330	}
				331	}
				332
				333	return 0;
				334	}
				335
Tejun Heo	4610007	2015-05-22 17:13:31 -0400	[diff] [blame]	336	/*
				337	* Remove bdi from the global list and shutdown any threads we have running
				338	*/
				339	static void wb_shutdown(struct bdi_writeback *wb)
				340	{
				341	/* Make sure nobody queues further work */
				342	spin_lock_bh(&wb->work_lock);
				343	if (!test_and_clear_bit(WB_registered, &wb->state)) {
				344	spin_unlock_bh(&wb->work_lock);
				345	return;
				346	}
				347	spin_unlock_bh(&wb->work_lock);
				348
				349	/*
				350	* Drain work list and shutdown the delayed_work. !WB_registered
				351	* tells wb_workfn() that @wb is dying and its work_list needs to
				352	* be drained no matter what.
				353	*/
				354	mod_delayed_work(bdi_wq, &wb->dwork, 0);
				355	flush_delayed_work(&wb->dwork);
				356	WARN_ON(!list_empty(&wb->work_list));
				357	}
				358
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	359	static void wb_exit(struct bdi_writeback *wb)
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	360	{
				361	int i;
				362
				363	WARN_ON(delayed_work_pending(&wb->dwork));
				364
				365	for (i = 0; i < NR_WB_STAT_ITEMS; i++)
				366	percpu_counter_destroy(&wb->stat[i]);
Artem Bityutskiy	6467716	2010-07-25 14:29:22 +0300	[diff] [blame]	367
Tejun Heo	a88a341	2015-05-22 17:13:28 -0400	[diff] [blame]	368	fprop_local_destroy_percpu(&wb->completions);
				369	}
Wu Fengguang	e98be2d	2010-08-29 11:22:30 -0600	[diff] [blame]	370
Tejun Heo	52ebea7	2015-05-22 17:13:37 -0400	[diff] [blame^]	371	#ifdef CONFIG_CGROUP_WRITEBACK
				372
				373	#include <linux/memcontrol.h>
				374
				375	/*
				376	* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
				377	* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
				378	* protected. cgwb_release_wait is used to wait for the completion of cgwb
				379	* releases from bdi destruction path.
				380	*/
				381	static DEFINE_SPINLOCK(cgwb_lock);
				382	static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
				383
				384	/**
				385	* wb_congested_get_create - get or create a wb_congested
				386	* @bdi: associated bdi
				387	* @blkcg_id: ID of the associated blkcg
				388	* @gfp: allocation mask
				389	*
				390	* Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
				391	* The returned wb_congested has its reference count incremented. Returns
				392	* NULL on failure.
				393	*/
				394	struct bdi_writeback_congested *
				395	wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
				396	{
				397	struct bdi_writeback_congested new_congested = NULL, congested;
				398	struct rb_node *node, parent;
				399	unsigned long flags;
				400
				401	if (blkcg_id == 1)
				402	return &bdi->wb_congested;
				403	retry:
				404	spin_lock_irqsave(&cgwb_lock, flags);
				405
				406	node = &bdi->cgwb_congested_tree.rb_node;
				407	parent = NULL;
				408
				409	while (*node != NULL) {
				410	parent = *node;
				411	congested = container_of(parent, struct bdi_writeback_congested,
				412	rb_node);
				413	if (congested->blkcg_id < blkcg_id)
				414	node = &parent->rb_left;
				415	else if (congested->blkcg_id > blkcg_id)
				416	node = &parent->rb_right;
				417	else
				418	goto found;
				419	}
				420
				421	if (new_congested) {
				422	/* !found and storage for new one already allocated, insert */
				423	congested = new_congested;
				424	new_congested = NULL;
				425	rb_link_node(&congested->rb_node, parent, node);
				426	rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
				427	atomic_inc(&bdi->usage_cnt);
				428	goto found;
				429	}
				430
				431	spin_unlock_irqrestore(&cgwb_lock, flags);
				432
				433	/* allocate storage for new one and retry */
				434	new_congested = kzalloc(sizeof(*new_congested), gfp);
				435	if (!new_congested)
				436	return NULL;
				437
				438	atomic_set(&new_congested->refcnt, 0);
				439	new_congested->bdi = bdi;
				440	new_congested->blkcg_id = blkcg_id;
				441	goto retry;
				442
				443	found:
				444	atomic_inc(&congested->refcnt);
				445	spin_unlock_irqrestore(&cgwb_lock, flags);
				446	kfree(new_congested);
				447	return congested;
				448	}
				449
				450	/**
				451	* wb_congested_put - put a wb_congested
				452	* @congested: wb_congested to put
				453	*
				454	* Put @congested and destroy it if the refcnt reaches zero.
				455	*/
				456	void wb_congested_put(struct bdi_writeback_congested *congested)
				457	{
				458	struct backing_dev_info *bdi = congested->bdi;
				459	unsigned long flags;
				460
				461	if (congested->blkcg_id == 1)
				462	return;
				463
				464	local_irq_save(flags);
				465	if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
				466	local_irq_restore(flags);
				467	return;
				468	}
				469
				470	rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
				471	spin_unlock_irqrestore(&cgwb_lock, flags);
				472	kfree(congested);
				473
				474	if (atomic_dec_and_test(&bdi->usage_cnt))
				475	wake_up_all(&cgwb_release_wait);
				476	}
				477
				478	static void cgwb_release_workfn(struct work_struct *work)
				479	{
				480	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
				481	release_work);
				482	struct backing_dev_info *bdi = wb->bdi;
				483
				484	wb_shutdown(wb);
				485
				486	css_put(wb->memcg_css);
				487	css_put(wb->blkcg_css);
				488	wb_congested_put(wb->congested);
				489
				490	percpu_ref_exit(&wb->refcnt);
				491	wb_exit(wb);
				492	kfree_rcu(wb, rcu);
				493
				494	if (atomic_dec_and_test(&bdi->usage_cnt))
				495	wake_up_all(&cgwb_release_wait);
				496	}
				497
				498	static void cgwb_release(struct percpu_ref *refcnt)
				499	{
				500	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
				501	refcnt);
				502	schedule_work(&wb->release_work);
				503	}
				504
				505	static void cgwb_kill(struct bdi_writeback *wb)
				506	{
				507	lockdep_assert_held(&cgwb_lock);
				508
				509	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
				510	list_del(&wb->memcg_node);
				511	list_del(&wb->blkcg_node);
				512	percpu_ref_kill(&wb->refcnt);
				513	}
				514
				515	static int cgwb_create(struct backing_dev_info *bdi,
				516	struct cgroup_subsys_state *memcg_css, gfp_t gfp)
				517	{
				518	struct mem_cgroup *memcg;
				519	struct cgroup_subsys_state *blkcg_css;
				520	struct blkcg *blkcg;
				521	struct list_head memcg_cgwb_list, blkcg_cgwb_list;
				522	struct bdi_writeback *wb;
				523	unsigned long flags;
				524	int ret = 0;
				525
				526	memcg = mem_cgroup_from_css(memcg_css);
				527	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
				528	blkcg = css_to_blkcg(blkcg_css);
				529	memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
				530	blkcg_cgwb_list = &blkcg->cgwb_list;
				531
				532	/* look up again under lock and discard on blkcg mismatch */
				533	spin_lock_irqsave(&cgwb_lock, flags);
				534	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
				535	if (wb && wb->blkcg_css != blkcg_css) {
				536	cgwb_kill(wb);
				537	wb = NULL;
				538	}
				539	spin_unlock_irqrestore(&cgwb_lock, flags);
				540	if (wb)
				541	goto out_put;
				542
				543	/* need to create a new one */
				544	wb = kmalloc(sizeof(*wb), gfp);
				545	if (!wb)
				546	return -ENOMEM;
				547
				548	ret = wb_init(wb, bdi, gfp);
				549	if (ret)
				550	goto err_free;
				551
				552	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
				553	if (ret)
				554	goto err_wb_exit;
				555
				556	wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
				557	if (!wb->congested)
				558	goto err_ref_exit;
				559
				560	wb->memcg_css = memcg_css;
				561	wb->blkcg_css = blkcg_css;
				562	INIT_WORK(&wb->release_work, cgwb_release_workfn);
				563	set_bit(WB_registered, &wb->state);
				564
				565	/*
				566	* The root wb determines the registered state of the whole bdi and
				567	* memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
				568	* whether they're still online. Don't link @wb if any is dead.
				569	* See wb_memcg_offline() and wb_blkcg_offline().
				570	*/
				571	ret = -ENODEV;
				572	spin_lock_irqsave(&cgwb_lock, flags);
				573	if (test_bit(WB_registered, &bdi->wb.state) &&
				574	blkcg_cgwb_list->next && memcg_cgwb_list->next) {
				575	/* we might have raced another instance of this function */
				576	ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
				577	if (!ret) {
				578	atomic_inc(&bdi->usage_cnt);
				579	list_add(&wb->memcg_node, memcg_cgwb_list);
				580	list_add(&wb->blkcg_node, blkcg_cgwb_list);
				581	css_get(memcg_css);
				582	css_get(blkcg_css);
				583	}
				584	}
				585	spin_unlock_irqrestore(&cgwb_lock, flags);
				586	if (ret) {
				587	if (ret == -EEXIST)
				588	ret = 0;
				589	goto err_put_congested;
				590	}
				591	goto out_put;
				592
				593	err_put_congested:
				594	wb_congested_put(wb->congested);
				595	err_ref_exit:
				596	percpu_ref_exit(&wb->refcnt);
				597	err_wb_exit:
				598	wb_exit(wb);
				599	err_free:
				600	kfree(wb);
				601	out_put:
				602	css_put(blkcg_css);
				603	return ret;
				604	}
				605
				606	/**
				607	* wb_get_create - get wb for a given memcg, create if necessary
				608	* @bdi: target bdi
				609	* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
				610	* @gfp: allocation mask to use
				611	*
				612	* Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
				613	* create one. The returned wb has its refcount incremented.
				614	*
				615	* This function uses css_get() on @memcg_css and thus expects its refcnt
				616	* to be positive on invocation. IOW, rcu_read_lock() protection on
				617	* @memcg_css isn't enough. try_get it before calling this function.
				618	*
				619	* A wb is keyed by its associated memcg. As blkcg implicitly enables
				620	* memcg on the default hierarchy, memcg association is guaranteed to be
				621	* more specific (equal or descendant to the associated blkcg) and thus can
				622	* identify both the memcg and blkcg associations.
				623	*
				624	* Because the blkcg associated with a memcg may change as blkcg is enabled
				625	* and disabled closer to root in the hierarchy, each wb keeps track of
				626	* both the memcg and blkcg associated with it and verifies the blkcg on
				627	* each lookup. On mismatch, the existing wb is discarded and a new one is
				628	* created.
				629	*/
				630	struct bdi_writeback wb_get_create(struct backing_dev_info bdi,
				631	struct cgroup_subsys_state *memcg_css,
				632	gfp_t gfp)
				633	{
				634	struct bdi_writeback *wb;
				635
				636	might_sleep_if(gfp & __GFP_WAIT);
				637
				638	if (!memcg_css->parent)
				639	return &bdi->wb;
				640
				641	do {
				642	rcu_read_lock();
				643	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
				644	if (wb) {
				645	struct cgroup_subsys_state *blkcg_css;
				646
				647	/* see whether the blkcg association has changed */
				648	blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
				649	&blkio_cgrp_subsys);
				650	if (unlikely(wb->blkcg_css != blkcg_css \|\|
				651	!wb_tryget(wb)))
				652	wb = NULL;
				653	css_put(blkcg_css);
				654	}
				655	rcu_read_unlock();
				656	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
				657
				658	return wb;
				659	}
				660
				661	void __inode_attach_wb(struct inode inode, struct page page)
				662	{
				663	struct backing_dev_info *bdi = inode_to_bdi(inode);
				664	struct bdi_writeback *wb = NULL;
				665
				666	if (inode_cgwb_enabled(inode)) {
				667	struct cgroup_subsys_state *memcg_css;
				668
				669	if (page) {
				670	memcg_css = mem_cgroup_css_from_page(page);
				671	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
				672	} else {
				673	/* must pin memcg_css, see wb_get_create() */
				674	memcg_css = task_get_css(current, memory_cgrp_id);
				675	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
				676	css_put(memcg_css);
				677	}
				678	}
				679
				680	if (!wb)
				681	wb = &bdi->wb;
				682
				683	/*
				684	* There may be multiple instances of this function racing to
				685	* update the same inode. Use cmpxchg() to tell the winner.
				686	*/
				687	if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
				688	wb_put(wb);
				689	}
				690
				691	static void cgwb_bdi_init(struct backing_dev_info *bdi)
				692	{
				693	bdi->wb.memcg_css = mem_cgroup_root_css;
				694	bdi->wb.blkcg_css = blkcg_root_css;
				695	bdi->wb_congested.blkcg_id = 1;
				696	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
				697	bdi->cgwb_congested_tree = RB_ROOT;
				698	atomic_set(&bdi->usage_cnt, 1);
				699	}
				700
				701	static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
				702	{
				703	struct radix_tree_iter iter;
				704	void **slot;
				705
				706	WARN_ON(test_bit(WB_registered, &bdi->wb.state));
				707
				708	spin_lock_irq(&cgwb_lock);
				709	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
				710	cgwb_kill(*slot);
				711	spin_unlock_irq(&cgwb_lock);
				712
				713	/*
				714	* All cgwb's and their congested states must be shutdown and
				715	* released before returning. Drain the usage counter to wait for
				716	* all cgwb's and cgwb_congested's ever created on @bdi.
				717	*/
				718	atomic_dec(&bdi->usage_cnt);
				719	wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
				720	}
				721
				722	/**
				723	* wb_memcg_offline - kill all wb's associated with a memcg being offlined
				724	* @memcg: memcg being offlined
				725	*
				726	* Also prevents creation of any new wb's associated with @memcg.
				727	*/
				728	void wb_memcg_offline(struct mem_cgroup *memcg)
				729	{
				730	LIST_HEAD(to_destroy);
				731	struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
				732	struct bdi_writeback wb, next;
				733
				734	spin_lock_irq(&cgwb_lock);
				735	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
				736	cgwb_kill(wb);
				737	memcg_cgwb_list->next = NULL; /* prevent new wb's */
				738	spin_unlock_irq(&cgwb_lock);
				739	}
				740
				741	/**
				742	* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
				743	* @blkcg: blkcg being offlined
				744	*
				745	* Also prevents creation of any new wb's associated with @blkcg.
				746	*/
				747	void wb_blkcg_offline(struct blkcg *blkcg)
				748	{
				749	LIST_HEAD(to_destroy);
				750	struct bdi_writeback wb, next;
				751
				752	spin_lock_irq(&cgwb_lock);
				753	list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
				754	cgwb_kill(wb);
				755	blkcg->cgwb_list.next = NULL; /* prevent new wb's */
				756	spin_unlock_irq(&cgwb_lock);
				757	}
				758
				759	#else /* CONFIG_CGROUP_WRITEBACK */
				760
				/* Built without CONFIG_CGROUP_WRITEBACK: there is nothing to initialise. */
				static void cgwb_bdi_init(struct backing_dev_info *bdi)
				{
				}
				/* Built without CONFIG_CGROUP_WRITEBACK: there is nothing to tear down. */
				static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
				{
				}
				763
				764	#endif /* CONFIG_CGROUP_WRITEBACK */
				765
Peter Zijlstra	b2e8fb6	2007-10-16 23:25:47 -0700	[diff] [blame]	766	int bdi_init(struct backing_dev_info *bdi)
				767	{
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	768	int err;
Peter Zijlstra	b2e8fb6	2007-10-16 23:25:47 -0700	[diff] [blame]	769
Peter Zijlstra	cf0ca9f	2008-04-30 00:54:32 -0700	[diff] [blame]	770	bdi->dev = NULL;
				771
Peter Zijlstra	189d3c4	2008-04-30 00:54:35 -0700	[diff] [blame]	772	bdi->min_ratio = 0;
Peter Zijlstra	a42dde0	2008-04-30 00:54:36 -0700	[diff] [blame]	773	bdi->max_ratio = 100;
Jan Kara	eb608e3	2012-05-24 18:59:11 +0200	[diff] [blame]	774	bdi->max_prop_frac = FPROP_FRAC_BASE;
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	775	INIT_LIST_HEAD(&bdi->bdi_list);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	776
Tejun Heo	8395cd9	2015-05-22 17:13:34 -0400	[diff] [blame]	777	err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	778	if (err)
				779	return err;
Peter Zijlstra	04fbfdc	2007-10-16 23:25:50 -0700	[diff] [blame]	780
Tejun Heo	4aa9c69	2015-05-22 17:13:35 -0400	[diff] [blame]	781	bdi->wb_congested.state = 0;
				782	bdi->wb.congested = &bdi->wb_congested;
				783
Tejun Heo	52ebea7	2015-05-22 17:13:37 -0400	[diff] [blame^]	784	cgwb_bdi_init(bdi);
Tejun Heo	93f78d8	2015-05-22 17:13:27 -0400	[diff] [blame]	785	return 0;
Peter Zijlstra	b2e8fb6	2007-10-16 23:25:47 -0700	[diff] [blame]	786	}
				787	EXPORT_SYMBOL(bdi_init);
				788
Tejun Heo	4610007	2015-05-22 17:13:31 -0400	[diff] [blame]	789	int bdi_register(struct backing_dev_info bdi, struct device parent,
				790	const char *fmt, ...)
				791	{
				792	va_list args;
				793	struct device *dev;
				794
				795	if (bdi->dev) /* The driver needs to use separate queues per device */
				796	return 0;
				797
				798	va_start(args, fmt);
				799	dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
				800	va_end(args);
				801	if (IS_ERR(dev))
				802	return PTR_ERR(dev);
				803
				804	bdi->dev = dev;
				805
				806	bdi_debug_register(bdi, dev_name(dev));
				807	set_bit(WB_registered, &bdi->wb.state);
				808
				809	spin_lock_bh(&bdi_lock);
				810	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
				811	spin_unlock_bh(&bdi_lock);
				812
				813	trace_writeback_bdi_register(bdi);
				814	return 0;
				815	}
				816	EXPORT_SYMBOL(bdi_register);
				817
				818	int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
				819	{
				820	return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
				821	}
				822	EXPORT_SYMBOL(bdi_register_dev);
				823
				824	/*
				825	* Remove bdi from bdi_list, and ensure that it is no longer visible
				826	*/
				827	static void bdi_remove_from_list(struct backing_dev_info *bdi)
				828	{
				829	spin_lock_bh(&bdi_lock);
				830	list_del_rcu(&bdi->bdi_list);
				831	spin_unlock_bh(&bdi_lock);
				832
				833	synchronize_rcu_expedited();
				834	}
				835
				836	/*
				837	* Called when the device behind @bdi has been removed or ejected.
				838	*
				839	* We can't really do much here except for reducing the dirty ratio at
				840	* the moment. In the future we should be able to set a flag so that
				841	* the filesystem can handle errors at mark_inode_dirty time instead
				842	* of only at writeback time.
				843	*/
				844	void bdi_unregister(struct backing_dev_info *bdi)
				845	{
				846	if (WARN_ON_ONCE(!bdi->dev))
				847	return;
				848
				849	bdi_set_min_ratio(bdi, 0);
				850	}
				851	EXPORT_SYMBOL(bdi_unregister);
				852
Peter Zijlstra	b2e8fb6	2007-10-16 23:25:47 -0700	[diff] [blame]	853	void bdi_destroy(struct backing_dev_info *bdi)
				854	{
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	855	/* make sure nobody finds us on the bdi_list anymore */
				856	bdi_remove_from_list(bdi);
				857	wb_shutdown(&bdi->wb);
Tejun Heo	52ebea7	2015-05-22 17:13:37 -0400	[diff] [blame^]	858	cgwb_bdi_destroy(bdi);
Rabin Vincent	7a401a9	2011-11-11 13:29:04 +0100	[diff] [blame]	859
Christoph Hellwig	c4db59d	2015-01-20 14:05:00 -0700	[diff] [blame]	860	if (bdi->dev) {
				861	bdi_debug_unregister(bdi);
				862	device_unregister(bdi->dev);
				863	bdi->dev = NULL;
				864	}
				865
Tejun Heo	f0054bb	2015-05-22 17:13:30 -0400	[diff] [blame]	866	wb_exit(&bdi->wb);
Peter Zijlstra	b2e8fb6	2007-10-16 23:25:47 -0700	[diff] [blame]	867	}
				868	EXPORT_SYMBOL(bdi_destroy);
				869
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	870	/*
				871	* For use from filesystems to quickly init and register a bdi associated
				872	* with dirty writeback
				873	*/
Christoph Hellwig	b4caecd	2015-01-14 10:42:32 +0100	[diff] [blame]	874	int bdi_setup_and_register(struct backing_dev_info bdi, char name)
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	875	{
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	876	int err;
				877
				878	bdi->name = name;
Christoph Hellwig	b4caecd	2015-01-14 10:42:32 +0100	[diff] [blame]	879	bdi->capabilities = 0;
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	880	err = bdi_init(bdi);
				881	if (err)
				882	return err;
				883
Kees Cook	02aa2a3	2013-07-03 15:04:56 -0700	[diff] [blame]	884	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
				885	atomic_long_inc_return(&bdi_seq));
Jens Axboe	c3c5320	2010-04-22 11:37:01 +0200	[diff] [blame]	886	if (err) {
				887	bdi_destroy(bdi);
				888	return err;
				889	}
				890
				891	return 0;
				892	}
				893	EXPORT_SYMBOL(bdi_setup_and_register);
				894
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	895	static wait_queue_head_t congestion_wqh[2] = {
				896	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
				897	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
				898	};
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	899	static atomic_t nr_bdi_congested[2];
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	900
Jens Axboe	1faa16d	2009-04-06 14:48:01 +0200	[diff] [blame]	901	void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	902	{
Tejun Heo	4452226	2015-05-22 17:13:26 -0400	[diff] [blame]	903	enum wb_state bit;
Jens Axboe	1faa16d	2009-04-06 14:48:01 +0200	[diff] [blame]	904	wait_queue_head_t *wqh = &congestion_wqh[sync];
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	905
Tejun Heo	4452226	2015-05-22 17:13:26 -0400	[diff] [blame]	906	bit = sync ? WB_sync_congested : WB_async_congested;
Tejun Heo	4aa9c69	2015-05-22 17:13:35 -0400	[diff] [blame]	907	if (test_and_clear_bit(bit, &bdi->wb.congested->state))
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	908	atomic_dec(&nr_bdi_congested[sync]);
Peter Zijlstra	4e857c5	2014-03-17 18:06:10 +0100	[diff] [blame]	909	smp_mb__after_atomic();
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	910	if (waitqueue_active(wqh))
				911	wake_up(wqh);
				912	}
				913	EXPORT_SYMBOL(clear_bdi_congested);
				914
Jens Axboe	1faa16d	2009-04-06 14:48:01 +0200	[diff] [blame]	915	void set_bdi_congested(struct backing_dev_info *bdi, int sync)
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	916	{
Tejun Heo	4452226	2015-05-22 17:13:26 -0400	[diff] [blame]	917	enum wb_state bit;
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	918
Tejun Heo	4452226	2015-05-22 17:13:26 -0400	[diff] [blame]	919	bit = sync ? WB_sync_congested : WB_async_congested;
Tejun Heo	4aa9c69	2015-05-22 17:13:35 -0400	[diff] [blame]	920	if (!test_and_set_bit(bit, &bdi->wb.congested->state))
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	921	atomic_inc(&nr_bdi_congested[sync]);
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	922	}
				923	EXPORT_SYMBOL(set_bdi_congested);
				924
				925	/**
				926	* congestion_wait - wait for a backing_dev to become uncongested
Jens Axboe	8aa7e84	2009-07-09 14:52:32 +0200	[diff] [blame]	927	* @sync: SYNC or ASYNC IO
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	928	* @timeout: timeout in jiffies
				929	*
				930	* Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit
				931	* write congestion. If no backing_devs are congested then just wait for the
				932	* next write to be completed.
				933	*/
Jens Axboe	8aa7e84	2009-07-09 14:52:32 +0200	[diff] [blame]	934	long congestion_wait(int sync, long timeout)
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	935	{
				936	long ret;
Mel Gorman	52bb919	2010-10-26 14:21:41 -0700	[diff] [blame]	937	unsigned long start = jiffies;
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	938	DEFINE_WAIT(wait);
Jens Axboe	8aa7e84	2009-07-09 14:52:32 +0200	[diff] [blame]	939	wait_queue_head_t *wqh = &congestion_wqh[sync];
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	940
				941	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
				942	ret = io_schedule_timeout(timeout);
				943	finish_wait(wqh, &wait);
Mel Gorman	52bb919	2010-10-26 14:21:41 -0700	[diff] [blame]	944
				945	trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
				946	jiffies_to_usecs(jiffies - start));
				947
Andrew Morton	3fcfab1	2006-10-19 23:28:16 -0700	[diff] [blame]	948	return ret;
				949	}
				950	EXPORT_SYMBOL(congestion_wait);
Peter Zijlstra	04fbfdc	2007-10-16 23:25:50 -0700	[diff] [blame]	951
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	952	/**
				953	* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
				954	* @zone: A zone to check if it is heavily congested
				955	* @sync: SYNC or ASYNC IO
				956	* @timeout: timeout in jiffies
				957	*
				958	* In the event of a congested backing_dev (any backing_dev) and the given
				959	* @zone has experienced recent congestion, this waits for up to @timeout
				960	* jiffies for either a BDI to exit congestion of the given @sync queue
				961	* or a write to complete.
				962	*
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	963	* In the absence of zone congestion, cond_resched() is called to yield
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	964	* the processor if necessary but otherwise does not sleep.
				965	*
				966	* The return value is 0 if the sleep is for the full timeout. Otherwise,
				967	* it is the number of jiffies that were still remaining when the function
				968	* returned. return_value == timeout implies the function did not sleep.
				969	*/
				970	long wait_iff_congested(struct zone *zone, int sync, long timeout)
				971	{
				972	long ret;
				973	unsigned long start = jiffies;
				974	DEFINE_WAIT(wait);
				975	wait_queue_head_t *wqh = &congestion_wqh[sync];
				976
				977	/*
				978	* If there is no congestion, or heavy congestion is not being
				979	* encountered in the current zone, yield if necessary instead
				980	* of sleeping on the congestion queue
				981	*/
				982	if (atomic_read(&nr_bdi_congested[sync]) == 0 \|\|
Johannes Weiner	5705465	2014-10-09 15:28:17 -0700	[diff] [blame]	983	!test_bit(ZONE_CONGESTED, &zone->flags)) {
Mel Gorman	0e093d99	2010-10-26 14:21:45 -0700	[diff] [blame]	984	cond_resched();
				985
				986	/* In case we scheduled, work out time remaining */
				987	ret = timeout - (jiffies - start);
				988	if (ret < 0)
				989	ret = 0;
				990
				991	goto out;
				992	}
				993
				994	/* Sleep until uncongested or a write happens */
				995	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
				996	ret = io_schedule_timeout(timeout);
				997	finish_wait(wqh, &wait);
				998
				999	out:
				1000	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
				1001	jiffies_to_usecs(jiffies - start));
				1002
				1003	return ret;
				1004	}
				1005	EXPORT_SYMBOL(wait_iff_congested);
Wanpeng Li	3965c9a	2012-07-31 16:41:52 -0700	[diff] [blame]	1006
				1007	int pdflush_proc_obsolete(struct ctl_table *table, int write,
				1008	void __user buffer, size_t lenp, loff_t *ppos)
				1009	{
				1010	char kbuf[] = "0\n";
				1011
Chen Gang	4c3bffc	2013-09-11 14:22:44 -0700	[diff] [blame]	1012	if (ppos \|\| lenp < sizeof(kbuf)) {
Wanpeng Li	3965c9a	2012-07-31 16:41:52 -0700	[diff] [blame]	1013	*lenp = 0;
				1014	return 0;
				1015	}
				1016
				1017	if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
				1018	return -EFAULT;
				1019	printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
				1020	table->procname);
				1021
				1022	*lenp = 2;
				1023	ppos += lenp;
				1024	return 2;
				1025	}