Blame - block/blk-throttle.c - SHIFTPHONES/kernel/common

blob: af53f37c1b13188bcc599b3c02b786a99d25e008 [file] [log] [blame]

Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	1	/*
				2	* Interface for controlling IO bandwidth on a request queue
				3	*
				4	* Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
				5	*/
				6
				7	#include <linux/module.h>
				8	#include <linux/slab.h>
				9	#include <linux/blkdev.h>
				10	#include <linux/bio.h>
				11	#include <linux/blktrace_api.h>
				12	#include "blk-cgroup.h"
				13
				14	/* Max dispatch from a group in 1 round */
				15	static int throtl_grp_quantum = 8;
				16
				17	/* Total max dispatch from all groups in one round */
				18	static int throtl_quantum = 32;
				19
				20	/* Throttling is performed over 100ms slice and after that slice is renewed */
				21	static unsigned long throtl_slice = HZ/10; /* 100 ms */
				22
				23	struct throtl_rb_root {
				24	struct rb_root rb;
				25	struct rb_node *left;
				26	unsigned int count;
				27	unsigned long min_disptime;
				28	};
				29
				30	#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
				31	.count = 0, .min_disptime = 0}
				32
				33	#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
				34
				35	struct throtl_grp {
				36	/* List of throtl groups on the request queue*/
				37	struct hlist_node tg_node;
				38
				39	/* active throtl group service_tree member */
				40	struct rb_node rb_node;
				41
				42	/*
				43	* Dispatch time in jiffies. This is the estimated time when group
				44	* will unthrottle and is ready to dispatch more bio. It is used as
				45	* key to sort active groups in service tree.
				46	*/
				47	unsigned long disptime;
				48
				49	struct blkio_group blkg;
				50	atomic_t ref;
				51	unsigned int flags;
				52
				53	/* Two lists for READ and WRITE */
				54	struct bio_list bio_lists[2];
				55
				56	/* Number of queued bios on READ and WRITE lists */
				57	unsigned int nr_queued[2];
				58
				59	/* bytes per second rate limits */
				60	uint64_t bps[2];
				61
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	62	/* IOPS limits */
				63	unsigned int iops[2];
				64
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	65	/* Number of bytes disptached in current slice */
				66	uint64_t bytes_disp[2];
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	67	/* Number of bio's dispatched in current slice */
				68	unsigned int io_disp[2];
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	69
				70	/* When did we start a new slice */
				71	unsigned long slice_start[2];
				72	unsigned long slice_end[2];
				73	};
				74
				75	struct throtl_data
				76	{
				77	/* List of throtl groups */
				78	struct hlist_head tg_list;
				79
				80	/* service tree for active throtl groups */
				81	struct throtl_rb_root tg_service_tree;
				82
				83	struct throtl_grp root_tg;
				84	struct request_queue *queue;
				85
				86	/* Total Number of queued bios on READ and WRITE lists */
				87	unsigned int nr_queued[2];
				88
				89	/*
				90	* number of total undestroyed groups (excluding root group)
				91	*/
				92	unsigned int nr_undestroyed_grps;
				93
				94	/* Work for dispatching throttled bios */
				95	struct delayed_work throtl_work;
				96	};
				97
				98	enum tg_state_flags {
				99	THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */
				100	};
				101
				102	#define THROTL_TG_FNS(name) \
				103	static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \
				104	{ \
				105	(tg)->flags \|= (1 << THROTL_TG_FLAG_##name); \
				106	} \
				107	static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \
				108	{ \
				109	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \
				110	} \
				111	static inline int throtl_tg_##name(const struct throtl_grp *tg) \
				112	{ \
				113	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \
				114	}
				115
				116	THROTL_TG_FNS(on_rr);
				117
				118	#define throtl_log_tg(td, tg, fmt, args...) \
				119	blk_add_trace_msg((td)->queue, "throtl %s " fmt, \
				120	blkg_path(&(tg)->blkg), ##args); \
				121
				122	#define throtl_log(td, fmt, args...) \
				123	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
				124
				125	static inline struct throtl_grp tg_of_blkg(struct blkio_group blkg)
				126	{
				127	if (blkg)
				128	return container_of(blkg, struct throtl_grp, blkg);
				129
				130	return NULL;
				131	}
				132
				133	static inline int total_nr_queued(struct throtl_data *td)
				134	{
				135	return (td->nr_queued[0] + td->nr_queued[1]);
				136	}
				137
				138	static inline struct throtl_grp throtl_ref_get_tg(struct throtl_grp tg)
				139	{
				140	atomic_inc(&tg->ref);
				141	return tg;
				142	}
				143
				144	static void throtl_put_tg(struct throtl_grp *tg)
				145	{
				146	BUG_ON(atomic_read(&tg->ref) <= 0);
				147	if (!atomic_dec_and_test(&tg->ref))
				148	return;
				149	kfree(tg);
				150	}
				151
				152	static struct throtl_grp * throtl_find_alloc_tg(struct throtl_data *td,
				153	struct cgroup *cgroup)
				154	{
				155	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
				156	struct throtl_grp *tg = NULL;
				157	void *key = td;
				158	struct backing_dev_info *bdi = &td->queue->backing_dev_info;
				159	unsigned int major, minor;
				160
				161	/*
				162	* TODO: Speed up blkiocg_lookup_group() by maintaining a radix
				163	* tree of blkg (instead of traversing through hash list all
				164	* the time.
				165	*/
				166	tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));
				167
				168	/* Fill in device details for root group */
				169	if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
				170	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
				171	tg->blkg.dev = MKDEV(major, minor);
				172	goto done;
				173	}
				174
				175	if (tg)
				176	goto done;
				177
				178	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
				179	if (!tg)
				180	goto done;
				181
				182	INIT_HLIST_NODE(&tg->tg_node);
				183	RB_CLEAR_NODE(&tg->rb_node);
				184	bio_list_init(&tg->bio_lists[0]);
				185	bio_list_init(&tg->bio_lists[1]);
				186
				187	/*
				188	* Take the initial reference that will be released on destroy
				189	* This can be thought of a joint reference by cgroup and
				190	* request queue which will be dropped by either request queue
				191	* exit or cgroup deletion path depending on who is exiting first.
				192	*/
				193	atomic_set(&tg->ref, 1);
				194
				195	/* Add group onto cgroup list */
				196	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
				197	blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
				198	MKDEV(major, minor), BLKIO_POLICY_THROTL);
				199
				200	tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
				201	tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	202	tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev);
				203	tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev);
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	204
				205	hlist_add_head(&tg->tg_node, &td->tg_list);
				206	td->nr_undestroyed_grps++;
				207	done:
				208	return tg;
				209	}
				210
				211	static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
				212	{
				213	struct cgroup *cgroup;
				214	struct throtl_grp *tg = NULL;
				215
				216	rcu_read_lock();
				217	cgroup = task_cgroup(current, blkio_subsys_id);
				218	tg = throtl_find_alloc_tg(td, cgroup);
				219	if (!tg)
				220	tg = &td->root_tg;
				221	rcu_read_unlock();
				222	return tg;
				223	}
				224
				225	static struct throtl_grp throtl_rb_first(struct throtl_rb_root root)
				226	{
				227	/* Service tree is empty */
				228	if (!root->count)
				229	return NULL;
				230
				231	if (!root->left)
				232	root->left = rb_first(&root->rb);
				233
				234	if (root->left)
				235	return rb_entry_tg(root->left);
				236
				237	return NULL;
				238	}
				239
				240	static void rb_erase_init(struct rb_node n, struct rb_root root)
				241	{
				242	rb_erase(n, root);
				243	RB_CLEAR_NODE(n);
				244	}
				245
				246	static void throtl_rb_erase(struct rb_node n, struct throtl_rb_root root)
				247	{
				248	if (root->left == n)
				249	root->left = NULL;
				250	rb_erase_init(n, &root->rb);
				251	--root->count;
				252	}
				253
				254	static void update_min_dispatch_time(struct throtl_rb_root *st)
				255	{
				256	struct throtl_grp *tg;
				257
				258	tg = throtl_rb_first(st);
				259	if (!tg)
				260	return;
				261
				262	st->min_disptime = tg->disptime;
				263	}
				264
				265	static void
				266	tg_service_tree_add(struct throtl_rb_root st, struct throtl_grp tg)
				267	{
				268	struct rb_node **node = &st->rb.rb_node;
				269	struct rb_node *parent = NULL;
				270	struct throtl_grp *__tg;
				271	unsigned long key = tg->disptime;
				272	int left = 1;
				273
				274	while (*node != NULL) {
				275	parent = *node;
				276	__tg = rb_entry_tg(parent);
				277
				278	if (time_before(key, __tg->disptime))
				279	node = &parent->rb_left;
				280	else {
				281	node = &parent->rb_right;
				282	left = 0;
				283	}
				284	}
				285
				286	if (left)
				287	st->left = &tg->rb_node;
				288
				289	rb_link_node(&tg->rb_node, parent, node);
				290	rb_insert_color(&tg->rb_node, &st->rb);
				291	}
				292
				293	static void __throtl_enqueue_tg(struct throtl_data td, struct throtl_grp tg)
				294	{
				295	struct throtl_rb_root *st = &td->tg_service_tree;
				296
				297	tg_service_tree_add(st, tg);
				298	throtl_mark_tg_on_rr(tg);
				299	st->count++;
				300	}
				301
				302	static void throtl_enqueue_tg(struct throtl_data td, struct throtl_grp tg)
				303	{
				304	if (!throtl_tg_on_rr(tg))
				305	__throtl_enqueue_tg(td, tg);
				306	}
				307
				308	static void __throtl_dequeue_tg(struct throtl_data td, struct throtl_grp tg)
				309	{
				310	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
				311	throtl_clear_tg_on_rr(tg);
				312	}
				313
				314	static void throtl_dequeue_tg(struct throtl_data td, struct throtl_grp tg)
				315	{
				316	if (throtl_tg_on_rr(tg))
				317	__throtl_dequeue_tg(td, tg);
				318	}
				319
				320	static void throtl_schedule_next_dispatch(struct throtl_data *td)
				321	{
				322	struct throtl_rb_root *st = &td->tg_service_tree;
				323
				324	/*
				325	* If there are more bios pending, schedule more work.
				326	*/
				327	if (!total_nr_queued(td))
				328	return;
				329
				330	BUG_ON(!st->count);
				331
				332	update_min_dispatch_time(st);
				333
				334	if (time_before_eq(st->min_disptime, jiffies))
				335	throtl_schedule_delayed_work(td->queue, 0);
				336	else
				337	throtl_schedule_delayed_work(td->queue,
				338	(st->min_disptime - jiffies));
				339	}
				340
				341	static inline void
				342	throtl_start_new_slice(struct throtl_data td, struct throtl_grp tg, bool rw)
				343	{
				344	tg->bytes_disp[rw] = 0;
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	345	tg->io_disp[rw] = 0;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	346	tg->slice_start[rw] = jiffies;
				347	tg->slice_end[rw] = jiffies + throtl_slice;
				348	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
				349	rw == READ ? 'R' : 'W', tg->slice_start[rw],
				350	tg->slice_end[rw], jiffies);
				351	}
				352
				353	static inline void throtl_extend_slice(struct throtl_data *td,
				354	struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
				355	{
				356	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
				357	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
				358	rw == READ ? 'R' : 'W', tg->slice_start[rw],
				359	tg->slice_end[rw], jiffies);
				360	}
				361
				362	/* Determine if previously allocated or extended slice is complete or not */
				363	static bool
				364	throtl_slice_used(struct throtl_data td, struct throtl_grp tg, bool rw)
				365	{
				366	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
				367	return 0;
				368
				369	return 1;
				370	}
				371
				372	/* Trim the used slices and adjust slice start accordingly */
				373	static inline void
				374	throtl_trim_slice(struct throtl_data td, struct throtl_grp tg, bool rw)
				375	{
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	376	unsigned long nr_slices, bytes_trim, time_elapsed, io_trim;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	377
				378	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
				379
				380	/*
				381	* If bps are unlimited (-1), then time slice don't get
				382	* renewed. Don't try to trim the slice if slice is used. A new
				383	* slice will start when appropriate.
				384	*/
				385	if (throtl_slice_used(td, tg, rw))
				386	return;
				387
				388	time_elapsed = jiffies - tg->slice_start[rw];
				389
				390	nr_slices = time_elapsed / throtl_slice;
				391
				392	if (!nr_slices)
				393	return;
				394
				395	bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	396	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	397
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	398	if (!bytes_trim && !io_trim)
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	399	return;
				400
				401	if (tg->bytes_disp[rw] >= bytes_trim)
				402	tg->bytes_disp[rw] -= bytes_trim;
				403	else
				404	tg->bytes_disp[rw] = 0;
				405
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	406	if (tg->io_disp[rw] >= io_trim)
				407	tg->io_disp[rw] -= io_trim;
				408	else
				409	tg->io_disp[rw] = 0;
				410
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	411	tg->slice_start[rw] += nr_slices * throtl_slice;
				412
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	413	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu io=%lu"
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	414	" start=%lu end=%lu jiffies=%lu",
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	415	rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	416	tg->slice_start[rw], tg->slice_end[rw], jiffies);
				417	}
				418
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	419	static bool tg_with_in_iops_limit(struct throtl_data td, struct throtl_grp tg,
				420	struct bio bio, unsigned long wait)
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	421	{
				422	bool rw = bio_data_dir(bio);
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	423	unsigned int io_allowed;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	424	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
				425
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	426	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	427
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	428	/* Slice has just started. Consider one slice interval */
				429	if (!jiffy_elapsed)
				430	jiffy_elapsed_rnd = throtl_slice;
				431
				432	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
				433
				434	io_allowed = (tg->iops[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
				435	/ MSEC_PER_SEC;
				436
				437	if (tg->io_disp[rw] + 1 <= io_allowed) {
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	438	if (wait)
				439	*wait = 0;
				440	return 1;
				441	}
				442
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	443	/* Calc approx time to dispatch */
				444	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
				445
				446	if (jiffy_wait > jiffy_elapsed)
				447	jiffy_wait = jiffy_wait - jiffy_elapsed;
				448	else
				449	jiffy_wait = 1;
				450
				451	if (wait)
				452	*wait = jiffy_wait;
				453	return 0;
				454	}
				455
				456	static bool tg_with_in_bps_limit(struct throtl_data td, struct throtl_grp tg,
				457	struct bio bio, unsigned long wait)
				458	{
				459	bool rw = bio_data_dir(bio);
				460	u64 bytes_allowed, extra_bytes;
				461	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	462
				463	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
				464
				465	/* Slice has just started. Consider one slice interval */
				466	if (!jiffy_elapsed)
				467	jiffy_elapsed_rnd = throtl_slice;
				468
				469	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
				470
				471	bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
				472	/ MSEC_PER_SEC;
				473
				474	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
				475	if (wait)
				476	*wait = 0;
				477	return 1;
				478	}
				479
				480	/* Calc approx time to dispatch */
				481	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
				482	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
				483
				484	if (!jiffy_wait)
				485	jiffy_wait = 1;
				486
				487	/*
				488	* This wait time is without taking into consideration the rounding
				489	* up we did. Add that time also.
				490	*/
				491	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	492	if (wait)
				493	*wait = jiffy_wait;
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	494	return 0;
				495	}
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	496
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	497	/*
				498	* Returns whether one can dispatch a bio or not. Also returns approx number
				499	* of jiffies to wait before this bio is with-in IO rate and can be dispatched
				500	*/
				501	static bool tg_may_dispatch(struct throtl_data td, struct throtl_grp tg,
				502	struct bio bio, unsigned long wait)
				503	{
				504	bool rw = bio_data_dir(bio);
				505	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
				506
				507	/*
				508	* Currently whole state machine of group depends on first bio
				509	* queued in the group bio list. So one should not be calling
				510	* this function with a different bio if there are other bios
				511	* queued.
				512	*/
				513	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
				514
				515	/* If tg->bps = -1, then BW is unlimited */
				516	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
				517	if (wait)
				518	*wait = 0;
				519	return 1;
				520	}
				521
				522	/*
				523	* If previous slice expired, start a new one otherwise renew/extend
				524	* existing slice to make sure it is at least throtl_slice interval
				525	* long since now.
				526	*/
				527	if (throtl_slice_used(td, tg, rw))
				528	throtl_start_new_slice(td, tg, rw);
				529	else {
				530	if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
				531	throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
				532	}
				533
				534	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
				535	&& tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
				536	if (wait)
				537	*wait = 0;
				538	return 1;
				539	}
				540
				541	max_wait = max(bps_wait, iops_wait);
				542
				543	if (wait)
				544	*wait = max_wait;
				545
				546	if (time_before(tg->slice_end[rw], jiffies + max_wait))
				547	throtl_extend_slice(td, tg, rw, jiffies + max_wait);
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	548
				549	return 0;
				550	}
				551
				552	static void throtl_charge_bio(struct throtl_grp tg, struct bio bio)
				553	{
				554	bool rw = bio_data_dir(bio);
				555	bool sync = bio->bi_rw & REQ_SYNC;
				556
				557	/* Charge the bio to the group */
				558	tg->bytes_disp[rw] += bio->bi_size;
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	559	tg->io_disp[rw]++;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	560
				561	/*
				562	* TODO: This will take blkg->stats_lock. Figure out a way
				563	* to avoid this cost.
				564	*/
				565	blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	566	}
				567
				568	static void throtl_add_bio_tg(struct throtl_data td, struct throtl_grp tg,
				569	struct bio *bio)
				570	{
				571	bool rw = bio_data_dir(bio);
				572
				573	bio_list_add(&tg->bio_lists[rw], bio);
				574	/* Take a bio reference on tg */
				575	throtl_ref_get_tg(tg);
				576	tg->nr_queued[rw]++;
				577	td->nr_queued[rw]++;
				578	throtl_enqueue_tg(td, tg);
				579	}
				580
				581	static void tg_update_disptime(struct throtl_data td, struct throtl_grp tg)
				582	{
				583	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
				584	struct bio *bio;
				585
				586	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
				587	tg_may_dispatch(td, tg, bio, &read_wait);
				588
				589	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
				590	tg_may_dispatch(td, tg, bio, &write_wait);
				591
				592	min_wait = min(read_wait, write_wait);
				593	disptime = jiffies + min_wait;
				594
				595	/*
				596	* If group is already on active tree, then update dispatch time
				597	* only if it is lesser than existing dispatch time. Otherwise
				598	* always update the dispatch time
				599	*/
				600
				601	if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
				602	return;
				603
				604	/* Update dispatch time */
				605	throtl_dequeue_tg(td, tg);
				606	tg->disptime = disptime;
				607	throtl_enqueue_tg(td, tg);
				608	}
				609
				610	static void tg_dispatch_one_bio(struct throtl_data td, struct throtl_grp tg,
				611	bool rw, struct bio_list *bl)
				612	{
				613	struct bio *bio;
				614
				615	bio = bio_list_pop(&tg->bio_lists[rw]);
				616	tg->nr_queued[rw]--;
				617	/* Drop bio reference on tg */
				618	throtl_put_tg(tg);
				619
				620	BUG_ON(td->nr_queued[rw] <= 0);
				621	td->nr_queued[rw]--;
				622
				623	throtl_charge_bio(tg, bio);
				624	bio_list_add(bl, bio);
				625	bio->bi_rw \|= REQ_THROTTLED;
				626
				627	throtl_trim_slice(td, tg, rw);
				628	}
				629
				630	static int throtl_dispatch_tg(struct throtl_data td, struct throtl_grp tg,
				631	struct bio_list *bl)
				632	{
				633	unsigned int nr_reads = 0, nr_writes = 0;
				634	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
				635	unsigned int max_nr_writes = throtl_grp_quantum - nr_reads;
				636	struct bio *bio;
				637
				638	/* Try to dispatch 75% READS and 25% WRITES */
				639
				640	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
				641	&& tg_may_dispatch(td, tg, bio, NULL)) {
				642
				643	tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
				644	nr_reads++;
				645
				646	if (nr_reads >= max_nr_reads)
				647	break;
				648	}
				649
				650	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
				651	&& tg_may_dispatch(td, tg, bio, NULL)) {
				652
				653	tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
				654	nr_writes++;
				655
				656	if (nr_writes >= max_nr_writes)
				657	break;
				658	}
				659
				660	return nr_reads + nr_writes;
				661	}
				662
				663	static int throtl_select_dispatch(struct throtl_data td, struct bio_list bl)
				664	{
				665	unsigned int nr_disp = 0;
				666	struct throtl_grp *tg;
				667	struct throtl_rb_root *st = &td->tg_service_tree;
				668
				669	while (1) {
				670	tg = throtl_rb_first(st);
				671
				672	if (!tg)
				673	break;
				674
				675	if (time_before(jiffies, tg->disptime))
				676	break;
				677
				678	throtl_dequeue_tg(td, tg);
				679
				680	nr_disp += throtl_dispatch_tg(td, tg, bl);
				681
				682	if (tg->nr_queued[0] \|\| tg->nr_queued[1]) {
				683	tg_update_disptime(td, tg);
				684	throtl_enqueue_tg(td, tg);
				685	}
				686
				687	if (nr_disp >= throtl_quantum)
				688	break;
				689	}
				690
				691	return nr_disp;
				692	}
				693
				694	/* Dispatch throttled bios. Should be called without queue lock held. */
				695	static int throtl_dispatch(struct request_queue *q)
				696	{
				697	struct throtl_data *td = q->td;
				698	unsigned int nr_disp = 0;
				699	struct bio_list bio_list_on_stack;
				700	struct bio *bio;
				701
				702	spin_lock_irq(q->queue_lock);
				703
				704	if (!total_nr_queued(td))
				705	goto out;
				706
				707	bio_list_init(&bio_list_on_stack);
				708
				709	throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u",
				710	total_nr_queued(td), td->nr_queued[READ],
				711	td->nr_queued[WRITE]);
				712
				713	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
				714
				715	if (nr_disp)
				716	throtl_log(td, "bios disp=%u", nr_disp);
				717
				718	throtl_schedule_next_dispatch(td);
				719	out:
				720	spin_unlock_irq(q->queue_lock);
				721
				722	/*
				723	* If we dispatched some requests, unplug the queue to make sure
				724	* immediate dispatch
				725	*/
				726	if (nr_disp) {
				727	while((bio = bio_list_pop(&bio_list_on_stack)))
				728	generic_make_request(bio);
				729	blk_unplug(q);
				730	}
				731	return nr_disp;
				732	}
				733
				734	void blk_throtl_work(struct work_struct *work)
				735	{
				736	struct throtl_data *td = container_of(work, struct throtl_data,
				737	throtl_work.work);
				738	struct request_queue *q = td->queue;
				739
				740	throtl_dispatch(q);
				741	}
				742
				743	/* Call with queue lock held */
				744	void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
				745	{
				746
				747	struct throtl_data *td = q->td;
				748	struct delayed_work *dwork = &td->throtl_work;
				749
				750	if (total_nr_queued(td) > 0) {
				751	/*
				752	* We might have a work scheduled to be executed in future.
				753	* Cancel that and schedule a new one.
				754	*/
				755	__cancel_delayed_work(dwork);
				756	kblockd_schedule_delayed_work(q, dwork, delay);
				757	throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				758	delay, jiffies);
				759	}
				760	}
				761	EXPORT_SYMBOL(throtl_schedule_delayed_work);
				762
				763	static void
				764	throtl_destroy_tg(struct throtl_data td, struct throtl_grp tg)
				765	{
				766	/* Something wrong if we are trying to remove same group twice */
				767	BUG_ON(hlist_unhashed(&tg->tg_node));
				768
				769	hlist_del_init(&tg->tg_node);
				770
				771	/*
				772	* Put the reference taken at the time of creation so that when all
				773	* queues are gone, group can be destroyed.
				774	*/
				775	throtl_put_tg(tg);
				776	td->nr_undestroyed_grps--;
				777	}
				778
				779	static void throtl_release_tgs(struct throtl_data *td)
				780	{
				781	struct hlist_node pos, n;
				782	struct throtl_grp *tg;
				783
				784	hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
				785	/*
				786	* If cgroup removal path got to blk_group first and removed
				787	* it from cgroup list, then it will take care of destroying
				788	* cfqg also.
				789	*/
				790	if (!blkiocg_del_blkio_group(&tg->blkg))
				791	throtl_destroy_tg(td, tg);
				792	}
				793	}
				794
				795	static void throtl_td_free(struct throtl_data *td)
				796	{
				797	kfree(td);
				798	}
				799
				800	/*
				801	* Blk cgroup controller notification saying that blkio_group object is being
				802	* delinked as associated cgroup object is going away. That also means that
				803	* no new IO will come in this group. So get rid of this group as soon as
				804	* any pending IO in the group is finished.
				805	*
				806	* This function is called under rcu_read_lock(). key is the rcu protected
				807	* pointer. That means "key" is a valid throtl_data pointer as long as we are
				808	* rcu read lock.
				809	*
				810	* "key" was fetched from blkio_group under blkio_cgroup->lock. That means
				811	* it should not be NULL as even if queue was going away, cgroup deltion
				812	* path got to it first.
				813	*/
				814	void throtl_unlink_blkio_group(void key, struct blkio_group blkg)
				815	{
				816	unsigned long flags;
				817	struct throtl_data *td = key;
				818
				819	spin_lock_irqsave(td->queue->queue_lock, flags);
				820	throtl_destroy_tg(td, tg_of_blkg(blkg));
				821	spin_unlock_irqrestore(td->queue->queue_lock, flags);
				822	}
				823
				824	static void throtl_update_blkio_group_read_bps (struct blkio_group *blkg,
				825	u64 read_bps)
				826	{
				827	tg_of_blkg(blkg)->bps[READ] = read_bps;
				828	}
				829
				830	static void throtl_update_blkio_group_write_bps (struct blkio_group *blkg,
				831	u64 write_bps)
				832	{
				833	tg_of_blkg(blkg)->bps[WRITE] = write_bps;
				834	}
				835
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	836	static void throtl_update_blkio_group_read_iops (struct blkio_group *blkg,
				837	unsigned int read_iops)
				838	{
				839	tg_of_blkg(blkg)->iops[READ] = read_iops;
				840	}
				841
				842	static void throtl_update_blkio_group_write_iops (struct blkio_group *blkg,
				843	unsigned int write_iops)
				844	{
				845	tg_of_blkg(blkg)->iops[WRITE] = write_iops;
				846	}
				847
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	848	void throtl_shutdown_timer_wq(struct request_queue *q)
				849	{
				850	struct throtl_data *td = q->td;
				851
				852	cancel_delayed_work_sync(&td->throtl_work);
				853	}
				854
				855	static struct blkio_policy_type blkio_policy_throtl = {
				856	.ops = {
				857	.blkio_unlink_group_fn = throtl_unlink_blkio_group,
				858	.blkio_update_group_read_bps_fn =
				859	throtl_update_blkio_group_read_bps,
				860	.blkio_update_group_write_bps_fn =
				861	throtl_update_blkio_group_write_bps,
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	862	.blkio_update_group_read_iops_fn =
				863	throtl_update_blkio_group_read_iops,
				864	.blkio_update_group_write_iops_fn =
				865	throtl_update_blkio_group_write_iops,
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	866	},
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	867	.plid = BLKIO_POLICY_THROTL,
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	868	};
				869
				870	int blk_throtl_bio(struct request_queue q, struct bio *biop)
				871	{
				872	struct throtl_data *td = q->td;
				873	struct throtl_grp *tg;
				874	struct bio bio = biop;
				875	bool rw = bio_data_dir(bio), update_disptime = true;
				876
				877	if (bio->bi_rw & REQ_THROTTLED) {
				878	bio->bi_rw &= ~REQ_THROTTLED;
				879	return 0;
				880	}
				881
				882	spin_lock_irq(q->queue_lock);
				883	tg = throtl_get_tg(td);
				884
				885	if (tg->nr_queued[rw]) {
				886	/*
				887	* There is already another bio queued in same dir. No
				888	* need to update dispatch time.
				889	*/
				890	update_disptime = false;
				891	goto queue_bio;
				892	}
				893
				894	/* Bio is with-in rate limit of group */
				895	if (tg_may_dispatch(td, tg, bio, NULL)) {
				896	throtl_charge_bio(tg, bio);
				897	goto out;
				898	}
				899
				900	queue_bio:
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	901	throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu"
				902	" iodisp=%u iops=%u queued=%d/%d",
				903	rw == READ ? 'R' : 'W',
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	904	tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	905	tg->io_disp[rw], tg->iops[rw],
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	906	tg->nr_queued[READ], tg->nr_queued[WRITE]);
				907
				908	throtl_add_bio_tg(q->td, tg, bio);
				909	*biop = NULL;
				910
				911	if (update_disptime) {
				912	tg_update_disptime(td, tg);
				913	throtl_schedule_next_dispatch(td);
				914	}
				915
				916	out:
				917	spin_unlock_irq(q->queue_lock);
				918	return 0;
				919	}
				920
				921	int blk_throtl_init(struct request_queue *q)
				922	{
				923	struct throtl_data *td;
				924	struct throtl_grp *tg;
				925
				926	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
				927	if (!td)
				928	return -ENOMEM;
				929
				930	INIT_HLIST_HEAD(&td->tg_list);
				931	td->tg_service_tree = THROTL_RB_ROOT;
				932
				933	/* Init root group */
				934	tg = &td->root_tg;
				935	INIT_HLIST_NODE(&tg->tg_node);
				936	RB_CLEAR_NODE(&tg->rb_node);
				937	bio_list_init(&tg->bio_lists[0]);
				938	bio_list_init(&tg->bio_lists[1]);
				939
				940	/* Practically unlimited BW */
				941	tg->bps[0] = tg->bps[1] = -1;
Vivek Goyal	8e89d13	2010-09-15 17:06:37 -0400	[diff] [blame^]	942	tg->iops[0] = tg->iops[1] = -1;
Vivek Goyal	e43473b	2010-09-15 17:06:35 -0400	[diff] [blame]	943	atomic_set(&tg->ref, 1);
				944
				945	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
				946
				947	rcu_read_lock();
				948	blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
				949	0, BLKIO_POLICY_THROTL);
				950	rcu_read_unlock();
				951
				952	/* Attach throtl data to request queue */
				953	td->queue = q;
				954	q->td = td;
				955	return 0;
				956	}
				957
				958	void blk_throtl_exit(struct request_queue *q)
				959	{
				960	struct throtl_data *td = q->td;
				961	bool wait = false;
				962
				963	BUG_ON(!td);
				964
				965	throtl_shutdown_timer_wq(q);
				966
				967	spin_lock_irq(q->queue_lock);
				968	throtl_release_tgs(td);
				969	blkiocg_del_blkio_group(&td->root_tg.blkg);
				970
				971	/* If there are other groups */
				972	if (td->nr_undestroyed_grps >= 1)
				973	wait = true;
				974
				975	spin_unlock_irq(q->queue_lock);
				976
				977	/*
				978	* Wait for tg->blkg->key accessors to exit their grace periods.
				979	* Do this wait only if there are other undestroyed groups out
				980	* there (other than root group). This can happen if cgroup deletion
				981	* path claimed the responsibility of cleaning up a group before
				982	* queue cleanup code get to the group.
				983	*
				984	* Do not call synchronize_rcu() unconditionally as there are drivers
				985	* which create/delete request queue hundreds of times during scan/boot
				986	* and synchronize_rcu() can take significant time and slow down boot.
				987	*/
				988	if (wait)
				989	synchronize_rcu();
				990	throtl_td_free(td);
				991	}
				992
				993	static int __init throtl_init(void)
				994	{
				995	blkio_policy_register(&blkio_policy_throtl);
				996	return 0;
				997	}
				998
				999	module_init(throtl_init);