/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice, after which the slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

struct throtl_rb_root {
        struct rb_root rb;
        struct rb_node *left;
        unsigned int count;
        unsigned long min_disptime;
};

#define THROTL_RB_ROOT  (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
                        .count = 0, .min_disptime = 0}

#define rb_entry_tg(node)       rb_entry((node), struct throtl_grp, rb_node)

struct throtl_grp {
        /* List of throtl groups on the request queue */
        struct hlist_node tg_node;

        /* active throtl group service_tree member */
        struct rb_node rb_node;

        /*
         * Dispatch time in jiffies. This is the estimated time when the
         * group will unthrottle and be ready to dispatch more bios. It is
         * used as the key to sort active groups in the service tree.
         */
        unsigned long disptime;

        struct blkio_group blkg;
        atomic_t ref;
        unsigned int flags;

        /* Two lists for READ and WRITE */
        struct bio_list bio_lists[2];

        /* Number of queued bios on READ and WRITE lists */
        unsigned int nr_queued[2];

        /* bytes per second rate limits */
        uint64_t bps[2];

        /* Number of bytes dispatched in current slice */
        uint64_t bytes_disp[2];

        /* When did we start a new slice */
        unsigned long slice_start[2];
        unsigned long slice_end[2];
};

struct throtl_data {
        /* List of throtl groups */
        struct hlist_head tg_list;

        /* service tree for active throtl groups */
        struct throtl_rb_root tg_service_tree;

        struct throtl_grp root_tg;
        struct request_queue *queue;

        /* Total Number of queued bios on READ and WRITE lists */
        unsigned int nr_queued[2];

        /* number of total undestroyed groups (excluding root group) */
        unsigned int nr_undestroyed_grps;

        /* Work for dispatching throttled bios */
        struct delayed_work throtl_work;
};

enum tg_state_flags {
        THROTL_TG_FLAG_on_rr = 0,       /* on round-robin busy list */
};

#define THROTL_TG_FNS(name)                                             \
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)        \
{                                                                       \
        (tg)->flags |= (1 << THROTL_TG_FLAG_##name);                    \
}                                                                       \
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)       \
{                                                                       \
        (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);                   \
}                                                                       \
static inline int throtl_tg_##name(const struct throtl_grp *tg)        \
{                                                                       \
        return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;       \
}

THROTL_TG_FNS(on_rr);
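/*
 * The invocation above expands the macro into three helpers for the on_rr
 * flag: throtl_mark_tg_on_rr() sets the bit, throtl_clear_tg_on_rr()
 * clears it, and throtl_tg_on_rr() tests it (returning 0 or 1).
 */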

#define throtl_log_tg(td, tg, fmt, args...)                             \
        blk_add_trace_msg((td)->queue, "throtl %s " fmt,                \
                        blkg_path(&(tg)->blkg), ##args);

#define throtl_log(td, fmt, args...)    \
        blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg)
{
        if (blkg)
                return container_of(blkg, struct throtl_grp, blkg);

        return NULL;
}

static inline int total_nr_queued(struct throtl_data *td)
{
        return (td->nr_queued[0] + td->nr_queued[1]);
}

static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg)
{
        atomic_inc(&tg->ref);
        return tg;
}

static void throtl_put_tg(struct throtl_grp *tg)
{
        BUG_ON(atomic_read(&tg->ref) <= 0);
        if (!atomic_dec_and_test(&tg->ref))
                return;
        kfree(tg);
}

static struct throtl_grp *throtl_find_alloc_tg(struct throtl_data *td,
                        struct cgroup *cgroup)
{
        struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
        struct throtl_grp *tg = NULL;
        void *key = td;
        struct backing_dev_info *bdi = &td->queue->backing_dev_info;
        unsigned int major, minor;

        /*
         * TODO: Speed up blkiocg_lookup_group() by maintaining a radix
         * tree of blkgs (instead of traversing the hash list every time).
         */
        tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key));

        /* Fill in device details for root group */
        if (tg && !tg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
                sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
                tg->blkg.dev = MKDEV(major, minor);
                goto done;
        }

        if (tg)
                goto done;

        tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
        if (!tg)
                goto done;

        INIT_HLIST_NODE(&tg->tg_node);
        RB_CLEAR_NODE(&tg->rb_node);
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);

        /*
         * Take the initial reference that will be released on destroy.
         * This can be thought of as a joint reference by the cgroup and
         * the request queue, which will be dropped by either the request
         * queue exit or the cgroup deletion path, depending on who exits
         * first.
         */
        atomic_set(&tg->ref, 1);

        /* Add group onto cgroup list */
        sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
        blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td,
                                MKDEV(major, minor), BLKIO_POLICY_THROTL);

        tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev);
        tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev);

        hlist_add_head(&tg->tg_node, &td->tg_list);
        td->nr_undestroyed_grps++;
done:
        return tg;
}

static struct throtl_grp *throtl_get_tg(struct throtl_data *td)
{
        struct cgroup *cgroup;
        struct throtl_grp *tg = NULL;

        rcu_read_lock();
        cgroup = task_cgroup(current, blkio_subsys_id);
        tg = throtl_find_alloc_tg(td, cgroup);
        if (!tg)
                tg = &td->root_tg;
        rcu_read_unlock();
        return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
        /* Service tree is empty */
        if (!root->count)
                return NULL;

        if (!root->left)
                root->left = rb_first(&root->rb);

        if (root->left)
                return rb_entry_tg(root->left);

        return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
        rb_erase(n, root);
        RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
        if (root->left == n)
                root->left = NULL;
        rb_erase_init(n, &root->rb);
        --root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
        struct throtl_grp *tg;

        tg = throtl_rb_first(st);
        if (!tg)
                return;

        st->min_disptime = tg->disptime;
}

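/*
 * Insert a group into the service tree, keyed by its dispatch time.
 * st->left caches the leftmost (earliest deadline) node so that
 * throtl_rb_first() stays O(1).
 */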
static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
        struct rb_node **node = &st->rb.rb_node;
        struct rb_node *parent = NULL;
        struct throtl_grp *__tg;
        unsigned long key = tg->disptime;
        int left = 1;

        while (*node != NULL) {
                parent = *node;
                __tg = rb_entry_tg(parent);

                if (time_before(key, __tg->disptime))
                        node = &parent->rb_left;
                else {
                        node = &parent->rb_right;
                        left = 0;
                }
        }

        if (left)
                st->left = &tg->rb_node;

        rb_link_node(&tg->rb_node, parent, node);
        rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
        struct throtl_rb_root *st = &td->tg_service_tree;

        tg_service_tree_add(st, tg);
        throtl_mark_tg_on_rr(tg);
        st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
        if (!throtl_tg_on_rr(tg))
                __throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
        throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
        throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
        if (throtl_tg_on_rr(tg))
                __throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
        struct throtl_rb_root *st = &td->tg_service_tree;

        /*
         * If there are more bios pending, schedule more work.
         */
        if (!total_nr_queued(td))
                return;

        BUG_ON(!st->count);

        update_min_dispatch_time(st);

        if (time_before_eq(st->min_disptime, jiffies))
                throtl_schedule_delayed_work(td->queue, 0);
        else
                throtl_schedule_delayed_work(td->queue,
                                (st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
        tg->bytes_disp[rw] = 0;
        tg->slice_start[rw] = jiffies;
        tg->slice_end[rw] = jiffies + throtl_slice;
        throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
}

static inline void throtl_extend_slice(struct throtl_data *td,
                struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
        tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
        throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', tg->slice_start[rw],
                        tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
        if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
                return false;

        return true;
}

/* Trim the used slices and adjust slice start accordingly */
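/*
 * A worked example (a sketch, assuming HZ=1000, so throtl_slice is 100
 * jiffies): with bps = 1000000 and 250 jiffies elapsed since slice_start,
 * nr_slices = 250 / 100 = 2 and bytes_trim = 1000000 * 100 * 2 / 1000 =
 * 200000 bytes, so slice_start advances by 200 jiffies.
 */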
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
        unsigned long nr_slices, bytes_trim, time_elapsed;

        BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

        /*
         * If bps is unlimited (-1), the time slice doesn't get renewed.
         * Don't try to trim the slice if it has already expired; a new
         * slice will start when appropriate.
         */
        if (throtl_slice_used(td, tg, rw))
                return;

        time_elapsed = jiffies - tg->slice_start[rw];

        nr_slices = time_elapsed / throtl_slice;

        if (!nr_slices)
                return;

        bytes_trim = (tg->bps[rw] * throtl_slice * nr_slices)/HZ;

        if (!bytes_trim)
                return;

        if (tg->bytes_disp[rw] >= bytes_trim)
                tg->bytes_disp[rw] -= bytes_trim;
        else
                tg->bytes_disp[rw] = 0;

        tg->slice_start[rw] += nr_slices * throtl_slice;

        throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%lu"
                        " start=%lu end=%lu jiffies=%lu",
                        rw == READ ? 'R' : 'W', nr_slices, bytes_trim,
                        tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

/*
 * Returns whether one can dispatch a bio or not. Also returns the approximate
 * number of jiffies to wait before this bio is within the IO rate and can be
 * dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
                struct bio *bio, unsigned long *wait)
{
        bool rw = bio_data_dir(bio);
        u64 bytes_allowed, extra_bytes;
        unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

        /*
         * Currently the whole state machine of the group depends on the
         * first bio queued in the group's bio list. So one should not call
         * this function with a different bio if there are other bios
         * queued.
         */
        BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

        /* If tg->bps == -1, then BW is unlimited */
        if (tg->bps[rw] == -1) {
                if (wait)
                        *wait = 0;
                return true;
        }

        /*
         * If the previous slice expired, start a new one; otherwise
         * renew/extend the existing slice to make sure it is at least
         * throtl_slice interval long since now.
         */
        if (throtl_slice_used(td, tg, rw))
                throtl_start_new_slice(td, tg, rw);
        else {
                if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
                        throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
        }

        jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

        /* Slice has just started. Consider one slice interval */
        if (!jiffy_elapsed)
                jiffy_elapsed_rnd = throtl_slice;

        jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

        bytes_allowed = (tg->bps[rw] * jiffies_to_msecs(jiffy_elapsed_rnd))
                                / MSEC_PER_SEC;
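        /*
         * Worked example (a sketch, assuming HZ=1000): with bps = 1000000
         * and one rounded-up slice of 100 jiffies (100ms), bytes_allowed =
         * 1000000 * 100 / 1000 = 100000 bytes for this slice.
         */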

        if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
                if (wait)
                        *wait = 0;
                return true;
        }

        /* Calc approx time to dispatch */
        extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
        jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
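        /*
         * e.g. (assuming HZ=1000): extra_bytes = 50000 at bps = 1000000
         * gives jiffy_wait = 50000 * 1000 / 1000000 = 50 jiffies (50ms).
         */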

        if (!jiffy_wait)
                jiffy_wait = 1;

        /*
         * This wait time doesn't account for the rounding up we did above.
         * Add that time as well.
         */
        jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);

        if (wait)
                *wait = jiffy_wait;

        if (time_before(tg->slice_end[rw], jiffies + jiffy_wait))
                throtl_extend_slice(td, tg, rw, jiffies + jiffy_wait);

        return false;
}

static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
        bool rw = bio_data_dir(bio);
        bool sync = bio->bi_rw & REQ_SYNC;

        /* Charge the bio to the group */
        tg->bytes_disp[rw] += bio->bi_size;

        /*
         * TODO: This will take blkg->stats_lock. Figure out a way
         * to avoid this cost.
         */
        blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
                        struct bio *bio)
{
        bool rw = bio_data_dir(bio);

        bio_list_add(&tg->bio_lists[rw], bio);
        /* Take a bio reference on tg */
        throtl_ref_get_tg(tg);
        tg->nr_queued[rw]++;
        td->nr_queued[rw]++;
        throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
        unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
        struct bio *bio;

        if ((bio = bio_list_peek(&tg->bio_lists[READ])))
                tg_may_dispatch(td, tg, bio, &read_wait);

        if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
                tg_may_dispatch(td, tg, bio, &write_wait);

        min_wait = min(read_wait, write_wait);
        disptime = jiffies + min_wait;

        /*
         * If the group is already on the active tree, never move its
         * dispatch time earlier: requeue it only when the new dispatch
         * time is not before the existing one.
         */
        if (throtl_tg_on_rr(tg) && time_before(disptime, tg->disptime))
                return;

        /* Update dispatch time */
        throtl_dequeue_tg(td, tg);
        tg->disptime = disptime;
        throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
                        bool rw, struct bio_list *bl)
{
        struct bio *bio;

        bio = bio_list_pop(&tg->bio_lists[rw]);
        tg->nr_queued[rw]--;
        /* Drop bio reference on tg */
        throtl_put_tg(tg);

        BUG_ON(td->nr_queued[rw] <= 0);
        td->nr_queued[rw]--;

        throtl_charge_bio(tg, bio);
        bio_list_add(bl, bio);
        bio->bi_rw |= REQ_THROTTLED;

        throtl_trim_slice(td, tg, rw);
}

static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
                        struct bio_list *bl)
{
        unsigned int nr_reads = 0, nr_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
        unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
        struct bio *bio;

        /* Try to dispatch 75% READS and 25% WRITES */
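        /*
         * With the default throtl_grp_quantum of 8, that is at most
         * 6 reads and 2 writes per round.
         */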

        while ((bio = bio_list_peek(&tg->bio_lists[READ]))
                        && tg_may_dispatch(td, tg, bio, NULL)) {

                tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
                nr_reads++;

                if (nr_reads >= max_nr_reads)
                        break;
        }

        while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
                        && tg_may_dispatch(td, tg, bio, NULL)) {

                tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
                nr_writes++;

                if (nr_writes >= max_nr_writes)
                        break;
        }

        return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
        unsigned int nr_disp = 0;
        struct throtl_grp *tg;
        struct throtl_rb_root *st = &td->tg_service_tree;

        while (1) {
                tg = throtl_rb_first(st);

                if (!tg)
                        break;

                if (time_before(jiffies, tg->disptime))
                        break;

                throtl_dequeue_tg(td, tg);

                nr_disp += throtl_dispatch_tg(td, tg, bl);

                if (tg->nr_queued[0] || tg->nr_queued[1]) {
                        tg_update_disptime(td, tg);
                        throtl_enqueue_tg(td, tg);
                }

                if (nr_disp >= throtl_quantum)
                        break;
        }

        return nr_disp;
}

/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
        struct throtl_data *td = q->td;
        unsigned int nr_disp = 0;
        struct bio_list bio_list_on_stack;
        struct bio *bio;

        spin_lock_irq(q->queue_lock);

        if (!total_nr_queued(td))
                goto out;

        bio_list_init(&bio_list_on_stack);

        throtl_log(td, "dispatch nr_queued=%d read=%u write=%u",
                        total_nr_queued(td), td->nr_queued[READ],
                        td->nr_queued[WRITE]);

        nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

        if (nr_disp)
                throtl_log(td, "bios disp=%u", nr_disp);

        throtl_schedule_next_dispatch(td);
out:
        spin_unlock_irq(q->queue_lock);

        /*
         * If we dispatched some requests, unplug the queue to make sure
         * they get dispatched immediately.
         */
        if (nr_disp) {
                while ((bio = bio_list_pop(&bio_list_on_stack)))
                        generic_make_request(bio);
                blk_unplug(q);
        }
        return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
        struct throtl_data *td = container_of(work, struct throtl_data,
                                        throtl_work.work);
        struct request_queue *q = td->queue;

        throtl_dispatch(q);
}

/* Call with queue lock held */
void throtl_schedule_delayed_work(struct request_queue *q, unsigned long delay)
{
        struct throtl_data *td = q->td;
        struct delayed_work *dwork = &td->throtl_work;

        if (total_nr_queued(td) > 0) {
                /*
                 * We might already have work scheduled for the future.
                 * Cancel it and schedule a new one.
                 */
                __cancel_delayed_work(dwork);
                kblockd_schedule_delayed_work(q, dwork, delay);
                throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
                                delay, jiffies);
        }
}
EXPORT_SYMBOL(throtl_schedule_delayed_work);

static void
throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg)
{
        /* Something is wrong if we are trying to remove the same group twice */
        BUG_ON(hlist_unhashed(&tg->tg_node));

        hlist_del_init(&tg->tg_node);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, the group can be destroyed.
         */
        throtl_put_tg(tg);
        td->nr_undestroyed_grps--;
}

static void throtl_release_tgs(struct throtl_data *td)
{
        struct hlist_node *pos, *n;
        struct throtl_grp *tg;

        hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) {
                /*
                 * If the cgroup removal path got to the blkio_group first
                 * and removed it from the cgroup list, then it will take
                 * care of destroying the throtl group as well.
                 */
                if (!blkiocg_del_blkio_group(&tg->blkg))
                        throtl_destroy_tg(td, tg);
        }
}

static void throtl_td_free(struct throtl_data *td)
{
        kfree(td);
}

/*
 * Blk cgroup controller notification saying that the blkio_group object is
 * being delinked as the associated cgroup object is going away. That also
 * means that no new IO will come in this group. So get rid of this group as
 * soon as any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). "key" is the rcu protected
 * pointer. That means "key" is a valid throtl_data pointer as long as we
 * hold the rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL: even if the queue was going away, the cgroup
 * deletion path got to it first.
 */
void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
        unsigned long flags;
        struct throtl_data *td = key;

        spin_lock_irqsave(td->queue->queue_lock, flags);
        throtl_destroy_tg(td, tg_of_blkg(blkg));
        spin_unlock_irqrestore(td->queue->queue_lock, flags);
}

static void throtl_update_blkio_group_read_bps(struct blkio_group *blkg,
                        u64 read_bps)
{
        tg_of_blkg(blkg)->bps[READ] = read_bps;
}

static void throtl_update_blkio_group_write_bps(struct blkio_group *blkg,
                        u64 write_bps)
{
        tg_of_blkg(blkg)->bps[WRITE] = write_bps;
}

void throtl_shutdown_timer_wq(struct request_queue *q)
{
        struct throtl_data *td = q->td;

        cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkio_policy_type blkio_policy_throtl = {
        .ops = {
                .blkio_unlink_group_fn = throtl_unlink_blkio_group,
                .blkio_update_group_read_bps_fn =
                                        throtl_update_blkio_group_read_bps,
                .blkio_update_group_write_bps_fn =
                                        throtl_update_blkio_group_write_bps,
        },
};

int blk_throtl_bio(struct request_queue *q, struct bio **biop)
{
        struct throtl_data *td = q->td;
        struct throtl_grp *tg;
        struct bio *bio = *biop;
        bool rw = bio_data_dir(bio), update_disptime = true;

        if (bio->bi_rw & REQ_THROTTLED) {
                bio->bi_rw &= ~REQ_THROTTLED;
                return 0;
        }

        spin_lock_irq(q->queue_lock);
        tg = throtl_get_tg(td);

        if (tg->nr_queued[rw]) {
                /*
                 * There is already another bio queued in the same
                 * direction. No need to update dispatch time.
                 */
                update_disptime = false;
                goto queue_bio;
        }

        /* Bio is within the rate limit of the group */
        if (tg_may_dispatch(td, tg, bio, NULL)) {
                throtl_charge_bio(tg, bio);
                goto out;
        }

queue_bio:
        throtl_log_tg(td, tg, "[%c] bio. disp=%llu sz=%u bps=%llu"
                        " queued=%d/%d", rw == READ ? 'R' : 'W',
                        tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
                        tg->nr_queued[READ], tg->nr_queued[WRITE]);

        throtl_add_bio_tg(q->td, tg, bio);
        *biop = NULL;

        if (update_disptime) {
                tg_update_disptime(td, tg);
                throtl_schedule_next_dispatch(td);
        }

out:
        spin_unlock_irq(q->queue_lock);
        return 0;
}

int blk_throtl_init(struct request_queue *q)
{
        struct throtl_data *td;
        struct throtl_grp *tg;

        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;

        INIT_HLIST_HEAD(&td->tg_list);
        td->tg_service_tree = THROTL_RB_ROOT;

        /* Init root group */
        tg = &td->root_tg;
        INIT_HLIST_NODE(&tg->tg_node);
        RB_CLEAR_NODE(&tg->rb_node);
        bio_list_init(&tg->bio_lists[0]);
        bio_list_init(&tg->bio_lists[1]);

        /* Practically unlimited BW */
        tg->bps[0] = tg->bps[1] = -1;
        atomic_set(&tg->ref, 1);

        INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

        rcu_read_lock();
        blkiocg_add_blkio_group(&blkio_root_cgroup, &tg->blkg, (void *)td,
                                0, BLKIO_POLICY_THROTL);
        rcu_read_unlock();

        /* Attach throtl data to request queue */
        td->queue = q;
        q->td = td;
        return 0;
}

void blk_throtl_exit(struct request_queue *q)
{
        struct throtl_data *td = q->td;
        bool wait = false;

        BUG_ON(!td);

        throtl_shutdown_timer_wq(q);

        spin_lock_irq(q->queue_lock);
        throtl_release_tgs(td);
        blkiocg_del_blkio_group(&td->root_tg.blkg);

        /* If there are other groups */
        if (td->nr_undestroyed_grps >= 1)
                wait = true;

        spin_unlock_irq(q->queue_lock);

        /*
         * Wait for tg->blkg->key accessors to exit their grace periods.
         * Do this wait only if there are other undestroyed groups out
         * there (other than the root group). This can happen if the cgroup
         * deletion path claimed the responsibility of cleaning up a group
         * before the queue cleanup code got to the group.
         *
         * Do not call synchronize_rcu() unconditionally as there are
         * drivers which create/delete request queues hundreds of times
         * during scan/boot, and synchronize_rcu() can take significant
         * time and slow down boot.
         */
        if (wait)
                synchronize_rcu();
        throtl_td_free(td);
}

static int __init throtl_init(void)
{
        blkio_policy_register(&blkio_policy_throtl);
        return 0;
}

module_init(throtl_init);
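
/*
 * Usage sketch. The cgroup file names below come from the matching
 * blk-cgroup patches and are an assumption here; verify against your tree:
 *
 *	# mount -t cgroup -o blkio none /cgroup
 *	# mkdir /cgroup/test
 *	# echo "8:16 1048576" > /cgroup/test/blkio.throttle.read_bps_device
 *	# echo $$ > /cgroup/test/tasks
 *
 * Reads issued by this shell against device 8:16 are then throttled to
 * roughly 1MB/s (1048576 bytes/sec) by the logic in this file.
 */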