Blame - block/blk-wbt.c - SHIFTPHONES/kernel/shift/mainline

blob: aafe5b5512245458a251fe984d82355e83f29693 [file] [log] [blame]

Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	1	/*
				2	* buffered writeback throttling. loosely based on CoDel. We can't drop
				3	* packets for IO scheduling, so the logic is something like this:
				4	*
				5	* - Monitor latencies in a defined window of time.
				6	* - If the minimum latency in the above window exceeds some target, increment
				7	* scaling step and scale down queue depth by a factor of 2x. The monitoring
				8	* window is then shrunk to 100 / sqrt(scaling step + 1).
				9	* - For any window where we don't have solid data on what the latencies
				10	* look like, retain status quo.
				11	* - If latencies look good, decrement scaling step.
				12	* - If we're only doing writes, allow the scaling step to go negative. This
				13	* will temporarily boost write performance, snapping back to a stable
				14	* scaling step of 0 if reads show up or the heavy writers finish. Unlike
				15	* positive scaling steps where we shrink the monitoring window, a negative
				16	* scaling step retains the default step==0 window size.
				17	*
				18	* Copyright (C) 2016 Jens Axboe
				19	*
				20	*/
				21	#include <linux/kernel.h>
				22	#include <linux/blk_types.h>
				23	#include <linux/slab.h>
				24	#include <linux/backing-dev.h>
				25	#include <linux/swap.h>
				26
				27	#include "blk-wbt.h"
				28
				29	#define CREATE_TRACE_POINTS
				30	#include <trace/events/wbt.h>
				31
				32	enum {
				33	/*
				34	* Default setting, we'll scale up (to 75% of QD max) or down (min 1)
				35	* from here depending on device stats
				36	*/
				37	RWB_DEF_DEPTH = 16,
				38
				39	/*
				40	* 100msec window
				41	*/
				42	RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
				43
				44	/*
				45	* Disregard stats, if we don't meet this minimum
				46	*/
				47	RWB_MIN_WRITE_SAMPLES = 3,
				48
				49	/*
				50	* If we have this number of consecutive windows with not enough
				51	* information to scale up or down, scale up.
				52	*/
				53	RWB_UNKNOWN_BUMP = 5,
				54	};
				55
				56	static inline bool rwb_enabled(struct rq_wb *rwb)
				57	{
				58	return rwb && rwb->wb_normal != 0;
				59	}
				60
				61	/*
				62	* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
				63	* false if 'v' + 1 would be bigger than 'below'.
				64	*/
				65	static bool atomic_inc_below(atomic_t *v, int below)
				66	{
				67	int cur = atomic_read(v);
				68
				69	for (;;) {
				70	int old;
				71
				72	if (cur >= below)
				73	return false;
				74	old = atomic_cmpxchg(v, cur, cur + 1);
				75	if (old == cur)
				76	break;
				77	cur = old;
				78	}
				79
				80	return true;
				81	}
				82
				83	static void wb_timestamp(struct rq_wb rwb, unsigned long var)
				84	{
				85	if (rwb_enabled(rwb)) {
				86	const unsigned long cur = jiffies;
				87
				88	if (cur != *var)
				89	*var = cur;
				90	}
				91	}
				92
				93	/*
				94	* If a task was rate throttled in balance_dirty_pages() within the last
				95	* second or so, use that to indicate a higher cleaning rate.
				96	*/
				97	static bool wb_recent_wait(struct rq_wb *rwb)
				98	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	99	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	100
				101	return time_before(jiffies, wb->dirty_sleep + HZ);
				102	}
				103
				104	static inline struct rq_wait get_rq_wait(struct rq_wb rwb, bool is_kswapd)
				105	{
				106	return &rwb->rq_wait[is_kswapd];
				107	}
				108
				109	static void rwb_wake_all(struct rq_wb *rwb)
				110	{
				111	int i;
				112
				113	for (i = 0; i < WBT_NUM_RWQ; i++) {
				114	struct rq_wait *rqw = &rwb->rq_wait[i];
				115
				116	if (waitqueue_active(&rqw->wait))
				117	wake_up_all(&rqw->wait);
				118	}
				119	}
				120
				121	void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
				122	{
				123	struct rq_wait *rqw;
				124	int inflight, limit;
				125
				126	if (!(wb_acct & WBT_TRACKED))
				127	return;
				128
				129	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
				130	inflight = atomic_dec_return(&rqw->inflight);
				131
				132	/*
				133	* wbt got disabled with IO in flight. Wake up any potential
				134	* waiters, we don't have to do more than that.
				135	*/
				136	if (unlikely(!rwb_enabled(rwb))) {
				137	rwb_wake_all(rwb);
				138	return;
				139	}
				140
				141	/*
				142	* If the device does write back caching, drop further down
				143	* before we wake people up.
				144	*/
				145	if (rwb->wc && !wb_recent_wait(rwb))
				146	limit = 0;
				147	else
				148	limit = rwb->wb_normal;
				149
				150	/*
				151	* Don't wake anyone up if we are above the normal limit.
				152	*/
				153	if (inflight && inflight >= limit)
				154	return;
				155
				156	if (waitqueue_active(&rqw->wait)) {
				157	int diff = limit - inflight;
				158
				159	if (!inflight \|\| diff >= rwb->wb_background / 2)
				160	wake_up_all(&rqw->wait);
				161	}
				162	}
				163
				164	/*
				165	* Called on completion of a request. Note that it's also called when
				166	* a request is merged, when the request gets freed.
				167	*/
				168	void wbt_done(struct rq_wb rwb, struct blk_issue_stat stat)
				169	{
				170	if (!rwb)
				171	return;
				172
				173	if (!wbt_is_tracked(stat)) {
				174	if (rwb->sync_cookie == stat) {
				175	rwb->sync_issue = 0;
				176	rwb->sync_cookie = NULL;
				177	}
				178
				179	if (wbt_is_read(stat))
				180	wb_timestamp(rwb, &rwb->last_comp);
				181	wbt_clear_state(stat);
				182	} else {
				183	WARN_ON_ONCE(stat == rwb->sync_cookie);
				184	__wbt_done(rwb, wbt_stat_to_mask(stat));
				185	wbt_clear_state(stat);
				186	}
				187	}
				188
				189	/*
				190	* Return true, if we can't increase the depth further by scaling
				191	*/
				192	static bool calc_wb_limits(struct rq_wb *rwb)
				193	{
				194	unsigned int depth;
				195	bool ret = false;
				196
				197	if (!rwb->min_lat_nsec) {
				198	rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
				199	return false;
				200	}
				201
				202	/*
				203	* For QD=1 devices, this is a special case. It's important for those
				204	* to have one request ready when one completes, so force a depth of
				205	* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
				206	* since the device can't have more than that in flight. If we're
				207	* scaling down, then keep a setting of 1/1/1.
				208	*/
				209	if (rwb->queue_depth == 1) {
				210	if (rwb->scale_step > 0)
				211	rwb->wb_max = rwb->wb_normal = 1;
				212	else {
				213	rwb->wb_max = rwb->wb_normal = 2;
				214	ret = true;
				215	}
				216	rwb->wb_background = 1;
				217	} else {
				218	/*
				219	* scale_step == 0 is our default state. If we have suffered
				220	* latency spikes, step will be > 0, and we shrink the
				221	* allowed write depths. If step is < 0, we're only doing
				222	* writes, and we allow a temporarily higher depth to
				223	* increase performance.
				224	*/
				225	depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
				226	if (rwb->scale_step > 0)
				227	depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
				228	else if (rwb->scale_step < 0) {
				229	unsigned int maxd = 3 * rwb->queue_depth / 4;
				230
				231	depth = 1 + ((depth - 1) << -rwb->scale_step);
				232	if (depth > maxd) {
				233	depth = maxd;
				234	ret = true;
				235	}
				236	}
				237
				238	/*
				239	* Set our max/normal/bg queue depths based on how far
				240	* we have scaled down (->scale_step).
				241	*/
				242	rwb->wb_max = depth;
				243	rwb->wb_normal = (rwb->wb_max + 1) / 2;
				244	rwb->wb_background = (rwb->wb_max + 3) / 4;
				245	}
				246
				247	return ret;
				248	}
				249
Arnd Bergmann	4121d38	2016-11-16 16:29:57 +0100	[diff] [blame]	250	static inline bool stat_sample_valid(struct blk_rq_stat *stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	251	{
				252	/*
				253	* We need at least one read sample, and a minimum of
				254	* RWB_MIN_WRITE_SAMPLES. We require some write samples to know
				255	* that it's writes impacting us, and not just some sole read on
				256	* a device that is in a lower power state.
				257	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame^]	258	return (stat[READ].nr_samples >= 1 &&
				259	stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	260	}
				261
				262	static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
				263	{
				264	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
				265
				266	if (!issue \|\| !rwb->sync_cookie)
				267	return 0;
				268
				269	now = ktime_to_ns(ktime_get());
				270	return now - issue;
				271	}
				272
				273	enum {
				274	LAT_OK = 1,
				275	LAT_UNKNOWN,
				276	LAT_UNKNOWN_WRITES,
				277	LAT_EXCEEDED,
				278	};
				279
				280	static int __latency_exceeded(struct rq_wb rwb, struct blk_rq_stat stat)
				281	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	282	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	283	u64 thislat;
				284
				285	/*
				286	* If our stored sync issue exceeds the window size, or it
				287	* exceeds our min target AND we haven't logged any entries,
				288	* flag the latency as exceeded. wbt works off completion latencies,
				289	* but for a flooded device, a single sync IO can take a long time
				290	* to complete after being issued. If this time exceeds our
				291	* monitoring window AND we didn't see any other completions in that
				292	* window, then count that sync IO as a violation of the latency.
				293	*/
				294	thislat = rwb_sync_issue_lat(rwb);
				295	if (thislat > rwb->cur_win_nsec \|\|
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame^]	296	(thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	297	trace_wbt_lat(bdi, thislat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	298	return LAT_EXCEEDED;
				299	}
				300
				301	/*
				302	* No read/write mix, if stat isn't valid
				303	*/
				304	if (!stat_sample_valid(stat)) {
				305	/*
				306	* If we had writes in this stat window and the window is
				307	* current, we're only doing writes. If a task recently
				308	* waited or still has writes in flights, consider us doing
				309	* just writes as well.
				310	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame^]	311	if ((stat[WRITE].nr_samples && blk_stat_is_current(stat)) \|\|
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	312	wb_recent_wait(rwb) \|\| wbt_inflight(rwb))
				313	return LAT_UNKNOWN_WRITES;
				314	return LAT_UNKNOWN;
				315	}
				316
				317	/*
				318	* If the 'min' latency exceeds our target, step down.
				319	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame^]	320	if (stat[READ].min > rwb->min_lat_nsec) {
				321	trace_wbt_lat(bdi, stat[READ].min);
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	322	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	323	return LAT_EXCEEDED;
				324	}
				325
				326	if (rwb->scale_step)
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	327	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	328
				329	return LAT_OK;
				330	}
				331
				332	static int latency_exceeded(struct rq_wb *rwb)
				333	{
				334	struct blk_rq_stat stat[2];
				335
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	336	blk_queue_stat_get(rwb->queue, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	337	return __latency_exceeded(rwb, stat);
				338	}
				339
				340	static void rwb_trace_step(struct rq_wb rwb, const char msg)
				341	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	342	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	343
				344	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	345	rwb->wb_background, rwb->wb_normal, rwb->wb_max);
				346	}
				347
				348	static void scale_up(struct rq_wb *rwb)
				349	{
				350	/*
				351	* Hit max in previous round, stop here
				352	*/
				353	if (rwb->scaled_max)
				354	return;
				355
				356	rwb->scale_step--;
				357	rwb->unknown_cnt = 0;
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	358	blk_stat_clear(rwb->queue);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	359
				360	rwb->scaled_max = calc_wb_limits(rwb);
				361
				362	rwb_wake_all(rwb);
				363
				364	rwb_trace_step(rwb, "step up");
				365	}
				366
				367	/*
				368	* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
				369	* had a latency violation.
				370	*/
				371	static void scale_down(struct rq_wb *rwb, bool hard_throttle)
				372	{
				373	/*
				374	* Stop scaling down when we've hit the limit. This also prevents
				375	* ->scale_step from going to crazy values, if the device can't
				376	* keep up.
				377	*/
				378	if (rwb->wb_max == 1)
				379	return;
				380
				381	if (rwb->scale_step < 0 && hard_throttle)
				382	rwb->scale_step = 0;
				383	else
				384	rwb->scale_step++;
				385
				386	rwb->scaled_max = false;
				387	rwb->unknown_cnt = 0;
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	388	blk_stat_clear(rwb->queue);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	389	calc_wb_limits(rwb);
				390	rwb_trace_step(rwb, "step down");
				391	}
				392
				393	static void rwb_arm_timer(struct rq_wb *rwb)
				394	{
				395	unsigned long expires;
				396
				397	if (rwb->scale_step > 0) {
				398	/*
				399	* We should speed this up, using some variant of a fast
				400	* integer inverse square root calculation. Since we only do
				401	* this for every window expiration, it's not a huge deal,
				402	* though.
				403	*/
				404	rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
				405	int_sqrt((rwb->scale_step + 1) << 8));
				406	} else {
				407	/*
				408	* For step < 0, we don't want to increase/decrease the
				409	* window size.
				410	*/
				411	rwb->cur_win_nsec = rwb->win_nsec;
				412	}
				413
				414	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
				415	mod_timer(&rwb->window_timer, expires);
				416	}
				417
				418	static void wb_timer_fn(unsigned long data)
				419	{
				420	struct rq_wb rwb = (struct rq_wb ) data;
				421	unsigned int inflight = wbt_inflight(rwb);
				422	int status;
				423
				424	status = latency_exceeded(rwb);
				425
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	426	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	427	inflight);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	428
				429	/*
				430	* If we exceeded the latency target, step down. If we did not,
				431	* step one level up. If we don't know enough to say either exceeded
				432	* or ok, then don't do anything.
				433	*/
				434	switch (status) {
				435	case LAT_EXCEEDED:
				436	scale_down(rwb, true);
				437	break;
				438	case LAT_OK:
				439	scale_up(rwb);
				440	break;
				441	case LAT_UNKNOWN_WRITES:
				442	/*
				443	* We started a the center step, but don't have a valid
				444	* read/write sample, but we do have writes going on.
				445	* Allow step to go negative, to increase write perf.
				446	*/
				447	scale_up(rwb);
				448	break;
				449	case LAT_UNKNOWN:
				450	if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
				451	break;
				452	/*
				453	* We get here when previously scaled reduced depth, and we
				454	* currently don't have a valid read/write sample. For that
				455	* case, slowly return to center state (step == 0).
				456	*/
				457	if (rwb->scale_step > 0)
				458	scale_up(rwb);
				459	else if (rwb->scale_step < 0)
				460	scale_down(rwb, false);
				461	break;
				462	default:
				463	break;
				464	}
				465
				466	/*
				467	* Re-arm timer, if we have IO in flight
				468	*/
				469	if (rwb->scale_step \|\| inflight)
				470	rwb_arm_timer(rwb);
				471	}
				472
				473	void wbt_update_limits(struct rq_wb *rwb)
				474	{
				475	rwb->scale_step = 0;
				476	rwb->scaled_max = false;
				477	calc_wb_limits(rwb);
				478
				479	rwb_wake_all(rwb);
				480	}
				481
				482	static bool close_io(struct rq_wb *rwb)
				483	{
				484	const unsigned long now = jiffies;
				485
				486	return time_before(now, rwb->last_issue + HZ / 10) \|\|
				487	time_before(now, rwb->last_comp + HZ / 10);
				488	}
				489
				490	#define REQ_HIPRIO (REQ_SYNC \| REQ_META \| REQ_PRIO)
				491
				492	static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
				493	{
				494	unsigned int limit;
				495
				496	/*
				497	* At this point we know it's a buffered write. If this is
				498	* kswapd trying to free memory, or REQ_SYNC is set, set, then
				499	* it's WB_SYNC_ALL writeback, and we'll use the max limit for
				500	* that. If the write is marked as a background write, then use
				501	* the idle limit, or go to normal if we haven't had competing
				502	* IO for a bit.
				503	*/
				504	if ((rw & REQ_HIPRIO) \|\| wb_recent_wait(rwb) \|\| current_is_kswapd())
				505	limit = rwb->wb_max;
				506	else if ((rw & REQ_BACKGROUND) \|\| close_io(rwb)) {
				507	/*
				508	* If less than 100ms since we completed unrelated IO,
				509	* limit us to half the depth for background writeback.
				510	*/
				511	limit = rwb->wb_background;
				512	} else
				513	limit = rwb->wb_normal;
				514
				515	return limit;
				516	}
				517
				518	static inline bool may_queue(struct rq_wb rwb, struct rq_wait rqw,
				519	wait_queue_t *wait, unsigned long rw)
				520	{
				521	/*
				522	* inc it here even if disabled, since we'll dec it at completion.
				523	* this only happens if the task was sleeping in __wbt_wait(),
				524	* and someone turned it off at the same time.
				525	*/
				526	if (!rwb_enabled(rwb)) {
				527	atomic_inc(&rqw->inflight);
				528	return true;
				529	}
				530
				531	/*
				532	* If the waitqueue is already active and we are not the next
				533	* in line to be woken up, wait for our turn.
				534	*/
				535	if (waitqueue_active(&rqw->wait) &&
				536	rqw->wait.task_list.next != &wait->task_list)
				537	return false;
				538
				539	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
				540	}
				541
				542	/*
				543	* Block if we will exceed our limit, or if we are currently waiting for
				544	* the timer to kick off queuing again.
				545	*/
				546	static void __wbt_wait(struct rq_wb rwb, unsigned long rw, spinlock_t lock)
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	547	__releases(lock)
				548	__acquires(lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	549	{
				550	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
				551	DEFINE_WAIT(wait);
				552
				553	if (may_queue(rwb, rqw, &wait, rw))
				554	return;
				555
				556	do {
				557	prepare_to_wait_exclusive(&rqw->wait, &wait,
				558	TASK_UNINTERRUPTIBLE);
				559
				560	if (may_queue(rwb, rqw, &wait, rw))
				561	break;
				562
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	563	if (lock) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	564	spin_unlock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	565	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	566	spin_lock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	567	} else
				568	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	569	} while (1);
				570
				571	finish_wait(&rqw->wait, &wait);
				572	}
				573
				574	static inline bool wbt_should_throttle(struct rq_wb rwb, struct bio bio)
				575	{
				576	const int op = bio_op(bio);
				577
				578	/*
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	579	* If not a WRITE, do nothing
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	580	*/
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	581	if (op != REQ_OP_WRITE)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	582	return false;
				583
				584	/*
				585	* Don't throttle WRITE_ODIRECT
				586	*/
				587	if ((bio->bi_opf & (REQ_SYNC \| REQ_IDLE)) == (REQ_SYNC \| REQ_IDLE))
				588	return false;
				589
				590	return true;
				591	}
				592
				593	/*
				594	* Returns true if the IO request should be accounted, false if not.
				595	* May sleep, if we have exceeded the writeback limits. Caller can pass
				596	* in an irq held spinlock, if it holds one when calling this function.
				597	* If we do sleep, we'll release and re-grab it.
				598	*/
Bart Van Assche	f2e0a0b	2017-01-02 09:46:15 -0700	[diff] [blame]	599	enum wbt_flags wbt_wait(struct rq_wb rwb, struct bio bio, spinlock_t *lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	600	{
				601	unsigned int ret = 0;
				602
				603	if (!rwb_enabled(rwb))
				604	return 0;
				605
				606	if (bio_op(bio) == REQ_OP_READ)
				607	ret = WBT_READ;
				608
				609	if (!wbt_should_throttle(rwb, bio)) {
				610	if (ret & WBT_READ)
				611	wb_timestamp(rwb, &rwb->last_issue);
				612	return ret;
				613	}
				614
				615	__wbt_wait(rwb, bio->bi_opf, lock);
				616
				617	if (!timer_pending(&rwb->window_timer))
				618	rwb_arm_timer(rwb);
				619
				620	if (current_is_kswapd())
				621	ret \|= WBT_KSWAPD;
				622
				623	return ret \| WBT_TRACKED;
				624	}
				625
				626	void wbt_issue(struct rq_wb rwb, struct blk_issue_stat stat)
				627	{
				628	if (!rwb_enabled(rwb))
				629	return;
				630
				631	/*
				632	* Track sync issue, in case it takes a long time to complete. Allows
				633	* us to react quicker, if a sync IO takes a long time to complete.
				634	* Note that this is just a hint. 'stat' can go away when the
				635	* request completes, so it's important we never dereference it. We
				636	* only use the address to compare with, which is why we store the
				637	* sync_issue time locally.
				638	*/
				639	if (wbt_is_read(stat) && !rwb->sync_issue) {
				640	rwb->sync_cookie = stat;
				641	rwb->sync_issue = blk_stat_time(stat);
				642	}
				643	}
				644
				645	void wbt_requeue(struct rq_wb rwb, struct blk_issue_stat stat)
				646	{
				647	if (!rwb_enabled(rwb))
				648	return;
				649	if (stat == rwb->sync_cookie) {
				650	rwb->sync_issue = 0;
				651	rwb->sync_cookie = NULL;
				652	}
				653	}
				654
				655	void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
				656	{
				657	if (rwb) {
				658	rwb->queue_depth = depth;
				659	wbt_update_limits(rwb);
				660	}
				661	}
				662
				663	void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
				664	{
				665	if (rwb)
				666	rwb->wc = write_cache_on;
				667	}
				668
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	669	/*
				670	* Disable wbt, if enabled by default. Only called from CFQ, if we have
				671	* cgroups enabled
				672	*/
				673	void wbt_disable_default(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	674	{
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	675	struct rq_wb *rwb = q->rq_wb;
				676
Jens Axboe	d62118b	2016-11-28 09:40:34 -0700	[diff] [blame]	677	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	678	del_timer_sync(&rwb->window_timer);
				679	rwb->win_nsec = rwb->min_lat_nsec = 0;
				680	wbt_update_limits(rwb);
				681	}
				682	}
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	683	EXPORT_SYMBOL_GPL(wbt_disable_default);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	684
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	685	u64 wbt_default_latency_nsec(struct request_queue *q)
				686	{
				687	/*
				688	* We default to 2msec for non-rotational storage, and 75msec
				689	* for rotational storage.
				690	*/
				691	if (blk_queue_nonrot(q))
				692	return 2000000ULL;
				693	else
				694	return 75000000ULL;
				695	}
				696
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	697	int wbt_init(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	698	{
				699	struct rq_wb *rwb;
				700	int i;
				701
				702	/*
				703	* For now, we depend on the stats window being larger than
				704	* our monitoring window. Ensure that this isn't inadvertently
				705	* violated.
				706	*/
				707	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
				708	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
				709
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	710	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
				711	if (!rwb)
				712	return -ENOMEM;
				713
				714	for (i = 0; i < WBT_NUM_RWQ; i++) {
				715	atomic_set(&rwb->rq_wait[i].inflight, 0);
				716	init_waitqueue_head(&rwb->rq_wait[i].wait);
				717	}
				718
				719	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
				720	rwb->wc = 1;
				721	rwb->queue_depth = RWB_DEF_DEPTH;
				722	rwb->last_comp = rwb->last_issue = jiffies;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	723	rwb->queue = q;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	724	rwb->win_nsec = RWB_WINDOW_NSEC;
Jens Axboe	d62118b	2016-11-28 09:40:34 -0700	[diff] [blame]	725	rwb->enable_state = WBT_STATE_ON_DEFAULT;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	726	wbt_update_limits(rwb);
				727
				728	/*
				729	* Assign rwb, and turn on stats tracking for this queue
				730	*/
				731	q->rq_wb = rwb;
				732	blk_stat_enable(q);
				733
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	734	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	735
				736	wbt_set_queue_depth(rwb, blk_queue_depth(q));
				737	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
				738
				739	return 0;
				740	}
				741
				742	void wbt_exit(struct request_queue *q)
				743	{
				744	struct rq_wb *rwb = q->rq_wb;
				745
				746	if (rwb) {
				747	del_timer_sync(&rwb->window_timer);
				748	q->rq_wb = NULL;
				749	kfree(rwb);
				750	}
				751	}