Blame - block/blk-wbt.c - SHIFTPHONES/mainline/linux

blob: d822530e6aeade81a7c9b2b3d9c6a6cccc0bb351 [file] [log] [blame]

Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	1	/*
				2	* buffered writeback throttling. loosely based on CoDel. We can't drop
				3	* packets for IO scheduling, so the logic is something like this:
				4	*
				5	* - Monitor latencies in a defined window of time.
				6	* - If the minimum latency in the above window exceeds some target, increment
				7	* scaling step and scale down queue depth by a factor of 2x. The monitoring
				8	* window is then shrunk to 100 / sqrt(scaling step + 1).
				9	* - For any window where we don't have solid data on what the latencies
				10	* look like, retain status quo.
				11	* - If latencies look good, decrement scaling step.
				12	* - If we're only doing writes, allow the scaling step to go negative. This
				13	* will temporarily boost write performance, snapping back to a stable
				14	* scaling step of 0 if reads show up or the heavy writers finish. Unlike
				15	* positive scaling steps where we shrink the monitoring window, a negative
				16	* scaling step retains the default step==0 window size.
				17	*
				18	* Copyright (C) 2016 Jens Axboe
				19	*
				20	*/
				21	#include <linux/kernel.h>
				22	#include <linux/blk_types.h>
				23	#include <linux/slab.h>
				24	#include <linux/backing-dev.h>
				25	#include <linux/swap.h>
				26
				27	#include "blk-wbt.h"
				28
				29	#define CREATE_TRACE_POINTS
				30	#include <trace/events/wbt.h>
				31
				32	enum {
				33	/*
				34	* Default setting, we'll scale up (to 75% of QD max) or down (min 1)
				35	* from here depending on device stats
				36	*/
				37	RWB_DEF_DEPTH = 16,
				38
				39	/*
				40	* 100msec window
				41	*/
				42	RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
				43
				44	/*
				45	* Disregard stats, if we don't meet this minimum
				46	*/
				47	RWB_MIN_WRITE_SAMPLES = 3,
				48
				49	/*
				50	* If we have this number of consecutive windows with not enough
				51	* information to scale up or down, scale up.
				52	*/
				53	RWB_UNKNOWN_BUMP = 5,
				54	};
				55
				56	static inline bool rwb_enabled(struct rq_wb *rwb)
				57	{
				58	return rwb && rwb->wb_normal != 0;
				59	}
				60
				61	/*
				62	* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
				63	* false if 'v' + 1 would be bigger than 'below'.
				64	*/
				65	static bool atomic_inc_below(atomic_t *v, int below)
				66	{
				67	int cur = atomic_read(v);
				68
				69	for (;;) {
				70	int old;
				71
				72	if (cur >= below)
				73	return false;
				74	old = atomic_cmpxchg(v, cur, cur + 1);
				75	if (old == cur)
				76	break;
				77	cur = old;
				78	}
				79
				80	return true;
				81	}
				82
				83	static void wb_timestamp(struct rq_wb rwb, unsigned long var)
				84	{
				85	if (rwb_enabled(rwb)) {
				86	const unsigned long cur = jiffies;
				87
				88	if (cur != *var)
				89	*var = cur;
				90	}
				91	}
				92
				93	/*
				94	* If a task was rate throttled in balance_dirty_pages() within the last
				95	* second or so, use that to indicate a higher cleaning rate.
				96	*/
				97	static bool wb_recent_wait(struct rq_wb *rwb)
				98	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	99	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	100
				101	return time_before(jiffies, wb->dirty_sleep + HZ);
				102	}
				103
				104	static inline struct rq_wait get_rq_wait(struct rq_wb rwb, bool is_kswapd)
				105	{
				106	return &rwb->rq_wait[is_kswapd];
				107	}
				108
				109	static void rwb_wake_all(struct rq_wb *rwb)
				110	{
				111	int i;
				112
				113	for (i = 0; i < WBT_NUM_RWQ; i++) {
				114	struct rq_wait *rqw = &rwb->rq_wait[i];
				115
				116	if (waitqueue_active(&rqw->wait))
				117	wake_up_all(&rqw->wait);
				118	}
				119	}
				120
				121	void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
				122	{
				123	struct rq_wait *rqw;
				124	int inflight, limit;
				125
				126	if (!(wb_acct & WBT_TRACKED))
				127	return;
				128
				129	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
				130	inflight = atomic_dec_return(&rqw->inflight);
				131
				132	/*
				133	* wbt got disabled with IO in flight. Wake up any potential
				134	* waiters, we don't have to do more than that.
				135	*/
				136	if (unlikely(!rwb_enabled(rwb))) {
				137	rwb_wake_all(rwb);
				138	return;
				139	}
				140
				141	/*
				142	* If the device does write back caching, drop further down
				143	* before we wake people up.
				144	*/
				145	if (rwb->wc && !wb_recent_wait(rwb))
				146	limit = 0;
				147	else
				148	limit = rwb->wb_normal;
				149
				150	/*
				151	* Don't wake anyone up if we are above the normal limit.
				152	*/
				153	if (inflight && inflight >= limit)
				154	return;
				155
				156	if (waitqueue_active(&rqw->wait)) {
				157	int diff = limit - inflight;
				158
				159	if (!inflight \|\| diff >= rwb->wb_background / 2)
				160	wake_up_all(&rqw->wait);
				161	}
				162	}
				163
				164	/*
				165	* Called on completion of a request. Note that it's also called when
				166	* a request is merged, when the request gets freed.
				167	*/
				168	void wbt_done(struct rq_wb rwb, struct blk_issue_stat stat)
				169	{
				170	if (!rwb)
				171	return;
				172
				173	if (!wbt_is_tracked(stat)) {
				174	if (rwb->sync_cookie == stat) {
				175	rwb->sync_issue = 0;
				176	rwb->sync_cookie = NULL;
				177	}
				178
				179	if (wbt_is_read(stat))
				180	wb_timestamp(rwb, &rwb->last_comp);
				181	wbt_clear_state(stat);
				182	} else {
				183	WARN_ON_ONCE(stat == rwb->sync_cookie);
				184	__wbt_done(rwb, wbt_stat_to_mask(stat));
				185	wbt_clear_state(stat);
				186	}
				187	}
				188
				189	/*
				190	* Return true, if we can't increase the depth further by scaling
				191	*/
				192	static bool calc_wb_limits(struct rq_wb *rwb)
				193	{
				194	unsigned int depth;
				195	bool ret = false;
				196
				197	if (!rwb->min_lat_nsec) {
				198	rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
				199	return false;
				200	}
				201
				202	/*
				203	* For QD=1 devices, this is a special case. It's important for those
				204	* to have one request ready when one completes, so force a depth of
				205	* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
				206	* since the device can't have more than that in flight. If we're
				207	* scaling down, then keep a setting of 1/1/1.
				208	*/
				209	if (rwb->queue_depth == 1) {
				210	if (rwb->scale_step > 0)
				211	rwb->wb_max = rwb->wb_normal = 1;
				212	else {
				213	rwb->wb_max = rwb->wb_normal = 2;
				214	ret = true;
				215	}
				216	rwb->wb_background = 1;
				217	} else {
				218	/*
				219	* scale_step == 0 is our default state. If we have suffered
				220	* latency spikes, step will be > 0, and we shrink the
				221	* allowed write depths. If step is < 0, we're only doing
				222	* writes, and we allow a temporarily higher depth to
				223	* increase performance.
				224	*/
				225	depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
				226	if (rwb->scale_step > 0)
				227	depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
				228	else if (rwb->scale_step < 0) {
				229	unsigned int maxd = 3 * rwb->queue_depth / 4;
				230
				231	depth = 1 + ((depth - 1) << -rwb->scale_step);
				232	if (depth > maxd) {
				233	depth = maxd;
				234	ret = true;
				235	}
				236	}
				237
				238	/*
				239	* Set our max/normal/bg queue depths based on how far
				240	* we have scaled down (->scale_step).
				241	*/
				242	rwb->wb_max = depth;
				243	rwb->wb_normal = (rwb->wb_max + 1) / 2;
				244	rwb->wb_background = (rwb->wb_max + 3) / 4;
				245	}
				246
				247	return ret;
				248	}
				249
Arnd Bergmann	4121d38	2016-11-16 16:29:57 +0100	[diff] [blame]	250	static inline bool stat_sample_valid(struct blk_rq_stat *stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	251	{
				252	/*
				253	* We need at least one read sample, and a minimum of
				254	* RWB_MIN_WRITE_SAMPLES. We require some write samples to know
				255	* that it's writes impacting us, and not just some sole read on
				256	* a device that is in a lower power state.
				257	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	258	return (stat[READ].nr_samples >= 1 &&
				259	stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	260	}
				261
				262	static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
				263	{
Mark Rutland	6aa7de0	2017-10-23 14:07:29 -0700	[diff] [blame^]	264	u64 now, issue = READ_ONCE(rwb->sync_issue);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	265
				266	if (!issue \|\| !rwb->sync_cookie)
				267	return 0;
				268
				269	now = ktime_to_ns(ktime_get());
				270	return now - issue;
				271	}
				272
				273	enum {
				274	LAT_OK = 1,
				275	LAT_UNKNOWN,
				276	LAT_UNKNOWN_WRITES,
				277	LAT_EXCEEDED,
				278	};
				279
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	280	static int latency_exceeded(struct rq_wb rwb, struct blk_rq_stat stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	281	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	282	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	283	u64 thislat;
				284
				285	/*
				286	* If our stored sync issue exceeds the window size, or it
				287	* exceeds our min target AND we haven't logged any entries,
				288	* flag the latency as exceeded. wbt works off completion latencies,
				289	* but for a flooded device, a single sync IO can take a long time
				290	* to complete after being issued. If this time exceeds our
				291	* monitoring window AND we didn't see any other completions in that
				292	* window, then count that sync IO as a violation of the latency.
				293	*/
				294	thislat = rwb_sync_issue_lat(rwb);
				295	if (thislat > rwb->cur_win_nsec \|\|
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	296	(thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	297	trace_wbt_lat(bdi, thislat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	298	return LAT_EXCEEDED;
				299	}
				300
				301	/*
				302	* No read/write mix, if stat isn't valid
				303	*/
				304	if (!stat_sample_valid(stat)) {
				305	/*
				306	* If we had writes in this stat window and the window is
				307	* current, we're only doing writes. If a task recently
				308	* waited or still has writes in flights, consider us doing
				309	* just writes as well.
				310	*/
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	311	if (stat[WRITE].nr_samples \|\| wb_recent_wait(rwb) \|\|
				312	wbt_inflight(rwb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	313	return LAT_UNKNOWN_WRITES;
				314	return LAT_UNKNOWN;
				315	}
				316
				317	/*
				318	* If the 'min' latency exceeds our target, step down.
				319	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	320	if (stat[READ].min > rwb->min_lat_nsec) {
				321	trace_wbt_lat(bdi, stat[READ].min);
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	322	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	323	return LAT_EXCEEDED;
				324	}
				325
				326	if (rwb->scale_step)
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	327	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	328
				329	return LAT_OK;
				330	}
				331
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	332	static void rwb_trace_step(struct rq_wb rwb, const char msg)
				333	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	334	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	335
				336	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	337	rwb->wb_background, rwb->wb_normal, rwb->wb_max);
				338	}
				339
				340	static void scale_up(struct rq_wb *rwb)
				341	{
				342	/*
				343	* Hit max in previous round, stop here
				344	*/
				345	if (rwb->scaled_max)
				346	return;
				347
				348	rwb->scale_step--;
				349	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	350
				351	rwb->scaled_max = calc_wb_limits(rwb);
				352
				353	rwb_wake_all(rwb);
				354
				355	rwb_trace_step(rwb, "step up");
				356	}
				357
				358	/*
				359	* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
				360	* had a latency violation.
				361	*/
				362	static void scale_down(struct rq_wb *rwb, bool hard_throttle)
				363	{
				364	/*
				365	* Stop scaling down when we've hit the limit. This also prevents
				366	* ->scale_step from going to crazy values, if the device can't
				367	* keep up.
				368	*/
				369	if (rwb->wb_max == 1)
				370	return;
				371
				372	if (rwb->scale_step < 0 && hard_throttle)
				373	rwb->scale_step = 0;
				374	else
				375	rwb->scale_step++;
				376
				377	rwb->scaled_max = false;
				378	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	379	calc_wb_limits(rwb);
				380	rwb_trace_step(rwb, "step down");
				381	}
				382
				383	static void rwb_arm_timer(struct rq_wb *rwb)
				384	{
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	385	if (rwb->scale_step > 0) {
				386	/*
				387	* We should speed this up, using some variant of a fast
				388	* integer inverse square root calculation. Since we only do
				389	* this for every window expiration, it's not a huge deal,
				390	* though.
				391	*/
				392	rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
				393	int_sqrt((rwb->scale_step + 1) << 8));
				394	} else {
				395	/*
				396	* For step < 0, we don't want to increase/decrease the
				397	* window size.
				398	*/
				399	rwb->cur_win_nsec = rwb->win_nsec;
				400	}
				401
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	402	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	403	}
				404
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	405	static void wb_timer_fn(struct blk_stat_callback *cb)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	406	{
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	407	struct rq_wb *rwb = cb->data;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	408	unsigned int inflight = wbt_inflight(rwb);
				409	int status;
				410
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	411	status = latency_exceeded(rwb, cb->stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	412
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	413	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	414	inflight);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	415
				416	/*
				417	* If we exceeded the latency target, step down. If we did not,
				418	* step one level up. If we don't know enough to say either exceeded
				419	* or ok, then don't do anything.
				420	*/
				421	switch (status) {
				422	case LAT_EXCEEDED:
				423	scale_down(rwb, true);
				424	break;
				425	case LAT_OK:
				426	scale_up(rwb);
				427	break;
				428	case LAT_UNKNOWN_WRITES:
				429	/*
				430	* We started a the center step, but don't have a valid
				431	* read/write sample, but we do have writes going on.
				432	* Allow step to go negative, to increase write perf.
				433	*/
				434	scale_up(rwb);
				435	break;
				436	case LAT_UNKNOWN:
				437	if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
				438	break;
				439	/*
				440	* We get here when previously scaled reduced depth, and we
				441	* currently don't have a valid read/write sample. For that
				442	* case, slowly return to center state (step == 0).
				443	*/
				444	if (rwb->scale_step > 0)
				445	scale_up(rwb);
				446	else if (rwb->scale_step < 0)
				447	scale_down(rwb, false);
				448	break;
				449	default:
				450	break;
				451	}
				452
				453	/*
				454	* Re-arm timer, if we have IO in flight
				455	*/
				456	if (rwb->scale_step \|\| inflight)
				457	rwb_arm_timer(rwb);
				458	}
				459
				460	void wbt_update_limits(struct rq_wb *rwb)
				461	{
				462	rwb->scale_step = 0;
				463	rwb->scaled_max = false;
				464	calc_wb_limits(rwb);
				465
				466	rwb_wake_all(rwb);
				467	}
				468
				469	static bool close_io(struct rq_wb *rwb)
				470	{
				471	const unsigned long now = jiffies;
				472
				473	return time_before(now, rwb->last_issue + HZ / 10) \|\|
				474	time_before(now, rwb->last_comp + HZ / 10);
				475	}
				476
				477	#define REQ_HIPRIO (REQ_SYNC \| REQ_META \| REQ_PRIO)
				478
				479	static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
				480	{
				481	unsigned int limit;
				482
				483	/*
				484	* At this point we know it's a buffered write. If this is
				485	* kswapd trying to free memory, or REQ_SYNC is set, set, then
				486	* it's WB_SYNC_ALL writeback, and we'll use the max limit for
				487	* that. If the write is marked as a background write, then use
				488	* the idle limit, or go to normal if we haven't had competing
				489	* IO for a bit.
				490	*/
				491	if ((rw & REQ_HIPRIO) \|\| wb_recent_wait(rwb) \|\| current_is_kswapd())
				492	limit = rwb->wb_max;
				493	else if ((rw & REQ_BACKGROUND) \|\| close_io(rwb)) {
				494	/*
				495	* If less than 100ms since we completed unrelated IO,
				496	* limit us to half the depth for background writeback.
				497	*/
				498	limit = rwb->wb_background;
				499	} else
				500	limit = rwb->wb_normal;
				501
				502	return limit;
				503	}
				504
				505	static inline bool may_queue(struct rq_wb rwb, struct rq_wait rqw,
Ingo Molnar	ac6424b	2017-06-20 12:06:13 +0200	[diff] [blame]	506	wait_queue_entry_t *wait, unsigned long rw)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	507	{
				508	/*
				509	* inc it here even if disabled, since we'll dec it at completion.
				510	* this only happens if the task was sleeping in __wbt_wait(),
				511	* and someone turned it off at the same time.
				512	*/
				513	if (!rwb_enabled(rwb)) {
				514	atomic_inc(&rqw->inflight);
				515	return true;
				516	}
				517
				518	/*
				519	* If the waitqueue is already active and we are not the next
				520	* in line to be woken up, wait for our turn.
				521	*/
				522	if (waitqueue_active(&rqw->wait) &&
Ingo Molnar	2055da9	2017-06-20 12:06:46 +0200	[diff] [blame]	523	rqw->wait.head.next != &wait->entry)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	524	return false;
				525
				526	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
				527	}
				528
				529	/*
				530	* Block if we will exceed our limit, or if we are currently waiting for
				531	* the timer to kick off queuing again.
				532	*/
				533	static void __wbt_wait(struct rq_wb rwb, unsigned long rw, spinlock_t lock)
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	534	__releases(lock)
				535	__acquires(lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	536	{
				537	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
				538	DEFINE_WAIT(wait);
				539
				540	if (may_queue(rwb, rqw, &wait, rw))
				541	return;
				542
				543	do {
				544	prepare_to_wait_exclusive(&rqw->wait, &wait,
				545	TASK_UNINTERRUPTIBLE);
				546
				547	if (may_queue(rwb, rqw, &wait, rw))
				548	break;
				549
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	550	if (lock) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	551	spin_unlock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	552	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	553	spin_lock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	554	} else
				555	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	556	} while (1);
				557
				558	finish_wait(&rqw->wait, &wait);
				559	}
				560
				561	static inline bool wbt_should_throttle(struct rq_wb rwb, struct bio bio)
				562	{
				563	const int op = bio_op(bio);
				564
				565	/*
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	566	* If not a WRITE, do nothing
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	567	*/
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	568	if (op != REQ_OP_WRITE)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	569	return false;
				570
				571	/*
				572	* Don't throttle WRITE_ODIRECT
				573	*/
				574	if ((bio->bi_opf & (REQ_SYNC \| REQ_IDLE)) == (REQ_SYNC \| REQ_IDLE))
				575	return false;
				576
				577	return true;
				578	}
				579
				580	/*
				581	* Returns true if the IO request should be accounted, false if not.
				582	* May sleep, if we have exceeded the writeback limits. Caller can pass
				583	* in an irq held spinlock, if it holds one when calling this function.
				584	* If we do sleep, we'll release and re-grab it.
				585	*/
Bart Van Assche	f2e0a0b	2017-01-02 09:46:15 -0700	[diff] [blame]	586	enum wbt_flags wbt_wait(struct rq_wb rwb, struct bio bio, spinlock_t *lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	587	{
				588	unsigned int ret = 0;
				589
				590	if (!rwb_enabled(rwb))
				591	return 0;
				592
				593	if (bio_op(bio) == REQ_OP_READ)
				594	ret = WBT_READ;
				595
				596	if (!wbt_should_throttle(rwb, bio)) {
				597	if (ret & WBT_READ)
				598	wb_timestamp(rwb, &rwb->last_issue);
				599	return ret;
				600	}
				601
				602	__wbt_wait(rwb, bio->bi_opf, lock);
				603
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	604	if (!blk_stat_is_active(rwb->cb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	605	rwb_arm_timer(rwb);
				606
				607	if (current_is_kswapd())
				608	ret \|= WBT_KSWAPD;
				609
				610	return ret \| WBT_TRACKED;
				611	}
				612
				613	void wbt_issue(struct rq_wb rwb, struct blk_issue_stat stat)
				614	{
				615	if (!rwb_enabled(rwb))
				616	return;
				617
				618	/*
				619	* Track sync issue, in case it takes a long time to complete. Allows
				620	* us to react quicker, if a sync IO takes a long time to complete.
				621	* Note that this is just a hint. 'stat' can go away when the
				622	* request completes, so it's important we never dereference it. We
				623	* only use the address to compare with, which is why we store the
				624	* sync_issue time locally.
				625	*/
				626	if (wbt_is_read(stat) && !rwb->sync_issue) {
				627	rwb->sync_cookie = stat;
				628	rwb->sync_issue = blk_stat_time(stat);
				629	}
				630	}
				631
				632	void wbt_requeue(struct rq_wb rwb, struct blk_issue_stat stat)
				633	{
				634	if (!rwb_enabled(rwb))
				635	return;
				636	if (stat == rwb->sync_cookie) {
				637	rwb->sync_issue = 0;
				638	rwb->sync_cookie = NULL;
				639	}
				640	}
				641
				642	void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
				643	{
				644	if (rwb) {
				645	rwb->queue_depth = depth;
				646	wbt_update_limits(rwb);
				647	}
				648	}
				649
				650	void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
				651	{
				652	if (rwb)
				653	rwb->wc = write_cache_on;
				654	}
				655
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	656	/*
				657	* Disable wbt, if enabled by default. Only called from CFQ.
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	658	*/
				659	void wbt_disable_default(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	660	{
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	661	struct rq_wb *rwb = q->rq_wb;
				662
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	663	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
				664	wbt_exit(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	665	}
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	666	EXPORT_SYMBOL_GPL(wbt_disable_default);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	667
Jan Kara	8330cdb	2017-04-19 11:33:27 +0200	[diff] [blame]	668	/*
				669	* Enable wbt if defaults are configured that way
				670	*/
				671	void wbt_enable_default(struct request_queue *q)
				672	{
				673	/* Throttling already enabled? */
				674	if (q->rq_wb)
				675	return;
				676
				677	/* Queue not registered? Maybe shutting down... */
				678	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
				679	return;
				680
				681	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) \|\|
				682	(q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
				683	wbt_init(q);
				684	}
				685	EXPORT_SYMBOL_GPL(wbt_enable_default);
				686
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	687	u64 wbt_default_latency_nsec(struct request_queue *q)
				688	{
				689	/*
				690	* We default to 2msec for non-rotational storage, and 75msec
				691	* for rotational storage.
				692	*/
				693	if (blk_queue_nonrot(q))
				694	return 2000000ULL;
				695	else
				696	return 75000000ULL;
				697	}
				698
/*
 * Stats bucket function handed to blk_stat_alloc_callback() in
 * wbt_init(): requests are bucketed by their data direction.
 */
static int wbt_data_dir(const struct request *rq)
{
	return rq_data_dir(rq);
}
				703
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	704	int wbt_init(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	705	{
				706	struct rq_wb *rwb;
				707	int i;
				708
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	709	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
				710
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	711	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
				712	if (!rwb)
				713	return -ENOMEM;
				714
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	715	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	716	if (!rwb->cb) {
				717	kfree(rwb);
				718	return -ENOMEM;
				719	}
				720
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	721	for (i = 0; i < WBT_NUM_RWQ; i++) {
				722	atomic_set(&rwb->rq_wait[i].inflight, 0);
				723	init_waitqueue_head(&rwb->rq_wait[i].wait);
				724	}
				725
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	726	rwb->wc = 1;
				727	rwb->queue_depth = RWB_DEF_DEPTH;
				728	rwb->last_comp = rwb->last_issue = jiffies;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	729	rwb->queue = q;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	730	rwb->win_nsec = RWB_WINDOW_NSEC;
Jens Axboe	d62118b	2016-11-28 09:40:34 -0700	[diff] [blame]	731	rwb->enable_state = WBT_STATE_ON_DEFAULT;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	732	wbt_update_limits(rwb);
				733
				734	/*
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	735	* Assign rwb and add the stats callback.
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	736	*/
				737	q->rq_wb = rwb;
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	738	blk_stat_add_callback(q, rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	739
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	740	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	741
				742	wbt_set_queue_depth(rwb, blk_queue_depth(q));
				743	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
				744
				745	return 0;
				746	}
				747
				748	void wbt_exit(struct request_queue *q)
				749	{
				750	struct rq_wb *rwb = q->rq_wb;
				751
				752	if (rwb) {
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	753	blk_stat_remove_callback(q, rwb->cb);
				754	blk_stat_free_callback(rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	755	q->rq_wb = NULL;
				756	kfree(rwb);
				757	}
				758	}