Blame - block/blk-wbt.c - SHIFTPHONES/kernel/shift/mainline

blob: 592e914c989070bf222dda6f6eff66d4d5ea7411 [file] [log] [blame]

Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	1	/*
				2	* buffered writeback throttling. loosely based on CoDel. We can't drop
				3	* packets for IO scheduling, so the logic is something like this:
				4	*
				5	* - Monitor latencies in a defined window of time.
				6	* - If the minimum latency in the above window exceeds some target, increment
				7	* scaling step and scale down queue depth by a factor of 2x. The monitoring
				8	* window is then shrunk to 100 / sqrt(scaling step + 1).
				9	* - For any window where we don't have solid data on what the latencies
				10	* look like, retain status quo.
				11	* - If latencies look good, decrement scaling step.
				12	* - If we're only doing writes, allow the scaling step to go negative. This
				13	* will temporarily boost write performance, snapping back to a stable
				14	* scaling step of 0 if reads show up or the heavy writers finish. Unlike
				15	* positive scaling steps where we shrink the monitoring window, a negative
				16	* scaling step retains the default step==0 window size.
				17	*
				18	* Copyright (C) 2016 Jens Axboe
				19	*
				20	*/
				21	#include <linux/kernel.h>
				22	#include <linux/blk_types.h>
				23	#include <linux/slab.h>
				24	#include <linux/backing-dev.h>
				25	#include <linux/swap.h>
				26
				27	#include "blk-wbt.h"
				28
				29	#define CREATE_TRACE_POINTS
				30	#include <trace/events/wbt.h>
				31
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	32	static inline void wbt_clear_state(struct request *rq)
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	33	{
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	34	rq->issue_stat.stat &= ~BLK_STAT_RES_MASK;
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	35	}
				36
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	37	static inline enum wbt_flags wbt_flags(struct request *rq)
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	38	{
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	39	return (rq->issue_stat.stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	40	}
				41
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	42	static inline bool wbt_is_tracked(struct request *rq)
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	43	{
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	44	return (rq->issue_stat.stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	45	}
				46
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	47	static inline bool wbt_is_read(struct request *rq)
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	48	{
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	49	return (rq->issue_stat.stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
Omar Sandoval	934031a	2018-05-09 02:08:47 -0700	[diff] [blame]	50	}
				51
/* Tunables for the writeback throttling state machine. */
enum {
	/*
	 * Starting depth. Depending on device stats we scale up from here
	 * (to 75% of QD max) or down (min 1).
	 */
	RWB_DEF_DEPTH = 16,

	/* Length of the default monitoring window: 100msec, in nanoseconds. */
	RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,

	/* Stats with fewer write samples than this are disregarded. */
	RWB_MIN_WRITE_SAMPLES = 3,

	/*
	 * After this many consecutive windows without enough information
	 * to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP = 5,
};
				75
				76	static inline bool rwb_enabled(struct rq_wb *rwb)
				77	{
				78	return rwb && rwb->wb_normal != 0;
				79	}
				80
				81	/*
				82	* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
				83	* false if 'v' + 1 would be bigger than 'below'.
				84	*/
				85	static bool atomic_inc_below(atomic_t *v, int below)
				86	{
				87	int cur = atomic_read(v);
				88
				89	for (;;) {
				90	int old;
				91
				92	if (cur >= below)
				93	return false;
				94	old = atomic_cmpxchg(v, cur, cur + 1);
				95	if (old == cur)
				96	break;
				97	cur = old;
				98	}
				99
				100	return true;
				101	}
				102
				103	static void wb_timestamp(struct rq_wb rwb, unsigned long var)
				104	{
				105	if (rwb_enabled(rwb)) {
				106	const unsigned long cur = jiffies;
				107
				108	if (cur != *var)
				109	*var = cur;
				110	}
				111	}
				112
				113	/*
				114	* If a task was rate throttled in balance_dirty_pages() within the last
				115	* second or so, use that to indicate a higher cleaning rate.
				116	*/
				117	static bool wb_recent_wait(struct rq_wb *rwb)
				118	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	119	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	120
				121	return time_before(jiffies, wb->dirty_sleep + HZ);
				122	}
				123
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	124	static inline struct rq_wait get_rq_wait(struct rq_wb rwb,
				125	enum wbt_flags wb_acct)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	126	{
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	127	if (wb_acct & WBT_KSWAPD)
				128	return &rwb->rq_wait[WBT_RWQ_KSWAPD];
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	129	else if (wb_acct & WBT_DISCARD)
				130	return &rwb->rq_wait[WBT_RWQ_DISCARD];
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	131
				132	return &rwb->rq_wait[WBT_RWQ_BG];
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	133	}
				134
				135	static void rwb_wake_all(struct rq_wb *rwb)
				136	{
				137	int i;
				138
				139	for (i = 0; i < WBT_NUM_RWQ; i++) {
				140	struct rq_wait *rqw = &rwb->rq_wait[i];
				141
				142	if (waitqueue_active(&rqw->wait))
				143	wake_up_all(&rqw->wait);
				144	}
				145	}
				146
				147	void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
				148	{
				149	struct rq_wait *rqw;
				150	int inflight, limit;
				151
				152	if (!(wb_acct & WBT_TRACKED))
				153	return;
				154
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	155	rqw = get_rq_wait(rwb, wb_acct);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	156	inflight = atomic_dec_return(&rqw->inflight);
				157
				158	/*
				159	* wbt got disabled with IO in flight. Wake up any potential
				160	* waiters, we don't have to do more than that.
				161	*/
				162	if (unlikely(!rwb_enabled(rwb))) {
				163	rwb_wake_all(rwb);
				164	return;
				165	}
				166
				167	/*
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	168	* For discards, our limit is always the background. For writes, if
				169	* the device does write back caching, drop further down before we
				170	* wake people up.
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	171	*/
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	172	if (wb_acct & WBT_DISCARD)
				173	limit = rwb->wb_background;
				174	else if (rwb->wc && !wb_recent_wait(rwb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	175	limit = 0;
				176	else
				177	limit = rwb->wb_normal;
				178
				179	/*
				180	* Don't wake anyone up if we are above the normal limit.
				181	*/
				182	if (inflight && inflight >= limit)
				183	return;
				184
				185	if (waitqueue_active(&rqw->wait)) {
				186	int diff = limit - inflight;
				187
				188	if (!inflight \|\| diff >= rwb->wb_background / 2)
				189	wake_up_all(&rqw->wait);
				190	}
				191	}
				192
				193	/*
				194	* Called on completion of a request. Note that it's also called when
				195	* a request is merged, when the request gets freed.
				196	*/
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	197	void wbt_done(struct rq_wb rwb, struct request rq)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	198	{
				199	if (!rwb)
				200	return;
				201
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	202	if (!wbt_is_tracked(rq)) {
				203	if (rwb->sync_cookie == rq) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	204	rwb->sync_issue = 0;
				205	rwb->sync_cookie = NULL;
				206	}
				207
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	208	if (wbt_is_read(rq))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	209	wb_timestamp(rwb, &rwb->last_comp);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	210	} else {
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	211	WARN_ON_ONCE(rq == rwb->sync_cookie);
				212	__wbt_done(rwb, wbt_flags(rq));
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	213	}
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	214	wbt_clear_state(rq);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	215	}
				216
				217	/*
				218	* Return true, if we can't increase the depth further by scaling
				219	*/
				220	static bool calc_wb_limits(struct rq_wb *rwb)
				221	{
				222	unsigned int depth;
				223	bool ret = false;
				224
				225	if (!rwb->min_lat_nsec) {
				226	rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
				227	return false;
				228	}
				229
				230	/*
				231	* For QD=1 devices, this is a special case. It's important for those
				232	* to have one request ready when one completes, so force a depth of
				233	* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
				234	* since the device can't have more than that in flight. If we're
				235	* scaling down, then keep a setting of 1/1/1.
				236	*/
				237	if (rwb->queue_depth == 1) {
				238	if (rwb->scale_step > 0)
				239	rwb->wb_max = rwb->wb_normal = 1;
				240	else {
				241	rwb->wb_max = rwb->wb_normal = 2;
				242	ret = true;
				243	}
				244	rwb->wb_background = 1;
				245	} else {
				246	/*
				247	* scale_step == 0 is our default state. If we have suffered
				248	* latency spikes, step will be > 0, and we shrink the
				249	* allowed write depths. If step is < 0, we're only doing
				250	* writes, and we allow a temporarily higher depth to
				251	* increase performance.
				252	*/
				253	depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
				254	if (rwb->scale_step > 0)
				255	depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
				256	else if (rwb->scale_step < 0) {
				257	unsigned int maxd = 3 * rwb->queue_depth / 4;
				258
				259	depth = 1 + ((depth - 1) << -rwb->scale_step);
				260	if (depth > maxd) {
				261	depth = maxd;
				262	ret = true;
				263	}
				264	}
				265
				266	/*
				267	* Set our max/normal/bg queue depths based on how far
				268	* we have scaled down (->scale_step).
				269	*/
				270	rwb->wb_max = depth;
				271	rwb->wb_normal = (rwb->wb_max + 1) / 2;
				272	rwb->wb_background = (rwb->wb_max + 3) / 4;
				273	}
				274
				275	return ret;
				276	}
				277
Arnd Bergmann	4121d38	2016-11-16 16:29:57 +0100	[diff] [blame]	278	static inline bool stat_sample_valid(struct blk_rq_stat *stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	279	{
				280	/*
				281	* We need at least one read sample, and a minimum of
				282	* RWB_MIN_WRITE_SAMPLES. We require some write samples to know
				283	* that it's writes impacting us, and not just some sole read on
				284	* a device that is in a lower power state.
				285	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	286	return (stat[READ].nr_samples >= 1 &&
				287	stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	288	}
				289
				290	static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
				291	{
Mark Rutland	6aa7de0	2017-10-23 14:07:29 -0700	[diff] [blame]	292	u64 now, issue = READ_ONCE(rwb->sync_issue);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	293
				294	if (!issue \|\| !rwb->sync_cookie)
				295	return 0;
				296
				297	now = ktime_to_ns(ktime_get());
				298	return now - issue;
				299	}
				300
				301	enum {
				302	LAT_OK = 1,
				303	LAT_UNKNOWN,
				304	LAT_UNKNOWN_WRITES,
				305	LAT_EXCEEDED,
				306	};
				307
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	308	static int latency_exceeded(struct rq_wb rwb, struct blk_rq_stat stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	309	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	310	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	311	u64 thislat;
				312
				313	/*
				314	* If our stored sync issue exceeds the window size, or it
				315	* exceeds our min target AND we haven't logged any entries,
				316	* flag the latency as exceeded. wbt works off completion latencies,
				317	* but for a flooded device, a single sync IO can take a long time
				318	* to complete after being issued. If this time exceeds our
				319	* monitoring window AND we didn't see any other completions in that
				320	* window, then count that sync IO as a violation of the latency.
				321	*/
				322	thislat = rwb_sync_issue_lat(rwb);
				323	if (thislat > rwb->cur_win_nsec \|\|
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	324	(thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	325	trace_wbt_lat(bdi, thislat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	326	return LAT_EXCEEDED;
				327	}
				328
				329	/*
				330	* No read/write mix, if stat isn't valid
				331	*/
				332	if (!stat_sample_valid(stat)) {
				333	/*
				334	* If we had writes in this stat window and the window is
				335	* current, we're only doing writes. If a task recently
				336	* waited or still has writes in flights, consider us doing
				337	* just writes as well.
				338	*/
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	339	if (stat[WRITE].nr_samples \|\| wb_recent_wait(rwb) \|\|
				340	wbt_inflight(rwb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	341	return LAT_UNKNOWN_WRITES;
				342	return LAT_UNKNOWN;
				343	}
				344
				345	/*
				346	* If the 'min' latency exceeds our target, step down.
				347	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	348	if (stat[READ].min > rwb->min_lat_nsec) {
				349	trace_wbt_lat(bdi, stat[READ].min);
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	350	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	351	return LAT_EXCEEDED;
				352	}
				353
				354	if (rwb->scale_step)
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	355	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	356
				357	return LAT_OK;
				358	}
				359
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	360	static void rwb_trace_step(struct rq_wb rwb, const char msg)
				361	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	362	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	363
				364	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	365	rwb->wb_background, rwb->wb_normal, rwb->wb_max);
				366	}
				367
				368	static void scale_up(struct rq_wb *rwb)
				369	{
				370	/*
				371	* Hit max in previous round, stop here
				372	*/
				373	if (rwb->scaled_max)
				374	return;
				375
				376	rwb->scale_step--;
				377	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	378
				379	rwb->scaled_max = calc_wb_limits(rwb);
				380
				381	rwb_wake_all(rwb);
				382
				383	rwb_trace_step(rwb, "step up");
				384	}
				385
				386	/*
				387	* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
				388	* had a latency violation.
				389	*/
				390	static void scale_down(struct rq_wb *rwb, bool hard_throttle)
				391	{
				392	/*
				393	* Stop scaling down when we've hit the limit. This also prevents
				394	* ->scale_step from going to crazy values, if the device can't
				395	* keep up.
				396	*/
				397	if (rwb->wb_max == 1)
				398	return;
				399
				400	if (rwb->scale_step < 0 && hard_throttle)
				401	rwb->scale_step = 0;
				402	else
				403	rwb->scale_step++;
				404
				405	rwb->scaled_max = false;
				406	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	407	calc_wb_limits(rwb);
				408	rwb_trace_step(rwb, "step down");
				409	}
				410
				411	static void rwb_arm_timer(struct rq_wb *rwb)
				412	{
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	413	if (rwb->scale_step > 0) {
				414	/*
				415	* We should speed this up, using some variant of a fast
				416	* integer inverse square root calculation. Since we only do
				417	* this for every window expiration, it's not a huge deal,
				418	* though.
				419	*/
				420	rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
				421	int_sqrt((rwb->scale_step + 1) << 8));
				422	} else {
				423	/*
				424	* For step < 0, we don't want to increase/decrease the
				425	* window size.
				426	*/
				427	rwb->cur_win_nsec = rwb->win_nsec;
				428	}
				429
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	430	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	431	}
				432
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	433	static void wb_timer_fn(struct blk_stat_callback *cb)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	434	{
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	435	struct rq_wb *rwb = cb->data;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	436	unsigned int inflight = wbt_inflight(rwb);
				437	int status;
				438
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	439	status = latency_exceeded(rwb, cb->stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	440
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	441	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	442	inflight);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	443
				444	/*
				445	* If we exceeded the latency target, step down. If we did not,
				446	* step one level up. If we don't know enough to say either exceeded
				447	* or ok, then don't do anything.
				448	*/
				449	switch (status) {
				450	case LAT_EXCEEDED:
				451	scale_down(rwb, true);
				452	break;
				453	case LAT_OK:
				454	scale_up(rwb);
				455	break;
				456	case LAT_UNKNOWN_WRITES:
				457	/*
				458	* We started a the center step, but don't have a valid
				459	* read/write sample, but we do have writes going on.
				460	* Allow step to go negative, to increase write perf.
				461	*/
				462	scale_up(rwb);
				463	break;
				464	case LAT_UNKNOWN:
				465	if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
				466	break;
				467	/*
				468	* We get here when previously scaled reduced depth, and we
				469	* currently don't have a valid read/write sample. For that
				470	* case, slowly return to center state (step == 0).
				471	*/
				472	if (rwb->scale_step > 0)
				473	scale_up(rwb);
				474	else if (rwb->scale_step < 0)
				475	scale_down(rwb, false);
				476	break;
				477	default:
				478	break;
				479	}
				480
				481	/*
				482	* Re-arm timer, if we have IO in flight
				483	*/
				484	if (rwb->scale_step \|\| inflight)
				485	rwb_arm_timer(rwb);
				486	}
				487
				488	void wbt_update_limits(struct rq_wb *rwb)
				489	{
				490	rwb->scale_step = 0;
				491	rwb->scaled_max = false;
				492	calc_wb_limits(rwb);
				493
				494	rwb_wake_all(rwb);
				495	}
				496
				497	static bool close_io(struct rq_wb *rwb)
				498	{
				499	const unsigned long now = jiffies;
				500
				501	return time_before(now, rwb->last_issue + HZ / 10) \|\|
				502	time_before(now, rwb->last_comp + HZ / 10);
				503	}
				504
				505	#define REQ_HIPRIO (REQ_SYNC \| REQ_META \| REQ_PRIO)
				506
				507	static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
				508	{
				509	unsigned int limit;
				510
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	511	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
				512	return rwb->wb_background;
				513
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	514	/*
				515	* At this point we know it's a buffered write. If this is
weiping zhang	3dfbdc4	2017-11-23 21:40:10 +0800	[diff] [blame]	516	* kswapd trying to free memory, or REQ_SYNC is set, then
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	517	* it's WB_SYNC_ALL writeback, and we'll use the max limit for
				518	* that. If the write is marked as a background write, then use
				519	* the idle limit, or go to normal if we haven't had competing
				520	* IO for a bit.
				521	*/
				522	if ((rw & REQ_HIPRIO) \|\| wb_recent_wait(rwb) \|\| current_is_kswapd())
				523	limit = rwb->wb_max;
				524	else if ((rw & REQ_BACKGROUND) \|\| close_io(rwb)) {
				525	/*
				526	* If less than 100ms since we completed unrelated IO,
				527	* limit us to half the depth for background writeback.
				528	*/
				529	limit = rwb->wb_background;
				530	} else
				531	limit = rwb->wb_normal;
				532
				533	return limit;
				534	}
				535
				536	static inline bool may_queue(struct rq_wb rwb, struct rq_wait rqw,
Ingo Molnar	ac6424b	2017-06-20 12:06:13 +0200	[diff] [blame]	537	wait_queue_entry_t *wait, unsigned long rw)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	538	{
				539	/*
				540	* inc it here even if disabled, since we'll dec it at completion.
				541	* this only happens if the task was sleeping in __wbt_wait(),
				542	* and someone turned it off at the same time.
				543	*/
				544	if (!rwb_enabled(rwb)) {
				545	atomic_inc(&rqw->inflight);
				546	return true;
				547	}
				548
				549	/*
				550	* If the waitqueue is already active and we are not the next
				551	* in line to be woken up, wait for our turn.
				552	*/
				553	if (waitqueue_active(&rqw->wait) &&
Ingo Molnar	2055da9	2017-06-20 12:06:46 +0200	[diff] [blame]	554	rqw->wait.head.next != &wait->entry)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	555	return false;
				556
				557	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
				558	}
				559
				560	/*
				561	* Block if we will exceed our limit, or if we are currently waiting for
				562	* the timer to kick off queuing again.
				563	*/
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	564	static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
				565	unsigned long rw, spinlock_t *lock)
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	566	__releases(lock)
				567	__acquires(lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	568	{
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	569	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	570	DEFINE_WAIT(wait);
				571
				572	if (may_queue(rwb, rqw, &wait, rw))
				573	return;
				574
				575	do {
				576	prepare_to_wait_exclusive(&rqw->wait, &wait,
				577	TASK_UNINTERRUPTIBLE);
				578
				579	if (may_queue(rwb, rqw, &wait, rw))
				580	break;
				581
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	582	if (lock) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	583	spin_unlock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	584	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	585	spin_lock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	586	} else
				587	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	588	} while (1);
				589
				590	finish_wait(&rqw->wait, &wait);
				591	}
				592
				593	static inline bool wbt_should_throttle(struct rq_wb rwb, struct bio bio)
				594	{
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	595	switch (bio_op(bio)) {
				596	case REQ_OP_WRITE:
				597	/*
				598	* Don't throttle WRITE_ODIRECT
				599	*/
				600	if ((bio->bi_opf & (REQ_SYNC \| REQ_IDLE)) ==
				601	(REQ_SYNC \| REQ_IDLE))
				602	return false;
				603	/* fallthrough */
				604	case REQ_OP_DISCARD:
				605	return true;
				606	default:
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	607	return false;
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	608	}
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	609	}
				610
				611	/*
				612	* Returns true if the IO request should be accounted, false if not.
				613	* May sleep, if we have exceeded the writeback limits. Caller can pass
				614	* in an irq held spinlock, if it holds one when calling this function.
				615	* If we do sleep, we'll release and re-grab it.
				616	*/
Bart Van Assche	f2e0a0b	2017-01-02 09:46:15 -0700	[diff] [blame]	617	enum wbt_flags wbt_wait(struct rq_wb rwb, struct bio bio, spinlock_t *lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	618	{
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	619	enum wbt_flags ret = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	620
				621	if (!rwb_enabled(rwb))
				622	return 0;
				623
				624	if (bio_op(bio) == REQ_OP_READ)
				625	ret = WBT_READ;
				626
				627	if (!wbt_should_throttle(rwb, bio)) {
				628	if (ret & WBT_READ)
				629	wb_timestamp(rwb, &rwb->last_issue);
				630	return ret;
				631	}
				632
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	633	if (current_is_kswapd())
				634	ret \|= WBT_KSWAPD;
Jens Axboe	782f569	2018-05-07 10:03:23 -0600	[diff] [blame]	635	if (bio_op(bio) == REQ_OP_DISCARD)
				636	ret \|= WBT_DISCARD;
Jens Axboe	8bea609	2018-05-07 09:57:08 -0600	[diff] [blame]	637
				638	__wbt_wait(rwb, ret, bio->bi_opf, lock);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	639
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	640	if (!blk_stat_is_active(rwb->cb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	641	rwb_arm_timer(rwb);
				642
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	643	return ret \| WBT_TRACKED;
				644	}
				645
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	646	void wbt_issue(struct rq_wb rwb, struct request rq)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	647	{
				648	if (!rwb_enabled(rwb))
				649	return;
				650
				651	/*
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	652	* Track sync issue, in case it takes a long time to complete. Allows us
				653	* to react quicker, if a sync IO takes a long time to complete. Note
				654	* that this is just a hint. The request can go away when it completes,
				655	* so it's important we never dereference it. We only use the address to
				656	* compare with, which is why we store the sync_issue time locally.
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	657	*/
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	658	if (wbt_is_read(rq) && !rwb->sync_issue) {
				659	rwb->sync_cookie = rq;
				660	rwb->sync_issue = blk_stat_time(&rq->issue_stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	661	}
				662	}
				663
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	664	void wbt_requeue(struct rq_wb rwb, struct request rq)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	665	{
				666	if (!rwb_enabled(rwb))
				667	return;
Omar Sandoval	a8a4594	2018-05-09 02:08:48 -0700	[diff] [blame^]	668	if (rq == rwb->sync_cookie) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	669	rwb->sync_issue = 0;
				670	rwb->sync_cookie = NULL;
				671	}
				672	}
				673
				674	void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
				675	{
				676	if (rwb) {
				677	rwb->queue_depth = depth;
				678	wbt_update_limits(rwb);
				679	}
				680	}
				681
				682	void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
				683	{
				684	if (rwb)
				685	rwb->wc = write_cache_on;
				686	}
				687
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	688	/*
Luca Miccio	b5dc5d4	2017-10-09 16:27:21 +0200	[diff] [blame]	689	* Disable wbt, if enabled by default.
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	690	*/
				691	void wbt_disable_default(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	692	{
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	693	struct rq_wb *rwb = q->rq_wb;
				694
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	695	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
				696	wbt_exit(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	697	}
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	698	EXPORT_SYMBOL_GPL(wbt_disable_default);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	699
Jan Kara	8330cdb	2017-04-19 11:33:27 +0200	[diff] [blame]	700	/*
				701	* Enable wbt if defaults are configured that way
				702	*/
				703	void wbt_enable_default(struct request_queue *q)
				704	{
				705	/* Throttling already enabled? */
				706	if (q->rq_wb)
				707	return;
				708
				709	/* Queue not registered? Maybe shutting down... */
				710	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
				711	return;
				712
				713	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) \|\|
				714	(q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
				715	wbt_init(q);
				716	}
				717	EXPORT_SYMBOL_GPL(wbt_enable_default);
				718
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	719	u64 wbt_default_latency_nsec(struct request_queue *q)
				720	{
				721	/*
				722	* We default to 2msec for non-rotational storage, and 75msec
				723	* for rotational storage.
				724	*/
				725	if (blk_queue_nonrot(q))
				726	return 2000000ULL;
				727	else
				728	return 75000000ULL;
				729	}
				730
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	731	static int wbt_data_dir(const struct request *rq)
				732	{
Jens Axboe	5235553d	2018-02-05 13:16:56 -0700	[diff] [blame]	733	const int op = req_op(rq);
				734
				735	if (op == REQ_OP_READ)
				736	return READ;
Jens Axboe	825843b	2018-05-03 09:14:57 -0600	[diff] [blame]	737	else if (op_is_write(op))
Jens Axboe	5235553d	2018-02-05 13:16:56 -0700	[diff] [blame]	738	return WRITE;
				739
				740	/* don't account */
				741	return -1;
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	742	}
				743
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	744	int wbt_init(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	745	{
				746	struct rq_wb *rwb;
				747	int i;
				748
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	749	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
				750
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	751	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
				752	if (!rwb)
				753	return -ENOMEM;
				754
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	755	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	756	if (!rwb->cb) {
				757	kfree(rwb);
				758	return -ENOMEM;
				759	}
				760
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	761	for (i = 0; i < WBT_NUM_RWQ; i++) {
				762	atomic_set(&rwb->rq_wait[i].inflight, 0);
				763	init_waitqueue_head(&rwb->rq_wait[i].wait);
				764	}
				765
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	766	rwb->last_comp = rwb->last_issue = jiffies;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	767	rwb->queue = q;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	768	rwb->win_nsec = RWB_WINDOW_NSEC;
Jens Axboe	d62118b	2016-11-28 09:40:34 -0700	[diff] [blame]	769	rwb->enable_state = WBT_STATE_ON_DEFAULT;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	770	wbt_update_limits(rwb);
				771
				772	/*
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	773	* Assign rwb and add the stats callback.
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	774	*/
				775	q->rq_wb = rwb;
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	776	blk_stat_add_callback(q, rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	777
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	778	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	779
				780	wbt_set_queue_depth(rwb, blk_queue_depth(q));
				781	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
				782
				783	return 0;
				784	}
				785
				786	void wbt_exit(struct request_queue *q)
				787	{
				788	struct rq_wb *rwb = q->rq_wb;
				789
				790	if (rwb) {
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	791	blk_stat_remove_callback(q, rwb->cb);
				792	blk_stat_free_callback(rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	793	q->rq_wb = NULL;
				794	kfree(rwb);
				795	}
				796	}