Blame - block/blk-wbt.c - SHIFTPHONES/kernel/common

blob: ae8de9780085ae7b8e99237ed16fc9cd02b233a5 [file] [log] [blame]

Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	1	/*
				2	* buffered writeback throttling. loosely based on CoDel. We can't drop
				3	* packets for IO scheduling, so the logic is something like this:
				4	*
				5	* - Monitor latencies in a defined window of time.
				6	* - If the minimum latency in the above window exceeds some target, increment
				7	* scaling step and scale down queue depth by a factor of 2x. The monitoring
				8	* window is then shrunk to 100 / sqrt(scaling step + 1).
				9	* - For any window where we don't have solid data on what the latencies
				10	* look like, retain status quo.
				11	* - If latencies look good, decrement scaling step.
				12	* - If we're only doing writes, allow the scaling step to go negative. This
				13	* will temporarily boost write performance, snapping back to a stable
				14	* scaling step of 0 if reads show up or the heavy writers finish. Unlike
				15	* positive scaling steps where we shrink the monitoring window, a negative
				16	* scaling step retains the default step==0 window size.
				17	*
				18	* Copyright (C) 2016 Jens Axboe
				19	*
				20	*/
				21	#include <linux/kernel.h>
				22	#include <linux/blk_types.h>
				23	#include <linux/slab.h>
				24	#include <linux/backing-dev.h>
				25	#include <linux/swap.h>
				26
				27	#include "blk-wbt.h"
				28
				29	#define CREATE_TRACE_POINTS
				30	#include <trace/events/wbt.h>
				31
				32	enum {
				33	/*
				34	* Default setting, we'll scale up (to 75% of QD max) or down (min 1)
				35	* from here depending on device stats
				36	*/
				37	RWB_DEF_DEPTH = 16,
				38
				39	/*
				40	* 100msec window
				41	*/
				42	RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
				43
				44	/*
				45	* Disregard stats, if we don't meet this minimum
				46	*/
				47	RWB_MIN_WRITE_SAMPLES = 3,
				48
				49	/*
				50	* If we have this number of consecutive windows with not enough
				51	* information to scale up or down, scale up.
				52	*/
				53	RWB_UNKNOWN_BUMP = 5,
				54	};
				55
				56	static inline bool rwb_enabled(struct rq_wb *rwb)
				57	{
				58	return rwb && rwb->wb_normal != 0;
				59	}
				60
				61	/*
				62	* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
				63	* false if 'v' + 1 would be bigger than 'below'.
				64	*/
				65	static bool atomic_inc_below(atomic_t *v, int below)
				66	{
				67	int cur = atomic_read(v);
				68
				69	for (;;) {
				70	int old;
				71
				72	if (cur >= below)
				73	return false;
				74	old = atomic_cmpxchg(v, cur, cur + 1);
				75	if (old == cur)
				76	break;
				77	cur = old;
				78	}
				79
				80	return true;
				81	}
				82
				83	static void wb_timestamp(struct rq_wb rwb, unsigned long var)
				84	{
				85	if (rwb_enabled(rwb)) {
				86	const unsigned long cur = jiffies;
				87
				88	if (cur != *var)
				89	*var = cur;
				90	}
				91	}
				92
				93	/*
				94	* If a task was rate throttled in balance_dirty_pages() within the last
				95	* second or so, use that to indicate a higher cleaning rate.
				96	*/
				97	static bool wb_recent_wait(struct rq_wb *rwb)
				98	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	99	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	100
				101	return time_before(jiffies, wb->dirty_sleep + HZ);
				102	}
				103
				104	static inline struct rq_wait get_rq_wait(struct rq_wb rwb, bool is_kswapd)
				105	{
				106	return &rwb->rq_wait[is_kswapd];
				107	}
				108
				109	static void rwb_wake_all(struct rq_wb *rwb)
				110	{
				111	int i;
				112
				113	for (i = 0; i < WBT_NUM_RWQ; i++) {
				114	struct rq_wait *rqw = &rwb->rq_wait[i];
				115
				116	if (waitqueue_active(&rqw->wait))
				117	wake_up_all(&rqw->wait);
				118	}
				119	}
				120
				121	void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
				122	{
				123	struct rq_wait *rqw;
				124	int inflight, limit;
				125
				126	if (!(wb_acct & WBT_TRACKED))
				127	return;
				128
				129	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
				130	inflight = atomic_dec_return(&rqw->inflight);
				131
				132	/*
				133	* wbt got disabled with IO in flight. Wake up any potential
				134	* waiters, we don't have to do more than that.
				135	*/
				136	if (unlikely(!rwb_enabled(rwb))) {
				137	rwb_wake_all(rwb);
				138	return;
				139	}
				140
				141	/*
				142	* If the device does write back caching, drop further down
				143	* before we wake people up.
				144	*/
				145	if (rwb->wc && !wb_recent_wait(rwb))
				146	limit = 0;
				147	else
				148	limit = rwb->wb_normal;
				149
				150	/*
				151	* Don't wake anyone up if we are above the normal limit.
				152	*/
				153	if (inflight && inflight >= limit)
				154	return;
				155
				156	if (waitqueue_active(&rqw->wait)) {
				157	int diff = limit - inflight;
				158
				159	if (!inflight \|\| diff >= rwb->wb_background / 2)
				160	wake_up_all(&rqw->wait);
				161	}
				162	}
				163
				164	/*
				165	* Called on completion of a request. Note that it's also called when
				166	* a request is merged, when the request gets freed.
				167	*/
				168	void wbt_done(struct rq_wb rwb, struct blk_issue_stat stat)
				169	{
				170	if (!rwb)
				171	return;
				172
				173	if (!wbt_is_tracked(stat)) {
				174	if (rwb->sync_cookie == stat) {
				175	rwb->sync_issue = 0;
				176	rwb->sync_cookie = NULL;
				177	}
				178
				179	if (wbt_is_read(stat))
				180	wb_timestamp(rwb, &rwb->last_comp);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	181	} else {
				182	WARN_ON_ONCE(stat == rwb->sync_cookie);
				183	__wbt_done(rwb, wbt_stat_to_mask(stat));
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	184	}
weiping zhang	62d772f	2017-11-23 21:39:55 +0800	[diff] [blame]	185	wbt_clear_state(stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	186	}
				187
				188	/*
				189	* Return true, if we can't increase the depth further by scaling
				190	*/
				191	static bool calc_wb_limits(struct rq_wb *rwb)
				192	{
				193	unsigned int depth;
				194	bool ret = false;
				195
				196	if (!rwb->min_lat_nsec) {
				197	rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
				198	return false;
				199	}
				200
				201	/*
				202	* For QD=1 devices, this is a special case. It's important for those
				203	* to have one request ready when one completes, so force a depth of
				204	* 2 for those devices. On the backend, it'll be a depth of 1 anyway,
				205	* since the device can't have more than that in flight. If we're
				206	* scaling down, then keep a setting of 1/1/1.
				207	*/
				208	if (rwb->queue_depth == 1) {
				209	if (rwb->scale_step > 0)
				210	rwb->wb_max = rwb->wb_normal = 1;
				211	else {
				212	rwb->wb_max = rwb->wb_normal = 2;
				213	ret = true;
				214	}
				215	rwb->wb_background = 1;
				216	} else {
				217	/*
				218	* scale_step == 0 is our default state. If we have suffered
				219	* latency spikes, step will be > 0, and we shrink the
				220	* allowed write depths. If step is < 0, we're only doing
				221	* writes, and we allow a temporarily higher depth to
				222	* increase performance.
				223	*/
				224	depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
				225	if (rwb->scale_step > 0)
				226	depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
				227	else if (rwb->scale_step < 0) {
				228	unsigned int maxd = 3 * rwb->queue_depth / 4;
				229
				230	depth = 1 + ((depth - 1) << -rwb->scale_step);
				231	if (depth > maxd) {
				232	depth = maxd;
				233	ret = true;
				234	}
				235	}
				236
				237	/*
				238	* Set our max/normal/bg queue depths based on how far
				239	* we have scaled down (->scale_step).
				240	*/
				241	rwb->wb_max = depth;
				242	rwb->wb_normal = (rwb->wb_max + 1) / 2;
				243	rwb->wb_background = (rwb->wb_max + 3) / 4;
				244	}
				245
				246	return ret;
				247	}
				248
Arnd Bergmann	4121d38	2016-11-16 16:29:57 +0100	[diff] [blame]	249	static inline bool stat_sample_valid(struct blk_rq_stat *stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	250	{
				251	/*
				252	* We need at least one read sample, and a minimum of
				253	* RWB_MIN_WRITE_SAMPLES. We require some write samples to know
				254	* that it's writes impacting us, and not just some sole read on
				255	* a device that is in a lower power state.
				256	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	257	return (stat[READ].nr_samples >= 1 &&
				258	stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	259	}
				260
				261	static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
				262	{
Mark Rutland	6aa7de0	2017-10-23 14:07:29 -0700	[diff] [blame]	263	u64 now, issue = READ_ONCE(rwb->sync_issue);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	264
				265	if (!issue \|\| !rwb->sync_cookie)
				266	return 0;
				267
				268	now = ktime_to_ns(ktime_get());
				269	return now - issue;
				270	}
				271
				272	enum {
				273	LAT_OK = 1,
				274	LAT_UNKNOWN,
				275	LAT_UNKNOWN_WRITES,
				276	LAT_EXCEEDED,
				277	};
				278
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	279	static int latency_exceeded(struct rq_wb rwb, struct blk_rq_stat stat)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	280	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	281	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	282	u64 thislat;
				283
				284	/*
				285	* If our stored sync issue exceeds the window size, or it
				286	* exceeds our min target AND we haven't logged any entries,
				287	* flag the latency as exceeded. wbt works off completion latencies,
				288	* but for a flooded device, a single sync IO can take a long time
				289	* to complete after being issued. If this time exceeds our
				290	* monitoring window AND we didn't see any other completions in that
				291	* window, then count that sync IO as a violation of the latency.
				292	*/
				293	thislat = rwb_sync_issue_lat(rwb);
				294	if (thislat > rwb->cur_win_nsec \|\|
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	295	(thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	296	trace_wbt_lat(bdi, thislat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	297	return LAT_EXCEEDED;
				298	}
				299
				300	/*
				301	* No read/write mix, if stat isn't valid
				302	*/
				303	if (!stat_sample_valid(stat)) {
				304	/*
				305	* If we had writes in this stat window and the window is
				306	* current, we're only doing writes. If a task recently
				307	* waited or still has writes in flights, consider us doing
				308	* just writes as well.
				309	*/
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	310	if (stat[WRITE].nr_samples \|\| wb_recent_wait(rwb) \|\|
				311	wbt_inflight(rwb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	312	return LAT_UNKNOWN_WRITES;
				313	return LAT_UNKNOWN;
				314	}
				315
				316	/*
				317	* If the 'min' latency exceeds our target, step down.
				318	*/
Omar Sandoval	fa2e39c	2017-03-21 08:56:06 -0700	[diff] [blame]	319	if (stat[READ].min > rwb->min_lat_nsec) {
				320	trace_wbt_lat(bdi, stat[READ].min);
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	321	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	322	return LAT_EXCEEDED;
				323	}
				324
				325	if (rwb->scale_step)
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	326	trace_wbt_stat(bdi, stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	327
				328	return LAT_OK;
				329	}
				330
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	331	static void rwb_trace_step(struct rq_wb rwb, const char msg)
				332	{
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	333	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	334
				335	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	336	rwb->wb_background, rwb->wb_normal, rwb->wb_max);
				337	}
				338
				339	static void scale_up(struct rq_wb *rwb)
				340	{
				341	/*
				342	* Hit max in previous round, stop here
				343	*/
				344	if (rwb->scaled_max)
				345	return;
				346
				347	rwb->scale_step--;
				348	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	349
				350	rwb->scaled_max = calc_wb_limits(rwb);
				351
				352	rwb_wake_all(rwb);
				353
				354	rwb_trace_step(rwb, "step up");
				355	}
				356
				357	/*
				358	* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
				359	* had a latency violation.
				360	*/
				361	static void scale_down(struct rq_wb *rwb, bool hard_throttle)
				362	{
				363	/*
				364	* Stop scaling down when we've hit the limit. This also prevents
				365	* ->scale_step from going to crazy values, if the device can't
				366	* keep up.
				367	*/
				368	if (rwb->wb_max == 1)
				369	return;
				370
				371	if (rwb->scale_step < 0 && hard_throttle)
				372	rwb->scale_step = 0;
				373	else
				374	rwb->scale_step++;
				375
				376	rwb->scaled_max = false;
				377	rwb->unknown_cnt = 0;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	378	calc_wb_limits(rwb);
				379	rwb_trace_step(rwb, "step down");
				380	}
				381
				382	static void rwb_arm_timer(struct rq_wb *rwb)
				383	{
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	384	if (rwb->scale_step > 0) {
				385	/*
				386	* We should speed this up, using some variant of a fast
				387	* integer inverse square root calculation. Since we only do
				388	* this for every window expiration, it's not a huge deal,
				389	* though.
				390	*/
				391	rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
				392	int_sqrt((rwb->scale_step + 1) << 8));
				393	} else {
				394	/*
				395	* For step < 0, we don't want to increase/decrease the
				396	* window size.
				397	*/
				398	rwb->cur_win_nsec = rwb->win_nsec;
				399	}
				400
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	401	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	402	}
				403
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	404	static void wb_timer_fn(struct blk_stat_callback *cb)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	405	{
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	406	struct rq_wb *rwb = cb->data;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	407	unsigned int inflight = wbt_inflight(rwb);
				408	int status;
				409
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	410	status = latency_exceeded(rwb, cb->stat);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	411
Jan Kara	dc3b17c	2017-02-02 15:56:50 +0100	[diff] [blame]	412	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	413	inflight);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	414
				415	/*
				416	* If we exceeded the latency target, step down. If we did not,
				417	* step one level up. If we don't know enough to say either exceeded
				418	* or ok, then don't do anything.
				419	*/
				420	switch (status) {
				421	case LAT_EXCEEDED:
				422	scale_down(rwb, true);
				423	break;
				424	case LAT_OK:
				425	scale_up(rwb);
				426	break;
				427	case LAT_UNKNOWN_WRITES:
				428	/*
				429	* We started a the center step, but don't have a valid
				430	* read/write sample, but we do have writes going on.
				431	* Allow step to go negative, to increase write perf.
				432	*/
				433	scale_up(rwb);
				434	break;
				435	case LAT_UNKNOWN:
				436	if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
				437	break;
				438	/*
				439	* We get here when previously scaled reduced depth, and we
				440	* currently don't have a valid read/write sample. For that
				441	* case, slowly return to center state (step == 0).
				442	*/
				443	if (rwb->scale_step > 0)
				444	scale_up(rwb);
				445	else if (rwb->scale_step < 0)
				446	scale_down(rwb, false);
				447	break;
				448	default:
				449	break;
				450	}
				451
				452	/*
				453	* Re-arm timer, if we have IO in flight
				454	*/
				455	if (rwb->scale_step \|\| inflight)
				456	rwb_arm_timer(rwb);
				457	}
				458
				459	void wbt_update_limits(struct rq_wb *rwb)
				460	{
				461	rwb->scale_step = 0;
				462	rwb->scaled_max = false;
				463	calc_wb_limits(rwb);
				464
				465	rwb_wake_all(rwb);
				466	}
				467
				468	static bool close_io(struct rq_wb *rwb)
				469	{
				470	const unsigned long now = jiffies;
				471
				472	return time_before(now, rwb->last_issue + HZ / 10) \|\|
				473	time_before(now, rwb->last_comp + HZ / 10);
				474	}
				475
				476	#define REQ_HIPRIO (REQ_SYNC \| REQ_META \| REQ_PRIO)
				477
				478	static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
				479	{
				480	unsigned int limit;
				481
				482	/*
				483	* At this point we know it's a buffered write. If this is
weiping zhang	3dfbdc4	2017-11-23 21:40:10 +0800	[diff] [blame^]	484	* kswapd trying to free memory, or REQ_SYNC is set, then
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	485	* it's WB_SYNC_ALL writeback, and we'll use the max limit for
				486	* that. If the write is marked as a background write, then use
				487	* the idle limit, or go to normal if we haven't had competing
				488	* IO for a bit.
				489	*/
				490	if ((rw & REQ_HIPRIO) \|\| wb_recent_wait(rwb) \|\| current_is_kswapd())
				491	limit = rwb->wb_max;
				492	else if ((rw & REQ_BACKGROUND) \|\| close_io(rwb)) {
				493	/*
				494	* If less than 100ms since we completed unrelated IO,
				495	* limit us to half the depth for background writeback.
				496	*/
				497	limit = rwb->wb_background;
				498	} else
				499	limit = rwb->wb_normal;
				500
				501	return limit;
				502	}
				503
				504	static inline bool may_queue(struct rq_wb rwb, struct rq_wait rqw,
Ingo Molnar	ac6424b	2017-06-20 12:06:13 +0200	[diff] [blame]	505	wait_queue_entry_t *wait, unsigned long rw)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	506	{
				507	/*
				508	* inc it here even if disabled, since we'll dec it at completion.
				509	* this only happens if the task was sleeping in __wbt_wait(),
				510	* and someone turned it off at the same time.
				511	*/
				512	if (!rwb_enabled(rwb)) {
				513	atomic_inc(&rqw->inflight);
				514	return true;
				515	}
				516
				517	/*
				518	* If the waitqueue is already active and we are not the next
				519	* in line to be woken up, wait for our turn.
				520	*/
				521	if (waitqueue_active(&rqw->wait) &&
Ingo Molnar	2055da9	2017-06-20 12:06:46 +0200	[diff] [blame]	522	rqw->wait.head.next != &wait->entry)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	523	return false;
				524
				525	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
				526	}
				527
				528	/*
				529	* Block if we will exceed our limit, or if we are currently waiting for
				530	* the timer to kick off queuing again.
				531	*/
				532	static void __wbt_wait(struct rq_wb rwb, unsigned long rw, spinlock_t lock)
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	533	__releases(lock)
				534	__acquires(lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	535	{
				536	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
				537	DEFINE_WAIT(wait);
				538
				539	if (may_queue(rwb, rqw, &wait, rw))
				540	return;
				541
				542	do {
				543	prepare_to_wait_exclusive(&rqw->wait, &wait,
				544	TASK_UNINTERRUPTIBLE);
				545
				546	if (may_queue(rwb, rqw, &wait, rw))
				547	break;
				548
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	549	if (lock) {
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	550	spin_unlock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	551	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	552	spin_lock_irq(lock);
Bart Van Assche	9eca535	2017-01-02 09:48:47 -0700	[diff] [blame]	553	} else
				554	io_schedule();
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	555	} while (1);
				556
				557	finish_wait(&rqw->wait, &wait);
				558	}
				559
				560	static inline bool wbt_should_throttle(struct rq_wb rwb, struct bio bio)
				561	{
				562	const int op = bio_op(bio);
				563
				564	/*
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	565	* If not a WRITE, do nothing
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	566	*/
Christoph Hellwig	be07e14	2016-12-09 14:19:06 +0100	[diff] [blame]	567	if (op != REQ_OP_WRITE)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	568	return false;
				569
				570	/*
				571	* Don't throttle WRITE_ODIRECT
				572	*/
				573	if ((bio->bi_opf & (REQ_SYNC \| REQ_IDLE)) == (REQ_SYNC \| REQ_IDLE))
				574	return false;
				575
				576	return true;
				577	}
				578
				579	/*
				580	* Returns true if the IO request should be accounted, false if not.
				581	* May sleep, if we have exceeded the writeback limits. Caller can pass
				582	* in an irq held spinlock, if it holds one when calling this function.
				583	* If we do sleep, we'll release and re-grab it.
				584	*/
Bart Van Assche	f2e0a0b	2017-01-02 09:46:15 -0700	[diff] [blame]	585	enum wbt_flags wbt_wait(struct rq_wb rwb, struct bio bio, spinlock_t *lock)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	586	{
				587	unsigned int ret = 0;
				588
				589	if (!rwb_enabled(rwb))
				590	return 0;
				591
				592	if (bio_op(bio) == REQ_OP_READ)
				593	ret = WBT_READ;
				594
				595	if (!wbt_should_throttle(rwb, bio)) {
				596	if (ret & WBT_READ)
				597	wb_timestamp(rwb, &rwb->last_issue);
				598	return ret;
				599	}
				600
				601	__wbt_wait(rwb, bio->bi_opf, lock);
				602
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	603	if (!blk_stat_is_active(rwb->cb))
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	604	rwb_arm_timer(rwb);
				605
				606	if (current_is_kswapd())
				607	ret \|= WBT_KSWAPD;
				608
				609	return ret \| WBT_TRACKED;
				610	}
				611
				612	void wbt_issue(struct rq_wb rwb, struct blk_issue_stat stat)
				613	{
				614	if (!rwb_enabled(rwb))
				615	return;
				616
				617	/*
				618	* Track sync issue, in case it takes a long time to complete. Allows
				619	* us to react quicker, if a sync IO takes a long time to complete.
				620	* Note that this is just a hint. 'stat' can go away when the
				621	* request completes, so it's important we never dereference it. We
				622	* only use the address to compare with, which is why we store the
				623	* sync_issue time locally.
				624	*/
				625	if (wbt_is_read(stat) && !rwb->sync_issue) {
				626	rwb->sync_cookie = stat;
				627	rwb->sync_issue = blk_stat_time(stat);
				628	}
				629	}
				630
				631	void wbt_requeue(struct rq_wb rwb, struct blk_issue_stat stat)
				632	{
				633	if (!rwb_enabled(rwb))
				634	return;
				635	if (stat == rwb->sync_cookie) {
				636	rwb->sync_issue = 0;
				637	rwb->sync_cookie = NULL;
				638	}
				639	}
				640
				641	void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
				642	{
				643	if (rwb) {
				644	rwb->queue_depth = depth;
				645	wbt_update_limits(rwb);
				646	}
				647	}
				648
				649	void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
				650	{
				651	if (rwb)
				652	rwb->wc = write_cache_on;
				653	}
				654
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	655	/*
Luca Miccio	b5dc5d4	2017-10-09 16:27:21 +0200	[diff] [blame]	656	* Disable wbt, if enabled by default.
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	657	*/
				658	void wbt_disable_default(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	659	{
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	660	struct rq_wb *rwb = q->rq_wb;
				661
Jan Kara	3f19cd2	2017-04-11 11:29:01 +0200	[diff] [blame]	662	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
				663	wbt_exit(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	664	}
Jens Axboe	fa224ee	2016-11-28 09:25:50 -0700	[diff] [blame]	665	EXPORT_SYMBOL_GPL(wbt_disable_default);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	666
Jan Kara	8330cdb	2017-04-19 11:33:27 +0200	[diff] [blame]	667	/*
				668	* Enable wbt if defaults are configured that way
				669	*/
				670	void wbt_enable_default(struct request_queue *q)
				671	{
				672	/* Throttling already enabled? */
				673	if (q->rq_wb)
				674	return;
				675
				676	/* Queue not registered? Maybe shutting down... */
				677	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
				678	return;
				679
				680	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) \|\|
				681	(q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
				682	wbt_init(q);
				683	}
				684	EXPORT_SYMBOL_GPL(wbt_enable_default);
				685
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	686	u64 wbt_default_latency_nsec(struct request_queue *q)
				687	{
				688	/*
				689	* We default to 2msec for non-rotational storage, and 75msec
				690	* for rotational storage.
				691	*/
				692	if (blk_queue_nonrot(q))
				693	return 2000000ULL;
				694	else
				695	return 75000000ULL;
				696	}
				697
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	698	static int wbt_data_dir(const struct request *rq)
				699	{
				700	return rq_data_dir(rq);
				701	}
				702
Jens Axboe	8054b89	2016-11-10 21:50:51 -0700	[diff] [blame]	703	int wbt_init(struct request_queue *q)
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	704	{
				705	struct rq_wb *rwb;
				706	int i;
				707
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	708	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
				709
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	710	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
				711	if (!rwb)
				712	return -ENOMEM;
				713
Jens Axboe	99c749a	2017-04-21 07:55:42 -0600	[diff] [blame]	714	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	715	if (!rwb->cb) {
				716	kfree(rwb);
				717	return -ENOMEM;
				718	}
				719
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	720	for (i = 0; i < WBT_NUM_RWQ; i++) {
				721	atomic_set(&rwb->rq_wait[i].inflight, 0);
				722	init_waitqueue_head(&rwb->rq_wait[i].wait);
				723	}
				724
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	725	rwb->last_comp = rwb->last_issue = jiffies;
Jens Axboe	d8a0cbf	2016-11-10 21:52:53 -0700	[diff] [blame]	726	rwb->queue = q;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	727	rwb->win_nsec = RWB_WINDOW_NSEC;
Jens Axboe	d62118b	2016-11-28 09:40:34 -0700	[diff] [blame]	728	rwb->enable_state = WBT_STATE_ON_DEFAULT;
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	729	wbt_update_limits(rwb);
				730
				731	/*
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	732	* Assign rwb and add the stats callback.
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	733	*/
				734	q->rq_wb = rwb;
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	735	blk_stat_add_callback(q, rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	736
Jens Axboe	80e091d	2016-11-28 09:22:47 -0700	[diff] [blame]	737	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	738
				739	wbt_set_queue_depth(rwb, blk_queue_depth(q));
				740	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
				741
				742	return 0;
				743	}
				744
				745	void wbt_exit(struct request_queue *q)
				746	{
				747	struct rq_wb *rwb = q->rq_wb;
				748
				749	if (rwb) {
Omar Sandoval	34dbad5	2017-03-21 08:56:08 -0700	[diff] [blame]	750	blk_stat_remove_callback(q, rwb->cb);
				751	blk_stat_free_callback(rwb->cb);
Jens Axboe	e34cbd3	2016-11-09 12:36:15 -0700	[diff] [blame]	752	q->rq_wb = NULL;
				753	kfree(rwb);
				754	}
				755	}