// SPDX-License-Identifier: GPL-2.0
/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt, with a few exceptions:
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does: we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at leaf level.
 *
 * Consider the following
 *
 *                   root blkg
 *             /                     \
 *        fast (target=5ms)     slow (target=10ms)
 *         /     \                  /        \
 *       a        b          normal(15ms)   unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX
 * down to 1.  If the group is only ever submitting IO for itself then this is
 * the only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is
 * generating IO that has to be issued by the root cg to avoid priority
 * inversion.  So think REQ_META or REQ_SWAP.  If we are already at qd == 1 and
 * we're getting a lot of work done for us on behalf of the root cg and are
 * being asked to scale down more, then we induce a latency at userspace
 * return.  We accumulate the total amount of time we need to be punished by
 * doing
 *
 * total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time we will do
 *
 * throttle_time = min(total_time, NSEC_PER_SEC)
 *
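 * As an illustrative example (numbers not taken from the code): with a
 * min_lat_nsec of 5ms, a root-issued io that completes in 2ms adds 3ms to the
 * accumulated punishment, and no matter how much accumulates, the delay
 * applied at any single throttle point is capped at one second by the min()
 * above.
 *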
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk.h"

#define DEFAULT_SCALE_COOKIE 1000000U

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency {
	struct rq_qos rqos;
	struct timer_list timer;
	atomic_t enabled;
};

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{
	return container_of(rqos, struct blk_iolatency, rqos);
}

static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
{
	return atomic_read(&blkiolat->enabled) > 0;
}

struct child_latency_info {
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};

struct percentile_stats {
	u64 total;
	u64 missed;
};

struct latency_stat {
	union {
		struct percentile_stats ps;
		struct blk_rq_stat rqs;
	};
};

struct iolatency_grp {
	struct blkg_policy_data pd;
	struct latency_stat __percpu *stats;
	struct latency_stat cur_stat;
	struct blk_iolatency *blkiolat;
	struct rq_depth rq_depth;
	struct rq_wait rq_wait;
	atomic64_t window_start;
	atomic_t scale_cookie;
	u64 min_lat_nsec;
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	bool ssd;
	struct child_latency_info child_lat;
};

#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // exp(1/600) - 600 samples
	2039, // exp(1/240) - 240 samples
	2031, // exp(1/120) - 120 samples
	2023, // exp(1/80) - 80 samples
	2014, // exp(1/60) - 60 samples
};
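
/*
 * Illustrative mapping for the factors above: a 100ms window selects index 0
 * (2045), while a full 1s window selects index 4 (2014); see
 * iolat_update_total_lat_avg() below.
 */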

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
}

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{
	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
}

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{
	return pd_to_blkg(&iolat->pd);
}

static inline void latency_stat_init(struct iolatency_grp *iolat,
				     struct latency_stat *stat)
{
	if (iolat->ssd) {
		stat->ps.total = 0;
		stat->ps.missed = 0;
	} else
		blk_rq_stat_init(&stat->rqs);
}

static inline void latency_stat_sum(struct iolatency_grp *iolat,
				    struct latency_stat *sum,
				    struct latency_stat *stat)
{
	if (iolat->ssd) {
		sum->ps.total += stat->ps.total;
		sum->ps.missed += stat->ps.missed;
	} else
		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
}

static inline void latency_stat_record_time(struct iolatency_grp *iolat,
					    u64 req_time)
{
	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
	if (iolat->ssd) {
		if (req_time >= iolat->min_lat_nsec)
			stat->ps.missed++;
		stat->ps.total++;
	} else
		blk_rq_stat_add(&stat->rqs, req_time);
	put_cpu_ptr(stat);
}

static inline bool latency_sum_ok(struct iolatency_grp *iolat,
				  struct latency_stat *stat)
{
	if (iolat->ssd) {
		u64 thresh = div64_u64(stat->ps.total, 10);
		thresh = max(thresh, 1ULL);
		return stat->ps.missed < thresh;
	}
	return stat->rqs.mean <= iolat->min_lat_nsec;
}
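
/*
 * Illustrative reading of the ssd check above: with 100 total ios in a
 * window, the sum is still "ok" for up to 9 missed ios; at 10 missed (10%)
 * it is not.
 */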

static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
				       struct latency_stat *stat)
{
	if (iolat->ssd)
		return stat->ps.total;
	return stat->rqs.nr_samples;
}

static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
					      struct latency_stat *stat)
{
	int exp_idx;

	if (iolat->ssd)
		return;

	/*
	 * calc_load() takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	iolat->lat_avg = calc_load(iolat->lat_avg,
				   iolatency_exp_factors[exp_idx],
				   stat->rqs.mean);
}

static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	atomic_dec(&rqw->inflight);
	wake_up(&rqw->wait);
}

static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
	struct iolatency_grp *iolat = private_data;
	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
}

static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       bool issue_as_root,
				       bool use_memdelay)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);

	if (use_delay)
		blkcg_schedule_throttle(rqos->q, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
}

#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
}
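
/*
 * Illustrative: if nr_requests were 128, a scale-up step would be
 * 128 >> 4 == 8 and a scale-down step 128 >> 2 == 32, and either step is
 * clamped to at least 1.
 */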

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has its own local copy of the last scale cookie it saw, so if
 * the global scale cookie goes up or down the group knows which way it needs
 * to go based on its last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blkiolat->rqos.q->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours to
		 * dig out of it.  Just enough that we don't throttle/unthrottle
		 * with jagged workloads but can still unthrottle once pressure
		 * has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}

/*
 * Change the queue depth of the iolatency_grp.  We add 1/16th of the queue
 * depth at a time when scaling up and halve the depth when scaling down, so
 * we don't get wild swings and hopefully dial in to a fairer distribution of
 * the overall queue depth.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{
	unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = iolat->rq_depth.max_depth;

	if (old > qd)
		old = qd;

	if (up) {
		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
			return;

		if (old < qd) {
			old += scale;
			old = min(old, qd);
			iolat->rq_depth.max_depth = old;
			wake_up_all(&iolat->rq_wait.wait);
		}
	} else {
		old >>= 1;
		iolat->rq_depth.max_depth = max(old, 1UL);
	}
}

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	unsigned int old;
	int direction = 0;

	if (lat_to_blkg(iolat)->parent == NULL)
		return;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);

	/* Somebody beat us to the punch, just bail. */
	if (old != our_cookie)
		return;

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did 5%
		 * or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->rq_depth.max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->rq_depth.max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}

static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg_gq *blkg = bio->bi_blkg;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	if (!blk_iolatency_enabled(blkiolat))
		return;

	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
					   (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}

static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{
	u64 start = bio_issue_time(issue);
	u64 req_time;

	/*
	 * Truncate "now" the same way the issue time was truncated so the
	 * two values are directly comparable.
	 */
	now = __bio_issue_time(now);

	if (now <= start)
		return;

	req_time = now - start;

	/*
	 * We don't want to count issue_as_root bio's in the cgroup's latency
	 * statistics as it could skew the numbers downwards.
	 */
	if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
		u64 sub = iolat->min_lat_nsec;
		if (req_time < sub)
			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
		return;
	}

	latency_stat_record_time(iolat, req_time);
}

#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct latency_stat stat;
	unsigned long flags;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
		latency_stat_init(iolat, s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	iolat_update_total_lat_avg(iolat, &stat);

	/* Everything is ok and we don't need to adjust the scale. */
	if (latency_sum_ok(iolat, &stat) &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	/* Somebody beat us to the punch, just bail. */
	spin_lock_irqsave(&lat_info->lock, flags);

	latency_stat_sum(iolat, &iolat->cur_stat, &stat);
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
	iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);

	if ((lat_info->last_scale_event >= now ||
	     now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
		goto out;

	if (latency_sum_ok(iolat, &iolat->cur_stat) &&
	    latency_sum_ok(iolat, &stat)) {
		if (latency_stat_samples(iolat, &iolat->cur_stat) <
		    BLKIOLATENCY_MIN_GOOD_SAMPLES)
			goto out;
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (lat_info->scale_lat == 0 ||
		   lat_info->scale_lat >= iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
	latency_stat_init(iolat, &iolat->cur_stat);
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}

static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now;
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	bool enabled = false;
	int inflight = 0;

	blkg = bio->bi_blkg;
	if (!blkg || !bio_flagged(bio, BIO_TRACKED))
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	enabled = blk_iolatency_enabled(iolat->blkiolat);
	if (!enabled)
		return;

	now = ktime_to_ns(ktime_get());
	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		inflight = atomic_dec_return(&rqw->inflight);
		WARN_ON_ONCE(inflight < 0);
		/*
		 * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
		 * submitted, so do not account for it.
		 */
		if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
			iolatency_record_time(iolat, &bio->bi_issue, now,
					      issue_as_root);
			window_start = atomic64_read(&iolat->window_start);
			if (now > window_start &&
			    (now - window_start) >= iolat->cur_win_nsec) {
				if (atomic64_cmpxchg(&iolat->window_start,
					     window_start, now) == window_start)
					iolatency_check_latencies(iolat, now);
			}
		}
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

	del_timer_sync(&blkiolat->timer);
	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
	kfree(blkiolat);
}

static struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};

static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.q->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_tryget(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and carry
		 * on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down isn't
		 * doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}

int blk_iolatency_init(struct request_queue *q)
{
	struct blk_iolatency *blkiolat;
	struct rq_qos *rqos;
	int ret;

	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
	if (!blkiolat)
		return -ENOMEM;

	rqos = &blkiolat->rqos;
	rqos->id = RQ_QOS_LATENCY;
	rqos->ops = &blkcg_iolatency_ops;
	rqos->q = q;

	rq_qos_add(q, rqos);

	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
	if (ret) {
		rq_qos_del(q, rqos);
		kfree(blkiolat);
		return ret;
	}

	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);

	return 0;
}

/*
 * Return 1 for enabling iolatency, return -1 for disabling iolatency,
 * otherwise return 0.
 */
static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	u64 oldval = iolat->min_lat_nsec;

	iolat->min_lat_nsec = val;
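	/*
	 * Illustrative: the window below is 16x the target, clamped to
	 * [100ms, 1s]; e.g. a 5ms target yields an 80ms window that gets
	 * clamped up to the 100ms minimum.
	 */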
	iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
				    BLKIOLATENCY_MAX_WIN_SIZE);

	if (!oldval && val)
		return 1;
	if (oldval && !val) {
		blkcg_clear_delay(blkg);
		return -1;
	}
	return 0;
}

static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{
	if (blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
		struct child_latency_info *lat_info;
		if (!iolat)
			return;

		lat_info = &iolat->child_lat;
		spin_lock(&lat_info->lock);
		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
		lat_info->last_scale_event = 0;
		lat_info->scale_grp = NULL;
		lat_info->scale_lat = 0;
		spin_unlock(&lat_info->lock);
	}
}

static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
				   size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkcg_gq *blkg;
	struct blkg_conf_ctx ctx;
	struct iolatency_grp *iolat;
	char *p, *tok;
	u64 lat_val = 0;
	u64 oldval;
	int ret;
	int enable = 0;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
	if (ret)
		return ret;

	iolat = blkg_to_lat(ctx.blkg);
	p = ctx.body;

	ret = -EINVAL;
	while ((tok = strsep(&p, " "))) {
		char key[16];
		char val[21];	/* 18446744073709551616 */

		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
			goto out;

		if (!strcmp(key, "target")) {
			u64 v;

			if (!strcmp(val, "max"))
				lat_val = 0;
			else if (sscanf(val, "%llu", &v) == 1)
				lat_val = v * NSEC_PER_USEC;
			else
				goto out;
		} else {
			goto out;
		}
	}

	/* Walk up the tree to see if our new val is lower than it should be. */
	blkg = ctx.blkg;
	oldval = iolat->min_lat_nsec;

	enable = iolatency_set_min_lat_nsec(blkg, lat_val);
	if (enable) {
		if (!blk_get_queue(blkg->q)) {
			ret = -ENODEV;
			goto out;
		}

		blkg_get(blkg);
	}

	if (oldval != iolat->min_lat_nsec) {
		iolatency_clear_scaling(blkg);
	}

	ret = 0;
out:
	blkg_conf_finish(&ctx);
	if (ret == 0 && enable) {
		struct iolatency_grp *tmp = blkg_to_lat(blkg);
		struct blk_iolatency *blkiolat = tmp->blkiolat;

		blk_mq_freeze_queue(blkg->q);

		if (enable == 1)
			atomic_inc(&blkiolat->enabled);
		else if (enable == -1)
			atomic_dec(&blkiolat->enabled);
		else
			WARN_ON_ONCE(1);

		blk_mq_unfreeze_queue(blkg->q);

		blkg_put(blkg);
		blk_put_queue(blkg->q);
	}
	return ret ?: nbytes;
}

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname || !iolat->min_lat_nsec)
		return 0;
	seq_printf(sf, "%s target=%llu\n",
		   dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
	return 0;
}

static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}

static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
	struct latency_stat stat;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
	}
	preempt_enable();

	if (iolat->rq_depth.max_depth == UINT_MAX)
		seq_printf(s, " missed=%llu total=%llu depth=max",
			   (unsigned long long)stat.ps.missed,
			   (unsigned long long)stat.ps.total);
	else
		seq_printf(s, " missed=%llu total=%llu depth=%u",
			   (unsigned long long)stat.ps.missed,
			   (unsigned long long)stat.ps.total,
			   iolat->rq_depth.max_depth);
	return true;
}

static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat;
	unsigned long long cur_win;

	if (!blkcg_debug_stats)
		return false;

	if (iolat->ssd)
		return iolatency_ssd_stat(iolat, s);

	avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
	if (iolat->rq_depth.max_depth == UINT_MAX)
		seq_printf(s, " depth=max avg_lat=%llu win=%llu",
			   avg_lat, cur_win);
	else
		seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
			   iolat->rq_depth.max_depth, avg_lat, cur_win);
	return true;
}

static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
						   struct request_queue *q,
						   struct blkcg *blkcg)
{
	struct iolatency_grp *iolat;

	iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
	if (!iolat)
		return NULL;
	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
				       __alignof__(struct latency_stat), gfp);
	if (!iolat->stats) {
		kfree(iolat);
		return NULL;
	}
	return &iolat->pd;
}

static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	if (blk_queue_nonrot(blkg->q))
		iolat->ssd = true;
	else
		iolat->ssd = false;

	for_each_possible_cpu(cpu) {
		struct latency_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_init(iolat, stat);
	}

	latency_stat_init(iolat, &iolat->cur_stat);
	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->rq_depth.queue_depth = blkg->q->nr_requests;
	iolat->rq_depth.max_depth = UINT_MAX;
	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	int ret;

	ret = iolatency_set_min_lat_nsec(blkg, 0);
	if (ret == 1)
		atomic_inc(&blkiolat->enabled);
	if (ret == -1)
		atomic_dec(&blkiolat->enabled);
	iolatency_clear_scaling(blkg);
}

static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}

static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes	= iolatency_files,
	.pd_alloc_fn	= iolatency_pd_alloc,
	.pd_init_fn	= iolatency_pd_init,
	.pd_offline_fn	= iolatency_pd_offline,
	.pd_free_fn	= iolatency_pd_free,
	.pd_stat_fn	= iolatency_pd_stat,
};

static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}

static void __exit iolatency_exit(void)
{
	blkcg_policy_unregister(&blkcg_policy_iolatency);
}

module_init(iolatency_init);
module_exit(iolatency_exit);