Blame - block/blk-iocost.c - SHIFTPHONES/mainline/linux

blob: a8e99ef76a08e3758adc6794d9580991b8207361 [file] [log] [blame]

Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0
				2	*
				3	* IO cost model based controller.
				4	*
				5	* Copyright (C) 2019 Tejun Heo <tj@kernel.org>
				6	* Copyright (C) 2019 Andy Newell <newella@fb.com>
				7	* Copyright (C) 2019 Facebook
				8	*
				9	* One challenge of controlling IO resources is the lack of trivially
				10	* observable cost metric. This is distinguished from CPU and memory where
				11	* wallclock time and the number of bytes can serve as accurate enough
				12	* approximations.
				13	*
				14	* Bandwidth and iops are the most commonly used metrics for IO devices but
				15	* depending on the type and specifics of the device, different IO patterns
				16	* easily lead to multiple orders of magnitude variations rendering them
				17	* useless for the purpose of IO capacity distribution. While on-device
				18	* time, with a lot of clutches, could serve as a useful approximation for
				19	* non-queued rotational devices, this is no longer viable with modern
				20	* devices, even the rotational ones.
				21	*
				22	* While there is no cost metric we can trivially observe, it isn't a
				23	* complete mystery. For example, on a rotational device, seek cost
				24	* dominates while a contiguous transfer contributes a smaller amount
				25	* proportional to the size. If we can characterize at least the relative
				26	* costs of these different types of IOs, it should be possible to
				27	* implement a reasonable work-conserving proportional IO resource
				28	* distribution.
				29	*
				30	* 1. IO Cost Model
				31	*
				32	* IO cost model estimates the cost of an IO given its basic parameters and
				33	* history (e.g. the end sector of the last IO). The cost is measured in
				34	* device time. If a given IO is estimated to cost 10ms, the device should
				35	* be able to process ~100 of those IOs in a second.
				36	*
				37	* Currently, there's only one builtin cost model - linear. Each IO is
				38	* classified as sequential or random and given a base cost accordingly.
				39	* On top of that, a size cost proportional to the length of the IO is
				40	* added. While simple, this model captures the operational
				41	* characteristics of a wide variety of devices well enough. Default
				42	* parameters for several different classes of devices are provided and the
				43	* parameters can be configured from userspace via
				44	* /sys/fs/cgroup/io.cost.model.
				45	*
				46	* If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
				47	* device-specific coefficients.
				48	*
				49	* 2. Control Strategy
				50	*
				51	* The device virtual time (vtime) is used as the primary control metric.
				52	* The control strategy is composed of the following three parts.
				53	*
				54	* 2-1. Vtime Distribution
				55	*
				56	* When a cgroup becomes active in terms of IOs, its hierarchical share is
				57	* calculated. Please consider the following hierarchy where the numbers
				58	* inside parentheses denote the configured weights.
				59	*
				60	*           root
				61	*         /       \
				62	*    A (w:100)  B (w:300)
				63	*    /       \
				64	*  A0 (w:100)  A1 (w:100)
				65	*
				66	* If B is idle and only A0 and A1 are actively issuing IOs, as the two are
				67	* of equal weight, each gets 50% share. If then B starts issuing IOs, B
				68	* gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
				69	* 12.5% each. The distribution mechanism only cares about these flattened
				70	* shares. They're called hweights (hierarchical weights) and always add
				71	* up to 1 (HWEIGHT_WHOLE).
				72	*
				73	* A given cgroup's vtime runs slower in inverse proportion to its hweight.
				74	* For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
				75	* against the device vtime - an IO which takes 10ms on the underlying
				76	* device is considered to take 80ms on A0.
				77	*
				78	* This constitutes the basis of IO capacity distribution. Each cgroup's
				79	* vtime is running at a rate determined by its hweight. A cgroup tracks
				80	* the vtime consumed by past IOs and can issue a new IO iff doing so
				81	* wouldn't outrun the current device vtime. Otherwise, the IO is
				82	* suspended until the vtime has progressed enough to cover it.
				83	*
				84	* 2-2. Vrate Adjustment
				85	*
				86	* It's unrealistic to expect the cost model to be perfect. There are too
				87	* many devices and even on the same device the overall performance
				88	* fluctuates depending on numerous factors such as IO mixture and device
				89	* internal garbage collection. The controller needs to adapt dynamically.
				90	*
				91	* This is achieved by adjusting the overall IO rate according to how busy
				92	* the device is. If the device becomes overloaded, we're sending down too
				93	* many IOs and should generally slow down. If there are waiting issuers
				94	* but the device isn't saturated, we're issuing too few and should
				95	* generally speed up.
				96	*
				97	* To slow down, we lower the vrate - the rate at which the device vtime
				98	* passes compared to the wall clock. For example, if the vtime is running
				99	* at the vrate of 75%, all cgroups added up would only be able to issue
				100	* 750ms worth of IOs per second, and vice-versa for speeding up.
				101	*
				102	* Device business is determined using two criteria - rq wait and
				103	* completion latencies.
				104	*
				105	* When a device gets saturated, the on-device and then the request queues
				106	* fill up and a bio which is ready to be issued has to wait for a request
				107	* to become available. When this delay becomes noticeable, it's a clear
				108	* indication that the device is saturated and we lower the vrate. This
				109	* saturation signal is fairly conservative as it only triggers when both
				110	* hardware and software queues are filled up, and is used as the default
				111	* busy signal.
				112	*
				113	* As devices can have deep queues and be unfair in how the queued commands
				114	* are executed, solely depending on rq wait may not result in satisfactory
				115	* control quality. For a better control quality, completion latency QoS
				116	* parameters can be configured so that the device is considered saturated
				117	* if N'th percentile completion latency rises above the set point.
				118	*
				119	* The completion latency requirements are a function of both the
				120	* underlying device characteristics and the desired IO latency quality of
				121	* service. There is an inherent trade-off - the tighter the latency QoS,
				122	* the higher the bandwidth lossage. Latency QoS is disabled by default
				123	* and can be set through /sys/fs/cgroup/io.cost.qos.
				124	*
				125	* 2-3. Work Conservation
				126	*
				127	* Imagine two cgroups A and B with equal weights. A is issuing a small IO
				128	* periodically while B is sending out enough parallel IOs to saturate the
				129	* device on its own. Let's say A's usage amounts to 100ms worth of IO
				130	* cost per second, i.e., 10% of the device capacity. The naive
				131	* distribution of half and half would lead to 60% utilization of the
				132	* device, a significant reduction in the total amount of work done
				133	* compared to free-for-all competition. This is too high a cost to pay
				134	* for IO control.
				135	*
				136	* To conserve the total amount of work done, we keep track of how much
				137	* each active cgroup is actually using and yield part of its weight if
				138	* there are other cgroups which can make use of it. In the above case,
				139	* A's weight will be lowered so that it hovers above the actual usage and
				140	* B would be able to use the rest.
				141	*
				142	* As we don't want to penalize a cgroup for donating its weight, the
				143	* surplus weight adjustment factors in a margin and has an immediate
				144	* snapback mechanism in case the cgroup needs more IO vtime for itself.
				145	*
				146	* Note that adjusting down surplus weights has the same effects as
				147	* accelerating vtime for other cgroups and work conservation can also be
				148	* implemented by adjusting vrate dynamically. However, squaring who can
				149	* donate and should take back how much requires hweight propagations
				150	* anyway making it easier to implement and understand as a separate
				151	* mechanism.
Tejun Heo	6954ff1	2019-08-28 15:05:59 -0700	[diff] [blame]	152	*
				153	* 3. Monitoring
				154	*
				155	* Instead of debugfs or other clumsy monitoring mechanisms, this
				156	* controller uses a drgn based monitoring script -
				157	* tools/cgroup/iocost_monitor.py. For details on drgn, please see
				158	* https://github.com/osandov/drgn. The output looks like the following.
				159	*
				160	* sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
Tejun Heo	7c1ee70	2019-09-04 12:45:56 -0700	[diff] [blame]	161	* active weight hweight% inflt% dbt delay usages%
				162	* test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
				163	* test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
Tejun Heo	6954ff1	2019-08-28 15:05:59 -0700	[diff] [blame]	164	*
				165	* - per : Timer period
				166	* - cur_per : Internal wall and device vtime clock
				167	* - vrate : Device virtual time rate against wall clock
				168	* - weight : Surplus-adjusted and configured weights
				169	* - hweight : Surplus-adjusted and configured hierarchical weights
				170	* - inflt : The percentage of in-flight IO cost at the end of last period
				171	* - del_ms : Deferred issuer delay induction level and duration
				172	* - usages : Usage history
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	173	*/
				174
				175	#include <linux/kernel.h>
				176	#include <linux/module.h>
				177	#include <linux/timer.h>
				178	#include <linux/time64.h>
				179	#include <linux/parser.h>
				180	#include <linux/sched/signal.h>
				181	#include <linux/blk-cgroup.h>
				182	#include "blk-rq-qos.h"
				183	#include "blk-stat.h"
				184	#include "blk-wbt.h"
				185
				186	#ifdef CONFIG_TRACEPOINTS
				187
				188	/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
				189	#define TRACE_IOCG_PATH_LEN 1024
				190	static DEFINE_SPINLOCK(trace_iocg_path_lock); /* serializes use of trace_iocg_path[] */
				191	static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; /* shared scratch buffer for the cgroup path */
				192	/* Emit trace_iocost_<type>(iocg, path, ...) if that tracepoint is enabled; the path is rendered into the shared buffer under the lock. */
				193	#define TRACE_IOCG_PATH(type, iocg, ...) \
				194	do { \
				195	unsigned long flags; \
				196	if (trace_iocost_##type##_enabled()) { \
				197	spin_lock_irqsave(&trace_iocg_path_lock, flags); \
				198	cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
				199	trace_iocg_path, TRACE_IOCG_PATH_LEN); \
				200	trace_iocost_##type(iocg, trace_iocg_path, \
				201	##__VA_ARGS__); \
				202	spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
				203	} \
				204	} while (0)
				205
				206	#else /* CONFIG_TRACEPOINTS */
				207	#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) /* no-op stub when tracepoints are compiled out */
				208	#endif /* CONFIG_TRACEPOINTS */
				209
				210	enum { /* controller-wide tunables and fixed-point scales */
				211	MILLION = 1000000, /* ppm scale used for vrate and percentile params */
				212
				213	/* timer period is calculated from latency requirements, bound it */
				214	MIN_PERIOD = USEC_PER_MSEC, /* in usecs */
				215	MAX_PERIOD = USEC_PER_SEC, /* in usecs */
				216
				217	/*
				218	* A cgroup's vtime can run 50% behind the device vtime, which
				219	* serves as its IO credit buffer. Surplus weight adjustment is
				220	* immediately canceled if the vtime margin runs below 10%.
				221	*/
				222	MARGIN_PCT = 50,
				223	INUSE_MARGIN_PCT = 10,
				224
				225	/* Have some play in waitq timer operations */
				226	WAITQ_TIMER_MARGIN_PCT = 5,
				227
				228	/*
				229	* vtime can wrap well within a reasonable uptime when vrate is
				230	* consistently raised. Don't trust recorded cgroup vtime if the
				231	* period counter indicates that it's older than 5mins.
				232	*/
				233	VTIME_VALID_DUR = 300 * USEC_PER_SEC,
				234
				235	/*
				236	* Remember the past three non-zero usages and use the max for
				237	* surplus calculation. Three slots guarantee that we remember one
				238	* full period usage from the last active stretch even after
				239	* partial deactivation and re-activation periods. Don't start
				240	* giving away weight before collecting two data points to prevent
				241	* hweight adjustments based on one partial activation period.
				242	*/
				243	NR_USAGE_SLOTS = 3,
				244	MIN_VALID_USAGES = 2,
				245
				246	/* 1/64k is granular enough and can easily be handled w/ u32 */
				247	HWEIGHT_WHOLE = 1 << 16,
				248
				249	/*
				250	* As vtime is used to calculate the cost of each IO, it needs to
				251	* be fairly high precision. For example, it should be able to
				252	* represent the cost of a single page worth of discard with
				253	* sufficient accuracy. At the same time, it should be able to
				254	* represent reasonably long enough durations to be useful and
				255	* convenient during operation.
				256	*
				257	* 1s worth of vtime is 2^37. This gives us both sub-nanosecond
				258	* granularity and days of wrap-around time even at extreme vrates.
				259	*/
				260	VTIME_PER_SEC_SHIFT = 37,
				261	VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
				262	VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, /* also the vtime_rate value that means 100% vrate */
				263
				264	/* bound vrate adjustments within two orders of magnitude */
				265	VRATE_MIN_PPM = 10000, /* 1% */
				266	VRATE_MAX_PPM = 100000000, /* 10000% */
				267
				268	VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, /* VRATE_MIN_PPM expressed as vtime per usec */
				269	VRATE_CLAMP_ADJ_PCT = 4,
				270
				271	/* if IOs end up waiting for requests, issue less */
				272	RQ_WAIT_BUSY_PCT = 5,
				273
				274	/* unbusy hysteresis */
				275	UNBUSY_THR_PCT = 75,
				276
				277	/* don't let cmds which take a very long time pin lagging for too long */
				278	MAX_LAGGING_PERIODS = 10,
				279
				280	/*
				281	* If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
				282	* donate the surplus.
				283	*/
				284	SURPLUS_SCALE_PCT = 125, /* * 125% */
				285	SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
				286	SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
				287
				288	/* switch iff the conditions are met for longer than this */
				289	AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
				290
				291	/*
				292	* Count IO size in 4k pages. The 12bit shift helps keeping
				293	* size-proportional components of cost calculation in closer
				294	* numbers of digits to per-IO cost components.
				295	*/
				296	IOC_PAGE_SHIFT = 12,
				297	IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
				298	IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
				299
				300	/* if apart further than 16M, consider randio for linear model */
				301	LCOEF_RANDIO_PAGES = 4096, /* 4096 pages * 4k == 16M */
				302	};
				303
				304	enum ioc_running { /* run state kept in ioc->running */
				305	IOC_IDLE,
				306	IOC_RUNNING, /* the only state in which ioc_start_period() expects to be called */
				307	IOC_STOP,
				308	};
				309
				310	/* io.cost.qos controls including per-dev enable of the whole controller */
				311	enum {
				312	QOS_ENABLE,
				313	QOS_CTRL,
				314	NR_QOS_CTRL_PARAMS, /* count of the entries above */
				315	};
				316
				317	/* io.cost.qos params */
				318	enum { /* indices into ioc_params.qos[] */
				319	QOS_RPPM, /* read latency percentile, in ppm */
				320	QOS_RLAT, /* read latency target, in usecs */
				321	QOS_WPPM, /* write latency percentile, in ppm */
				322	QOS_WLAT, /* write latency target, in usecs */
				323	QOS_MIN, /* lower vrate bound, in ppm */
				324	QOS_MAX, /* upper vrate bound, in ppm */
				325	NR_QOS_PARAMS, /* count of the entries above */
				326	};
				327
				328	/* io.cost.model controls */
				329	enum {
				330	COST_CTRL,
				331	COST_MODEL,
				332	NR_COST_CTRL_PARAMS, /* count of the entries above */
				333	};
				334
				335	/* builtin linear cost model coefficients */
				336	enum { /* indices into ioc_params.i_lcoefs[], the inputs handed to calc_lcoefs() */
				337	I_LCOEF_RBPS,
				338	I_LCOEF_RSEQIOPS,
				339	I_LCOEF_RRANDIOPS,
				340	I_LCOEF_WBPS,
				341	I_LCOEF_WSEQIOPS,
				342	I_LCOEF_WRANDIOPS,
				343	NR_I_LCOEFS, /* count of the entries above */
				344	};
				345
				346	enum { /* indices into ioc_params.lcoefs[], the costs derived by ioc_refresh_lcoefs() */
				347	LCOEF_RPAGE,
				348	LCOEF_RSEQIO,
				349	LCOEF_RRANDIO,
				350	LCOEF_WPAGE,
				351	LCOEF_WSEQIO,
				352	LCOEF_WRANDIO,
				353	NR_LCOEFS, /* count of the entries above */
				354	};
				355
				356	enum { /* indices into autop[] */
				357	AUTOP_INVALID, /* 0; autop[] has no initializer for this slot */
				358	AUTOP_HDD,
				359	AUTOP_SSD_QD1,
				360	AUTOP_SSD_DFL,
				361	AUTOP_SSD_FAST,
				362	};
				363
				364	struct ioc_gq;
				365
				366	struct ioc_params { /* one QoS + cost-model parameter set */
				367	u32 qos[NR_QOS_PARAMS]; /* indexed by QOS_* */
				368	u64 i_lcoefs[NR_I_LCOEFS]; /* cost model inputs, indexed by I_LCOEF_* */
				369	u64 lcoefs[NR_LCOEFS]; /* derived from i_lcoefs by ioc_refresh_lcoefs(), indexed by LCOEF_* */
				370	u32 too_fast_vrate_pct; /* 0 = unused; else ioc_autop_idx() steps up once vrate% stays at or above this */
				371	u32 too_slow_vrate_pct; /* 0 = unused; else ioc_autop_idx() steps down once vrate% stays at or below this */
				372	};
				373
				374	struct ioc_missed { /* NOTE(review): users are outside this view; presumably latency-target met/missed counters - confirm */
				375	u32 nr_met;
				376	u32 nr_missed;
				377	u32 last_met; /* presumably the previous snapshot of nr_met - confirm */
				378	u32 last_missed; /* presumably the previous snapshot of nr_missed - confirm */
				379	};
				380
				381	struct ioc_pcpu_stat { /* per-cpu statistics; allocated as ioc->pcpu_stat */
				382	struct ioc_missed missed[2]; /* NOTE(review): presumably one slot each for reads and writes - confirm */
				383
				384	u64 rq_wait_ns;
				385	u64 last_rq_wait_ns; /* presumably the previous snapshot of rq_wait_ns - confirm */
				386	};
				387
				388	/* per device */
				389	struct ioc {
				390	struct rq_qos rqos; /* embedded; rqos_to_ioc() maps it back to the ioc */
				391
				392	bool enabled;
				393
				394	struct ioc_params params; /* parameter set currently in effect */
				395	u32 period_us; /* timer period, set by ioc_refresh_period_us() */
				396	u32 margin_us; /* MARGIN_PCT of period_us */
				397	u64 vrate_min; /* params.qos[QOS_MIN] scaled from ppm to vtime per usec */
				398	u64 vrate_max; /* params.qos[QOS_MAX] scaled from ppm to vtime per usec */
				399
				400	spinlock_t lock;
				401	struct timer_list timer; /* armed by ioc_start_period() */
				402	struct list_head active_iocgs; /* active cgroups */
				403	struct ioc_pcpu_stat __percpu *pcpu_stat;
				404
				405	enum ioc_running running;
				406	atomic64_t vtime_rate; /* current vrate; VTIME_PER_USEC means 100% */
				407
				408	seqcount_t period_seqcount; /* guards period_at and period_at_vtime for ioc_now() */
				409	u32 period_at; /* wallclock starttime */
				410	u64 period_at_vtime; /* vtime starttime */
				411
				412	atomic64_t cur_period; /* inc'd each period */
				413	int busy_level; /* saturation history */
				414
				415	u64 inuse_margin_vtime; /* INUSE_MARGIN_PCT of a period, in vtime */
				416	bool weights_updated;
				417	atomic_t hweight_gen; /* for lazy hweights */
				418
				419	u64 autop_too_fast_at; /* ktime ns when vrate first met the too-fast threshold, 0 if it currently doesn't */
				420	u64 autop_too_slow_at; /* ktime ns when vrate first met the too-slow threshold, 0 if it currently doesn't */
				421	int autop_idx; /* AUTOP_* index of the parameter set in use */
				422	bool user_qos_params:1; /* set: ioc_refresh_params() leaves params.qos alone */
				423	bool user_cost_model:1; /* set: ioc_refresh_params() leaves params.i_lcoefs alone */
				424	};
				425
				426	/* per device-cgroup pair */
				427	struct ioc_gq {
				428	struct blkg_policy_data pd; /* embedded; pd_to_iocg() maps it back to the iocg */
				429	struct ioc *ioc; /* the device side of the pair */
				430
				431	/*
				432	* A iocg can get its weight from two sources - an explicit
				433	* per-device-cgroup configuration or the default weight of the
				434	* cgroup. `cfg_weight` is the explicit per-device-cgroup
				435	* configuration. `weight` is the effective considering both
				436	* sources.
				437	*
				438	* When an idle cgroup becomes active its `active` goes from 0 to
				439	* `weight`. `inuse` is the surplus adjusted active weight.
				440	* `active` and `inuse` are used to calculate `hweight_active` and
				441	* `hweight_inuse`.
				442	*
				443	* `last_inuse` remembers `inuse` while an iocg is idle to persist
				444	* surplus adjustments.
				445	*/
				446	u32 cfg_weight;
				447	u32 weight;
				448	u32 active;
				449	u32 inuse;
				450	u32 last_inuse;
				451
				452	sector_t cursor; /* to detect randio */
				453
				454	/*
				455	* `vtime` is this iocg's vtime cursor which progresses as IOs are
				456	* issued. If lagging behind device vtime, the delta represents
				457	* the currently available IO budget. If running ahead, the
				458	* overage.
				459	*
				460	* `done_vtime` is the same but progressed on completion rather
				461	* than issue. The delta behind `vtime` represents the cost of
				462	* currently in-flight IOs.
				463	*
				464	* `last_vtime` is used to remember `vtime` at the end of the last
				465	* period to calculate utilization.
				466	*/
				467	atomic64_t vtime;
				468	atomic64_t done_vtime;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	469	atomic64_t abs_vdebt; /* NOTE(review): not covered by the comment above; name suggests debt in absolute vtime - confirm */
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	470	u64 last_vtime;
				471
				472	/*
				473	* The period this iocg was last active in. Used for deactivation
				474	* and invalidating `vtime`.
				475	*/
				476	atomic64_t active_period;
				477	struct list_head active_list;
				478
				479	/* see __propagate_active_weight() and current_hweight() for details */
				480	u64 child_active_sum; /* sum of the children's `active` */
				481	u64 child_inuse_sum; /* sum of the children's `inuse` */
				482	int hweight_gen;
				483	u32 hweight_active;
				484	u32 hweight_inuse;
				485	bool has_surplus;
				486
				487	struct wait_queue_head waitq;
				488	struct hrtimer waitq_timer;
				489	struct hrtimer delay_timer;
				490
				491	/* usage is recorded as fractions of HWEIGHT_WHOLE */
				492	int usage_idx;
				493	u32 usages[NR_USAGE_SLOTS];
				494
				495	/* this iocg's depth in the hierarchy and ancestors including self */
				496	int level;
				497	struct ioc_gq *ancestors[]; /* flexible array; must stay the last member */
				498	};
				499
				500	/* per cgroup */
				501	struct ioc_cgrp {
				502	struct blkcg_policy_data cpd; /* embedded; blkcg_to_iocc() maps it back to the ioc_cgrp */
				503	unsigned int dfl_weight; /* NOTE(review): presumably the cgroup default weight mentioned in struct ioc_gq - confirm */
				504	};
				505
				506	struct ioc_now { /* time snapshot filled in by ioc_now() */
				507	u64 now_ns; /* ktime_get() at snapshot time */
				508	u32 now; /* the same instant in usecs (ktime_to_us) */
				509	u64 vnow; /* device vtime at that instant */
				510	u64 vrate; /* ioc->vtime_rate at snapshot time */
				511	};
				512
				513	struct iocg_wait { /* NOTE(review): users are outside this view; presumably one entry per bio parked on iocg->waitq - confirm */
				514	struct wait_queue_entry wait;
				515	struct bio *bio;
				516	u64 abs_cost; /* presumably the bio's cost before hweight scaling, cf. abs_cost_to_cost() - confirm */
				517	bool committed;
				518	};
				519
				520	struct iocg_wake_ctx { /* NOTE(review): users are outside this view; presumably state passed to the waitq wake callback - confirm */
				521	struct ioc_gq *iocg;
				522	u32 hw_inuse;
				523	s64 vbudget; /* signed */
				524	};
				525
				526	static const struct ioc_params autop[] = { /* built-in parameter sets, indexed by AUTOP_*; chosen by ioc_autop_idx() */
				527	[AUTOP_HDD] = {
				528	.qos = {
Tejun Heo	7afccca	2019-09-25 16:03:35 -0700	[diff] [blame]	529	[QOS_RLAT] = 250000, /* 250ms */
				530	[QOS_WLAT] = 250000,
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	531	[QOS_MIN] = VRATE_MIN_PPM,
				532	[QOS_MAX] = VRATE_MAX_PPM,
				533	},
				534	.i_lcoefs = {
				535	[I_LCOEF_RBPS] = 174019176,
				536	[I_LCOEF_RSEQIOPS] = 41708,
				537	[I_LCOEF_RRANDIOPS] = 370,
				538	[I_LCOEF_WBPS] = 178075866,
				539	[I_LCOEF_WSEQIOPS] = 42705,
				540	[I_LCOEF_WRANDIOPS] = 378,
				541	},
				542	},
				543	[AUTOP_SSD_QD1] = {
				544	.qos = {
				545	[QOS_RLAT] = 25000, /* 25ms */
				546	[QOS_WLAT] = 25000,
				547	[QOS_MIN] = VRATE_MIN_PPM,
				548	[QOS_MAX] = VRATE_MAX_PPM,
				549	},
				550	.i_lcoefs = {
				551	[I_LCOEF_RBPS] = 245855193,
				552	[I_LCOEF_RSEQIOPS] = 61575,
				553	[I_LCOEF_RRANDIOPS] = 6946,
				554	[I_LCOEF_WBPS] = 141365009,
				555	[I_LCOEF_WSEQIOPS] = 33716,
				556	[I_LCOEF_WRANDIOPS] = 26796,
				557	},
				558	},
				559	[AUTOP_SSD_DFL] = {
				560	.qos = {
				561	[QOS_RLAT] = 25000, /* 25ms */
				562	[QOS_WLAT] = 25000,
				563	[QOS_MIN] = VRATE_MIN_PPM,
				564	[QOS_MAX] = VRATE_MAX_PPM,
				565	},
				566	.i_lcoefs = {
				567	[I_LCOEF_RBPS] = 488636629,
				568	[I_LCOEF_RSEQIOPS] = 8932,
				569	[I_LCOEF_RRANDIOPS] = 8518,
				570	[I_LCOEF_WBPS] = 427891549,
				571	[I_LCOEF_WSEQIOPS] = 28755,
				572	[I_LCOEF_WRANDIOPS] = 21940,
				573	},
				574	.too_fast_vrate_pct = 500, /* vrate% at or above this for AUTOP_CYCLE_NSEC moves to the next set */
				575	},
				576	[AUTOP_SSD_FAST] = {
				577	.qos = {
				578	[QOS_RLAT] = 5000, /* 5ms */
				579	[QOS_WLAT] = 5000,
				580	[QOS_MIN] = VRATE_MIN_PPM,
				581	[QOS_MAX] = VRATE_MAX_PPM,
				582	},
				583	.i_lcoefs = {
				584	[I_LCOEF_RBPS] = 3102524156LLU,
				585	[I_LCOEF_RSEQIOPS] = 724816,
				586	[I_LCOEF_RRANDIOPS] = 778122,
				587	[I_LCOEF_WBPS] = 1742780862LLU,
				588	[I_LCOEF_WSEQIOPS] = 425702,
				589	[I_LCOEF_WRANDIOPS] = 443193,
				590	},
				591	.too_slow_vrate_pct = 10, /* vrate% at or below this for AUTOP_CYCLE_NSEC moves to the previous set */
				592	},
				593	};
				594
				595	/*
				596	* vrate adjust percentages indexed by ioc->busy_level. We adjust up on
				597	* vtime credit shortage and down on device saturation.
				598	*/
				599	static u32 vrate_adj_pct[] = /* NOTE(review): busy_level is a signed int; presumably its magnitude is the index - confirm against the user */
				600	{ 0, 0, 0, 0, /* first 4 levels: no adjustment */
				601	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* next 16 levels: 1% */
				602	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* next 16 levels: 2% */
				603	4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; /* 8 levels of 4%, 8 of 8%, then 16% */
				604
				605	static struct blkcg_policy blkcg_policy_iocost; /* declared ahead of its use in blkg_to_iocg() and blkcg_to_iocc() */
				606
				607	/* accessors and helpers */
				608	static struct ioc rqos_to_ioc(struct rq_qos rqos)
				609	{
				610	return container_of(rqos, struct ioc, rqos);
				611	}
				612
				613	static struct ioc q_to_ioc(struct request_queue q)
				614	{
				615	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
				616	}
				617
				618	static const char q_name(struct request_queue q)
				619	{
				620	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
				621	return kobject_name(q->kobj.parent);
				622	else
				623	return "<unknown>";
				624	}
				625
				626	static const char __maybe_unused ioc_name(struct ioc ioc)
				627	{
				628	return q_name(ioc->rqos.q);
				629	}
				630
				631	static struct ioc_gq pd_to_iocg(struct blkg_policy_data pd)
				632	{
				633	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
				634	}
				635
				636	static struct ioc_gq blkg_to_iocg(struct blkcg_gq blkg)
				637	{
				638	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
				639	}
				640
				641	static struct blkcg_gq iocg_to_blkg(struct ioc_gq iocg)
				642	{
				643	return pd_to_blkg(&iocg->pd);
				644	}
				645
				646	static struct ioc_cgrp blkcg_to_iocc(struct blkcg blkcg)
				647	{
				648	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
				649	struct ioc_cgrp, cpd);
				650	}
				651
				652	/*
				653	* Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	654	* weight, the more expensive each IO. Must round up.
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	655	*/
				656	static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
				657	{
				658	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
				659	}
				660
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	661	/*
				662	* The inverse of abs_cost_to_cost(). Must round up.
				663	*/
				664	static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
				665	{
				666	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
				667	}
				668
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	669	static void iocg_commit_bio(struct ioc_gq iocg, struct bio bio, u64 cost)
				670	{
				671	bio->bi_iocost_cost = cost;
				672	atomic64_add(cost, &iocg->vtime);
				673	}
				674
				675	#define CREATE_TRACE_POINTS
				676	#include <trace/events/iocost.h>
				677
				678	/* latency Qos params changed, update period_us and all the dependent params */
				679	static void ioc_refresh_period_us(struct ioc *ioc)
				680	{
				681	u32 ppm, lat, multi, period_us;
				682
				683	lockdep_assert_held(&ioc->lock);
				684
				685	/* pick the higher latency target */
				686	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
				687	ppm = ioc->params.qos[QOS_RPPM];
				688	lat = ioc->params.qos[QOS_RLAT];
				689	} else {
				690	ppm = ioc->params.qos[QOS_WPPM];
				691	lat = ioc->params.qos[QOS_WLAT];
				692	}
				693
				694	/*
				695	* We want the period to be long enough to contain a healthy number
				696	* of IOs while short enough for granular control. Define it as a
				697	* multiple of the latency target. Ideally, the multiplier should
				698	* be scaled according to the percentile so that it would nominally
				699	* contain a certain number of requests. Let's be simpler and
				700	* scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
				701	*/
				702	if (ppm)
				703	multi = max_t(u32, (MILLION - ppm) / 50000, 2);
				704	else
				705	multi = 2;
				706	period_us = multi * lat;
				707	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
				708
				709	/* calculate dependent params */
				710	ioc->period_us = period_us;
				711	ioc->margin_us = period_us * MARGIN_PCT / 100;
				712	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
				713	period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
				714	}
				715
				716	static int ioc_autop_idx(struct ioc *ioc)
				717	{
				718	int idx = ioc->autop_idx;
				719	const struct ioc_params *p = &autop[idx];
				720	u32 vrate_pct;
				721	u64 now_ns;
				722
				723	/* rotational? */
				724	if (!blk_queue_nonrot(ioc->rqos.q))
				725	return AUTOP_HDD;
				726
				727	/* handle SATA SSDs w/ broken NCQ */
				728	if (blk_queue_depth(ioc->rqos.q) == 1)
				729	return AUTOP_SSD_QD1;
				730
				731	/* use one of the normal ssd sets */
				732	if (idx < AUTOP_SSD_DFL)
				733	return AUTOP_SSD_DFL;
				734
				735	/* if user is overriding anything, maintain what was there */
				736	if (ioc->user_qos_params \|\| ioc->user_cost_model)
				737	return idx;
				738
				739	/* step up/down based on the vrate */
				740	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
				741	VTIME_PER_USEC);
				742	now_ns = ktime_get_ns();
				743
				744	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
				745	if (!ioc->autop_too_fast_at)
				746	ioc->autop_too_fast_at = now_ns;
				747	if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
				748	return idx + 1;
				749	} else {
				750	ioc->autop_too_fast_at = 0;
				751	}
				752
				753	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
				754	if (!ioc->autop_too_slow_at)
				755	ioc->autop_too_slow_at = now_ns;
				756	if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
				757	return idx - 1;
				758	} else {
				759	ioc->autop_too_slow_at = 0;
				760	}
				761
				762	return idx;
				763	}
				764
				765	/*
				766	* Take the followings as input
				767	*
				768	* @bps maximum sequential throughput
				769	* @seqiops maximum sequential 4k iops
				770	* @randiops maximum random 4k iops
				771	*
				772	* and calculate the linear model cost coefficients.
				773	*
				774	* *@page per-page cost 1s / (@bps / 4096)
				775	* @seqio base cost of a seq IO max((1s / @seqiops) - @page, 0)
				776	* @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
				777	*/
				778	static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
				779	u64 page, u64 seqio, u64 *randio)
				780	{
				781	u64 v;
				782
				783	page = seqio = *randio = 0;
				784
				785	if (bps)
				786	*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
				787	DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
				788
				789	if (seqiops) {
				790	v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
				791	if (v > *page)
				792	seqio = v - page;
				793	}
				794
				795	if (randiops) {
				796	v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
				797	if (v > *page)
				798	randio = v - page;
				799	}
				800	}
				801
				802	static void ioc_refresh_lcoefs(struct ioc *ioc)
				803	{
				804	u64 *u = ioc->params.i_lcoefs;
				805	u64 *c = ioc->params.lcoefs;
				806
				807	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
				808	&c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
				809	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
				810	&c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
				811	}
				812
				813	static bool ioc_refresh_params(struct ioc *ioc, bool force)
				814	{
				815	const struct ioc_params *p;
				816	int idx;
				817
				818	lockdep_assert_held(&ioc->lock);
				819
				820	idx = ioc_autop_idx(ioc);
				821	p = &autop[idx];
				822
				823	if (idx == ioc->autop_idx && !force)
				824	return false;
				825
				826	if (idx != ioc->autop_idx)
				827	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
				828
				829	ioc->autop_idx = idx;
				830	ioc->autop_too_fast_at = 0;
				831	ioc->autop_too_slow_at = 0;
				832
				833	if (!ioc->user_qos_params)
				834	memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
				835	if (!ioc->user_cost_model)
				836	memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
				837
				838	ioc_refresh_period_us(ioc);
				839	ioc_refresh_lcoefs(ioc);
				840
				841	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
				842	VTIME_PER_USEC, MILLION);
				843	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
				844	VTIME_PER_USEC, MILLION);
				845
				846	return true;
				847	}
				848
				849	/* take a snapshot of the current [v]time and vrate */
				850	static void ioc_now(struct ioc ioc, struct ioc_now now)
				851	{
				852	unsigned seq;
				853
				854	now->now_ns = ktime_get();
				855	now->now = ktime_to_us(now->now_ns);
				856	now->vrate = atomic64_read(&ioc->vtime_rate);
				857
				858	/*
				859	* The current vtime is
				860	*
				861	* vtime at period start + (wallclock time since the start) * vrate
				862	*
				863	* As a consistent snapshot of `period_at_vtime` and `period_at` is
				864	* needed, they're seqcount protected.
				865	*/
				866	do {
				867	seq = read_seqcount_begin(&ioc->period_seqcount);
				868	now->vnow = ioc->period_at_vtime +
				869	(now->now - ioc->period_at) * now->vrate;
				870	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
				871	}
				872
				873	static void ioc_start_period(struct ioc ioc, struct ioc_now now)
				874	{
				875	lockdep_assert_held(&ioc->lock);
				876	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
				877
				878	write_seqcount_begin(&ioc->period_seqcount);
				879	ioc->period_at = now->now;
				880	ioc->period_at_vtime = now->vnow;
				881	write_seqcount_end(&ioc->period_seqcount);
				882
				883	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
				884	add_timer(&ioc->timer);
				885	}
				886
				887	/*
				888	* Update @iocg's `active` and `inuse` to @active and @inuse, update level
				889	* weight sums and propagate upwards accordingly.
				890	*/
				891	static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
				892	{
				893	struct ioc *ioc = iocg->ioc;
				894	int lvl;
				895
				896	lockdep_assert_held(&ioc->lock);
				897
				898	inuse = min(active, inuse);
				899
				900	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
				901	struct ioc_gq *parent = iocg->ancestors[lvl];
				902	struct ioc_gq *child = iocg->ancestors[lvl + 1];
				903	u32 parent_active = 0, parent_inuse = 0;
				904
				905	/* update the level sums */
				906	parent->child_active_sum += (s32)(active - child->active);
				907	parent->child_inuse_sum += (s32)(inuse - child->inuse);
				908	/* apply the udpates */
				909	child->active = active;
				910	child->inuse = inuse;
				911
				912	/*
				913	* The delta between inuse and active sums indicates that
				914	* that much of weight is being given away. Parent's inuse
				915	* and active should reflect the ratio.
				916	*/
				917	if (parent->child_active_sum) {
				918	parent_active = parent->weight;
				919	parent_inuse = DIV64_U64_ROUND_UP(
				920	parent_active * parent->child_inuse_sum,
				921	parent->child_active_sum);
				922	}
				923
				924	/* do we need to keep walking up? */
				925	if (parent_active == parent->active &&
				926	parent_inuse == parent->inuse)
				927	break;
				928
				929	active = parent_active;
				930	inuse = parent_inuse;
				931	}
				932
				933	ioc->weights_updated = true;
				934	}
				935
				936	static void commit_active_weights(struct ioc *ioc)
				937	{
				938	lockdep_assert_held(&ioc->lock);
				939
				940	if (ioc->weights_updated) {
				941	/* paired with rmb in current_hweight(), see there */
				942	smp_wmb();
				943	atomic_inc(&ioc->hweight_gen);
				944	ioc->weights_updated = false;
				945	}
				946	}
				947
				948	static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
				949	{
				950	__propagate_active_weight(iocg, active, inuse);
				951	commit_active_weights(iocg->ioc);
				952	}
				953
				954	static void current_hweight(struct ioc_gq iocg, u32 hw_activep, u32 *hw_inusep)
				955	{
				956	struct ioc *ioc = iocg->ioc;
				957	int lvl;
				958	u32 hwa, hwi;
				959	int ioc_gen;
				960
				961	/* hot path - if uptodate, use cached */
				962	ioc_gen = atomic_read(&ioc->hweight_gen);
				963	if (ioc_gen == iocg->hweight_gen)
				964	goto out;
				965
				966	/*
				967	* Paired with wmb in commit_active_weights(). If we saw the
				968	* updated hweight_gen, all the weight updates from
				969	* __propagate_active_weight() are visible too.
				970	*
				971	* We can race with weight updates during calculation and get it
				972	* wrong. However, hweight_gen would have changed and a future
				973	* reader will recalculate and we're guaranteed to discard the
				974	* wrong result soon.
				975	*/
				976	smp_rmb();
				977
				978	hwa = hwi = HWEIGHT_WHOLE;
				979	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
				980	struct ioc_gq *parent = iocg->ancestors[lvl];
				981	struct ioc_gq *child = iocg->ancestors[lvl + 1];
				982	u32 active_sum = READ_ONCE(parent->child_active_sum);
				983	u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
				984	u32 active = READ_ONCE(child->active);
				985	u32 inuse = READ_ONCE(child->inuse);
				986
				987	/* we can race with deactivations and either may read as zero */
				988	if (!active_sum \|\| !inuse_sum)
				989	continue;
				990
				991	active_sum = max(active, active_sum);
				992	hwa = hwa * active / active_sum; /* max 16bits * 10000 */
				993
				994	inuse_sum = max(inuse, inuse_sum);
				995	hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
				996	}
				997
				998	iocg->hweight_active = max_t(u32, hwa, 1);
				999	iocg->hweight_inuse = max_t(u32, hwi, 1);
				1000	iocg->hweight_gen = ioc_gen;
				1001	out:
				1002	if (hw_activep)
				1003	*hw_activep = iocg->hweight_active;
				1004	if (hw_inusep)
				1005	*hw_inusep = iocg->hweight_inuse;
				1006	}
				1007
				1008	static void weight_updated(struct ioc_gq *iocg)
				1009	{
				1010	struct ioc *ioc = iocg->ioc;
				1011	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
				1012	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
				1013	u32 weight;
				1014
				1015	lockdep_assert_held(&ioc->lock);
				1016
				1017	weight = iocg->cfg_weight ?: iocc->dfl_weight;
				1018	if (weight != iocg->weight && iocg->active)
				1019	propagate_active_weight(iocg, weight,
				1020	DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
				1021	iocg->weight = weight;
				1022	}
				1023
				1024	static bool iocg_activate(struct ioc_gq iocg, struct ioc_now now)
				1025	{
				1026	struct ioc *ioc = iocg->ioc;
				1027	u64 last_period, cur_period, max_period_delta;
				1028	u64 vtime, vmargin, vmin;
				1029	int i;
				1030
				1031	/*
				1032	* If seem to be already active, just update the stamp to tell the
				1033	* timer that we're still active. We don't mind occassional races.
				1034	*/
				1035	if (!list_empty(&iocg->active_list)) {
				1036	ioc_now(ioc, now);
				1037	cur_period = atomic64_read(&ioc->cur_period);
				1038	if (atomic64_read(&iocg->active_period) != cur_period)
				1039	atomic64_set(&iocg->active_period, cur_period);
				1040	return true;
				1041	}
				1042
				1043	/* racy check on internal node IOs, treat as root level IOs */
				1044	if (iocg->child_active_sum)
				1045	return false;
				1046
				1047	spin_lock_irq(&ioc->lock);
				1048
				1049	ioc_now(ioc, now);
				1050
				1051	/* update period */
				1052	cur_period = atomic64_read(&ioc->cur_period);
				1053	last_period = atomic64_read(&iocg->active_period);
				1054	atomic64_set(&iocg->active_period, cur_period);
				1055
				1056	/* already activated or breaking leaf-only constraint? */
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1057	if (!list_empty(&iocg->active_list))
				1058	goto succeed_unlock;
				1059	for (i = iocg->level - 1; i > 0; i--)
				1060	if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1061	goto fail_unlock;
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1062
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1063	if (iocg->child_active_sum)
				1064	goto fail_unlock;
				1065
				1066	/*
				1067	* vtime may wrap when vrate is raised substantially due to
				1068	* underestimated IO costs. Look at the period and ignore its
				1069	* vtime if the iocg has been idle for too long. Also, cap the
				1070	* budget it can start with to the margin.
				1071	*/
				1072	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
				1073	vtime = atomic64_read(&iocg->vtime);
				1074	vmargin = ioc->margin_us * now->vrate;
				1075	vmin = now->vnow - vmargin;
				1076
				1077	if (last_period + max_period_delta < cur_period \|\|
				1078	time_before64(vtime, vmin)) {
				1079	atomic64_add(vmin - vtime, &iocg->vtime);
				1080	atomic64_add(vmin - vtime, &iocg->done_vtime);
				1081	vtime = vmin;
				1082	}
				1083
				1084	/*
				1085	* Activate, propagate weight and start period timer if not
				1086	* running. Reset hweight_gen to avoid accidental match from
				1087	* wrapping.
				1088	*/
				1089	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
				1090	list_add(&iocg->active_list, &ioc->active_iocgs);
				1091	propagate_active_weight(iocg, iocg->weight,
				1092	iocg->last_inuse ?: iocg->weight);
				1093
				1094	TRACE_IOCG_PATH(iocg_activate, iocg, now,
				1095	last_period, cur_period, vtime);
				1096
				1097	iocg->last_vtime = vtime;
				1098
				1099	if (ioc->running == IOC_IDLE) {
				1100	ioc->running = IOC_RUNNING;
				1101	ioc_start_period(ioc, now);
				1102	}
				1103
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1104	succeed_unlock:
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1105	spin_unlock_irq(&ioc->lock);
				1106	return true;
				1107
				1108	fail_unlock:
				1109	spin_unlock_irq(&ioc->lock);
				1110	return false;
				1111	}
				1112
				1113	static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
				1114	int flags, void *key)
				1115	{
				1116	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
				1117	struct iocg_wake_ctx ctx = (struct iocg_wake_ctx )key;
				1118	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
				1119
				1120	ctx->vbudget -= cost;
				1121
				1122	if (ctx->vbudget < 0)
				1123	return -1;
				1124
				1125	iocg_commit_bio(ctx->iocg, wait->bio, cost);
				1126
				1127	/*
				1128	* autoremove_wake_function() removes the wait entry only when it
				1129	* actually changed the task state. We want the wait always
				1130	* removed. Remove explicitly and use default_wake_function().
				1131	*/
				1132	list_del_init(&wq_entry->entry);
				1133	wait->committed = true;
				1134
				1135	default_wake_function(wq_entry, mode, flags, key);
				1136	return 0;
				1137	}
				1138
				1139	static void iocg_kick_waitq(struct ioc_gq iocg, struct ioc_now now)
				1140	{
				1141	struct ioc *ioc = iocg->ioc;
				1142	struct iocg_wake_ctx ctx = { .iocg = iocg };
				1143	u64 margin_ns = (u64)(ioc->period_us *
				1144	WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1145	u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
				1146	s64 vbudget;
				1147	u32 hw_inuse;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1148
				1149	lockdep_assert_held(&iocg->waitq.lock);
				1150
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1151	current_hweight(iocg, NULL, &hw_inuse);
				1152	vbudget = now->vnow - atomic64_read(&iocg->vtime);
				1153
				1154	/* pay off debt */
				1155	abs_vdebt = atomic64_read(&iocg->abs_vdebt);
				1156	vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
				1157	if (vdebt && vbudget > 0) {
				1158	u64 delta = min_t(u64, vbudget, vdebt);
				1159	u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
				1160	abs_vdebt);
				1161
				1162	atomic64_add(delta, &iocg->vtime);
				1163	atomic64_add(delta, &iocg->done_vtime);
				1164	atomic64_sub(abs_delta, &iocg->abs_vdebt);
				1165	if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
				1166	atomic64_set(&iocg->abs_vdebt, 0);
				1167	}
				1168
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1169	/*
				1170	* Wake up the ones which are due and see how much vtime we'll need
				1171	* for the next one.
				1172	*/
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1173	ctx.hw_inuse = hw_inuse;
				1174	ctx.vbudget = vbudget - vdebt;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1175	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
				1176	if (!waitqueue_active(&iocg->waitq))
				1177	return;
				1178	if (WARN_ON_ONCE(ctx.vbudget >= 0))
				1179	return;
				1180
				1181	/* determine next wakeup, add a quarter margin to guarantee chunking */
				1182	vshortage = -ctx.vbudget;
				1183	expires = now->now_ns +
				1184	DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
				1185	expires += margin_ns / 4;
				1186
				1187	/* if already active and close enough, don't bother */
				1188	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
				1189	if (hrtimer_is_queued(&iocg->waitq_timer) &&
				1190	abs(oexpires - expires) <= margin_ns / 4)
				1191	return;
				1192
				1193	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
				1194	margin_ns / 4, HRTIMER_MODE_ABS);
				1195	}
				1196
				1197	static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
				1198	{
				1199	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
				1200	struct ioc_now now;
				1201	unsigned long flags;
				1202
				1203	ioc_now(iocg->ioc, &now);
				1204
				1205	spin_lock_irqsave(&iocg->waitq.lock, flags);
				1206	iocg_kick_waitq(iocg, &now);
				1207	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
				1208
				1209	return HRTIMER_NORESTART;
				1210	}
				1211
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1212	static bool iocg_kick_delay(struct ioc_gq iocg, struct ioc_now now)
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1213	{
				1214	struct ioc *ioc = iocg->ioc;
				1215	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
				1216	u64 vtime = atomic64_read(&iocg->vtime);
				1217	u64 vmargin = ioc->margin_us * now->vrate;
				1218	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1219	u64 delta_ns, expires, oexpires;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1220	u32 hw_inuse;
				1221
				1222	/* debt-adjust vtime */
				1223	current_hweight(iocg, NULL, &hw_inuse);
				1224	vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1225
				1226	/* clear or maintain depending on the overage */
				1227	if (time_before_eq64(vtime, now->vnow)) {
				1228	blkcg_clear_delay(blkg);
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1229	return false;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1230	}
				1231	if (!atomic_read(&blkg->use_delay) &&
				1232	time_before_eq64(vtime, now->vnow + vmargin))
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1233	return false;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1234
				1235	/* use delay */
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1236	delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
				1237	now->vrate) * NSEC_PER_USEC;
				1238	blkcg_set_delay(blkg, delta_ns);
				1239	expires = now->now_ns + delta_ns;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1240
				1241	/* if already active and close enough, don't bother */
				1242	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
				1243	if (hrtimer_is_queued(&iocg->delay_timer) &&
				1244	abs(oexpires - expires) <= margin_ns / 4)
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1245	return true;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1246
				1247	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
				1248	margin_ns / 4, HRTIMER_MODE_ABS);
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1249	return true;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1250	}
				1251
				1252	static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
				1253	{
				1254	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
				1255	struct ioc_now now;
				1256
				1257	ioc_now(iocg->ioc, &now);
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1258	iocg_kick_delay(iocg, &now);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1259
				1260	return HRTIMER_NORESTART;
				1261	}
				1262
				1263	static void ioc_lat_stat(struct ioc ioc, u32 missed_ppm_ar, u32 *rq_wait_pct_p)
				1264	{
				1265	u32 nr_met[2] = { };
				1266	u32 nr_missed[2] = { };
				1267	u64 rq_wait_ns = 0;
				1268	int cpu, rw;
				1269
				1270	for_each_online_cpu(cpu) {
				1271	struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
				1272	u64 this_rq_wait_ns;
				1273
				1274	for (rw = READ; rw <= WRITE; rw++) {
				1275	u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
				1276	u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
				1277
				1278	nr_met[rw] += this_met - stat->missed[rw].last_met;
				1279	nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
				1280	stat->missed[rw].last_met = this_met;
				1281	stat->missed[rw].last_missed = this_missed;
				1282	}
				1283
				1284	this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
				1285	rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
				1286	stat->last_rq_wait_ns = this_rq_wait_ns;
				1287	}
				1288
				1289	for (rw = READ; rw <= WRITE; rw++) {
				1290	if (nr_met[rw] + nr_missed[rw])
				1291	missed_ppm_ar[rw] =
				1292	DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
				1293	nr_met[rw] + nr_missed[rw]);
				1294	else
				1295	missed_ppm_ar[rw] = 0;
				1296	}
				1297
				1298	rq_wait_pct_p = div64_u64(rq_wait_ns 100,
				1299	ioc->period_us * NSEC_PER_USEC);
				1300	}
				1301
				1302	/* was iocg idle this period? */
				1303	static bool iocg_is_idle(struct ioc_gq *iocg)
				1304	{
				1305	struct ioc *ioc = iocg->ioc;
				1306
				1307	/* did something get issued this period? */
				1308	if (atomic64_read(&iocg->active_period) ==
				1309	atomic64_read(&ioc->cur_period))
				1310	return false;
				1311
				1312	/* is something in flight? */
Tejun Heo	dcd6589	2020-03-10 13:07:46 -0400	[diff] [blame]	1313	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1314	return false;
				1315
				1316	return true;
				1317	}
				1318
				1319	/* returns usage with margin added if surplus is large enough */
				1320	static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
				1321	{
				1322	/* add margin */
				1323	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
				1324	usage += SURPLUS_SCALE_ABS;
				1325
				1326	/* don't bother if the surplus is too small */
				1327	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
				1328	return 0;
				1329
				1330	return usage;
				1331	}
				1332
				1333	static void ioc_timer_fn(struct timer_list *timer)
				1334	{
				1335	struct ioc *ioc = container_of(timer, struct ioc, timer);
				1336	struct ioc_gq iocg, tiocg;
				1337	struct ioc_now now;
				1338	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
				1339	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
				1340	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
				1341	u32 missed_ppm[2], rq_wait_pct;
				1342	u64 period_vtime;
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1343	int prev_busy_level, i;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1344
				1345	/* how were the latencies during the period? */
				1346	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
				1347
				1348	/* take care of active iocgs */
				1349	spin_lock_irq(&ioc->lock);
				1350
				1351	ioc_now(ioc, &now);
				1352
				1353	period_vtime = now.vnow - ioc->period_at_vtime;
				1354	if (WARN_ON_ONCE(!period_vtime)) {
				1355	spin_unlock_irq(&ioc->lock);
				1356	return;
				1357	}
				1358
				1359	/*
				1360	* Waiters determine the sleep durations based on the vrate they
				1361	* saw at the time of sleep. If vrate has increased, some waiters
				1362	* could be sleeping for too long. Wake up tardy waiters which
				1363	* should have woken up in the last period and expire idle iocgs.
				1364	*/
				1365	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1366	if (!waitqueue_active(&iocg->waitq) &&
				1367	!atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1368	continue;
				1369
				1370	spin_lock(&iocg->waitq.lock);
				1371
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1372	if (waitqueue_active(&iocg->waitq) \|\|
				1373	atomic64_read(&iocg->abs_vdebt)) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1374	/* might be oversleeping vtime / hweight changes, kick */
				1375	iocg_kick_waitq(iocg, &now);
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1376	iocg_kick_delay(iocg, &now);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1377	} else if (iocg_is_idle(iocg)) {
				1378	/* no waiter and idle, deactivate */
				1379	iocg->last_inuse = iocg->inuse;
				1380	__propagate_active_weight(iocg, 0, 0);
				1381	list_del_init(&iocg->active_list);
				1382	}
				1383
				1384	spin_unlock(&iocg->waitq.lock);
				1385	}
				1386	commit_active_weights(ioc);
				1387
				1388	/* calc usages and see whether some weights need to be moved around */
				1389	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
				1390	u64 vdone, vtime, vusage, vmargin, vmin;
				1391	u32 hw_active, hw_inuse, usage;
				1392
				1393	/*
				1394	* Collect unused and wind vtime closer to vnow to prevent
				1395	* iocgs from accumulating a large amount of budget.
				1396	*/
				1397	vdone = atomic64_read(&iocg->done_vtime);
				1398	vtime = atomic64_read(&iocg->vtime);
				1399	current_hweight(iocg, &hw_active, &hw_inuse);
				1400
				1401	/*
				1402	* Latency QoS detection doesn't account for IOs which are
				1403	* in-flight for longer than a period. Detect them by
				1404	* comparing vdone against period start. If lagging behind
				1405	* IOs from past periods, don't increase vrate.
				1406	*/
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1407	if ((ppm_rthr != MILLION \|\| ppm_wthr != MILLION) &&
				1408	!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1409	time_after64(vtime, vdone) &&
				1410	time_after64(vtime, now.vnow -
				1411	MAX_LAGGING_PERIODS * period_vtime) &&
				1412	time_before64(vdone, now.vnow - period_vtime))
				1413	nr_lagging++;
				1414
				1415	if (waitqueue_active(&iocg->waitq))
				1416	vusage = now.vnow - iocg->last_vtime;
				1417	else if (time_before64(iocg->last_vtime, vtime))
				1418	vusage = vtime - iocg->last_vtime;
				1419	else
				1420	vusage = 0;
				1421
				1422	iocg->last_vtime += vusage;
				1423	/*
				1424	* Factor in in-flight vtime into vusage to avoid
				1425	* high-latency completions appearing as idle. This should
				1426	* be done after the above ->last_time adjustment.
				1427	*/
				1428	vusage = max(vusage, vtime - vdone);
				1429
				1430	/* calculate hweight based usage ratio and record */
				1431	if (vusage) {
				1432	usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
				1433	period_vtime);
				1434	iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
				1435	iocg->usages[iocg->usage_idx] = usage;
				1436	} else {
				1437	usage = 0;
				1438	}
				1439
				1440	/* see whether there's surplus vtime */
				1441	vmargin = ioc->margin_us * now.vrate;
				1442	vmin = now.vnow - vmargin;
				1443
				1444	iocg->has_surplus = false;
				1445
				1446	if (!waitqueue_active(&iocg->waitq) &&
				1447	time_before64(vtime, vmin)) {
				1448	u64 delta = vmin - vtime;
				1449
				1450	/* throw away surplus vtime */
				1451	atomic64_add(delta, &iocg->vtime);
				1452	atomic64_add(delta, &iocg->done_vtime);
				1453	iocg->last_vtime += delta;
				1454	/* if usage is sufficiently low, maybe it can donate */
				1455	if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
				1456	iocg->has_surplus = true;
				1457	nr_surpluses++;
				1458	}
				1459	} else if (hw_inuse < hw_active) {
				1460	u32 new_hwi, new_inuse;
				1461
				1462	/* was donating but might need to take back some */
				1463	if (waitqueue_active(&iocg->waitq)) {
				1464	new_hwi = hw_active;
				1465	} else {
				1466	new_hwi = max(hw_inuse,
				1467	usage * SURPLUS_SCALE_PCT / 100 +
				1468	SURPLUS_SCALE_ABS);
				1469	}
				1470
				1471	new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
				1472	hw_inuse);
				1473	new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
				1474
				1475	if (new_inuse > iocg->inuse) {
				1476	TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
				1477	iocg->inuse, new_inuse,
				1478	hw_inuse, new_hwi);
				1479	__propagate_active_weight(iocg, iocg->weight,
				1480	new_inuse);
				1481	}
				1482	} else {
				1483	/* genuninely out of vtime */
				1484	nr_shortages++;
				1485	}
				1486	}
				1487
				1488	if (!nr_shortages \|\| !nr_surpluses)
				1489	goto skip_surplus_transfers;
				1490
				1491	/* there are both shortages and surpluses, transfer surpluses */
				1492	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
				1493	u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
				1494	int nr_valid = 0;
				1495
				1496	if (!iocg->has_surplus)
				1497	continue;
				1498
				1499	/* base the decision on max historical usage */
				1500	for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
				1501	if (iocg->usages[i]) {
				1502	usage = max(usage, iocg->usages[i]);
				1503	nr_valid++;
				1504	}
				1505	}
				1506	if (nr_valid < MIN_VALID_USAGES)
				1507	continue;
				1508
				1509	current_hweight(iocg, &hw_active, &hw_inuse);
				1510	new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
				1511	if (!new_hwi)
				1512	continue;
				1513
				1514	new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
				1515	hw_inuse);
				1516	if (new_inuse < iocg->inuse) {
				1517	TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
				1518	iocg->inuse, new_inuse,
				1519	hw_inuse, new_hwi);
				1520	__propagate_active_weight(iocg, iocg->weight, new_inuse);
				1521	}
				1522	}
				1523	skip_surplus_transfers:
				1524	commit_active_weights(ioc);
				1525
				1526	/*
				1527	* If q is getting clogged or we're missing too much, we're issuing
				1528	* too much IO and should lower vtime rate. If we're not missing
				1529	* and experiencing shortages but not surpluses, we're too stingy
				1530	* and should increase vtime rate.
				1531	*/
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1532	prev_busy_level = ioc->busy_level;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1533	if (rq_wait_pct > RQ_WAIT_BUSY_PCT \|\|
				1534	missed_ppm[READ] > ppm_rthr \|\|
				1535	missed_ppm[WRITE] > ppm_wthr) {
				1536	ioc->busy_level = max(ioc->busy_level, 0);
				1537	ioc->busy_level++;
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1538	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1539	missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
				1540	missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1541	/* take action iff there is contention */
				1542	if (nr_shortages && !nr_lagging) {
				1543	ioc->busy_level = min(ioc->busy_level, 0);
				1544	/* redistribute surpluses first */
				1545	if (!nr_surpluses)
				1546	ioc->busy_level--;
				1547	}
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1548	} else {
				1549	ioc->busy_level = 0;
				1550	}
				1551
				1552	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
				1553
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1554	if (ioc->busy_level > 0 \|\| (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1555	u64 vrate = atomic64_read(&ioc->vtime_rate);
				1556	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
				1557
				1558	/* rq_wait signal is always reliable, ignore user vrate_min */
				1559	if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
				1560	vrate_min = VRATE_MIN;
				1561
				1562	/*
				1563	* If vrate is out of bounds, apply clamp gradually as the
				1564	* bounds can change abruptly. Otherwise, apply busy_level
				1565	* based adjustment.
				1566	*/
				1567	if (vrate < vrate_min) {
				1568	vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
				1569	100);
				1570	vrate = min(vrate, vrate_min);
				1571	} else if (vrate > vrate_max) {
				1572	vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
				1573	100);
				1574	vrate = max(vrate, vrate_max);
				1575	} else {
				1576	int idx = min_t(int, abs(ioc->busy_level),
				1577	ARRAY_SIZE(vrate_adj_pct) - 1);
				1578	u32 adj_pct = vrate_adj_pct[idx];
				1579
				1580	if (ioc->busy_level > 0)
				1581	adj_pct = 100 - adj_pct;
				1582	else
				1583	adj_pct = 100 + adj_pct;
				1584
				1585	vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
				1586	vrate_min, vrate_max);
				1587	}
				1588
				1589	trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
				1590	nr_lagging, nr_shortages,
				1591	nr_surpluses);
				1592
				1593	atomic64_set(&ioc->vtime_rate, vrate);
				1594	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
				1595	ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1596	} else if (ioc->busy_level != prev_busy_level \|\| nr_lagging) {
				1597	trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
				1598	&missed_ppm, rq_wait_pct, nr_lagging,
				1599	nr_shortages, nr_surpluses);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1600	}
				1601
				1602	ioc_refresh_params(ioc, false);
				1603
				1604	/*
				1605	* This period is done. Move onto the next one. If nothing's
				1606	* going on with the device, stop the timer.
				1607	*/
				1608	atomic64_inc(&ioc->cur_period);
				1609
				1610	if (ioc->running != IOC_STOP) {
				1611	if (!list_empty(&ioc->active_iocgs)) {
				1612	ioc_start_period(ioc, &now);
				1613	} else {
				1614	ioc->busy_level = 0;
				1615	ioc->running = IOC_IDLE;
				1616	}
				1617	}
				1618
				1619	spin_unlock_irq(&ioc->lock);
				1620	}
				1621
				1622	static void calc_vtime_cost_builtin(struct bio bio, struct ioc_gq iocg,
				1623	bool is_merge, u64 *costp)
				1624	{
				1625	struct ioc *ioc = iocg->ioc;
				1626	u64 coef_seqio, coef_randio, coef_page;
				1627	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
				1628	u64 seek_pages = 0;
				1629	u64 cost = 0;
				1630
				1631	switch (bio_op(bio)) {
				1632	case REQ_OP_READ:
				1633	coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
				1634	coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
				1635	coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
				1636	break;
				1637	case REQ_OP_WRITE:
				1638	coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
				1639	coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
				1640	coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
				1641	break;
				1642	default:
				1643	goto out;
				1644	}
				1645
				1646	if (iocg->cursor) {
				1647	seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
				1648	seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
				1649	}
				1650
				1651	if (!is_merge) {
				1652	if (seek_pages > LCOEF_RANDIO_PAGES) {
				1653	cost += coef_randio;
				1654	} else {
				1655	cost += coef_seqio;
				1656	}
				1657	}
				1658	cost += pages * coef_page;
				1659	out:
				1660	*costp = cost;
				1661	}
				1662
				1663	static u64 calc_vtime_cost(struct bio bio, struct ioc_gq iocg, bool is_merge)
				1664	{
				1665	u64 cost;
				1666
				1667	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
				1668	return cost;
				1669	}
				1670
				1671	static void ioc_rqos_throttle(struct rq_qos rqos, struct bio bio)
				1672	{
				1673	struct blkcg_gq *blkg = bio->bi_blkg;
				1674	struct ioc *ioc = rqos_to_ioc(rqos);
				1675	struct ioc_gq *iocg = blkg_to_iocg(blkg);
				1676	struct ioc_now now;
				1677	struct iocg_wait wait;
				1678	u32 hw_active, hw_inuse;
				1679	u64 abs_cost, cost, vtime;
				1680
				1681	/* bypass IOs if disabled or for root cgroup */
				1682	if (!ioc->enabled \|\| !iocg->level)
				1683	return;
				1684
				1685	/* always activate so that even 0 cost IOs get protected to some level */
				1686	if (!iocg_activate(iocg, &now))
				1687	return;
				1688
				1689	/* calculate the absolute vtime cost */
				1690	abs_cost = calc_vtime_cost(bio, iocg, false);
				1691	if (!abs_cost)
				1692	return;
				1693
				1694	iocg->cursor = bio_end_sector(bio);
				1695
				1696	vtime = atomic64_read(&iocg->vtime);
				1697	current_hweight(iocg, &hw_active, &hw_inuse);
				1698
				1699	if (hw_inuse < hw_active &&
				1700	time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
				1701	TRACE_IOCG_PATH(inuse_reset, iocg, &now,
				1702	iocg->inuse, iocg->weight, hw_inuse, hw_active);
				1703	spin_lock_irq(&ioc->lock);
				1704	propagate_active_weight(iocg, iocg->weight, iocg->weight);
				1705	spin_unlock_irq(&ioc->lock);
				1706	current_hweight(iocg, &hw_active, &hw_inuse);
				1707	}
				1708
				1709	cost = abs_cost_to_cost(abs_cost, hw_inuse);
				1710
				1711	/*
				1712	* If no one's waiting and within budget, issue right away. The
				1713	* tests are racy but the races aren't systemic - we only miss once
				1714	* in a while which is fine.
				1715	*/
				1716	if (!waitqueue_active(&iocg->waitq) &&
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1717	!atomic64_read(&iocg->abs_vdebt) &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1718	time_before_eq64(vtime + cost, now.vnow)) {
				1719	iocg_commit_bio(iocg, bio, cost);
				1720	return;
				1721	}
				1722
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1723	/*
				1724	* We're over budget. If @bio has to be issued regardless,
				1725	* remember the abs_cost instead of advancing vtime.
				1726	* iocg_kick_waitq() will pay off the debt before waking more IOs.
				1727	* This way, the debt is continuously paid off each period with the
				1728	* actual budget available to the cgroup. If we just wound vtime,
				1729	* we would incorrectly use the current hw_inuse for the entire
				1730	* amount which, for example, can lead to the cgroup staying
				1731	* blocked for a long time even with substantially raised hw_inuse.
				1732	*/
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1733	if (bio_issue_as_root_blkg(bio) \|\| fatal_signal_pending(current)) {
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1734	atomic64_add(abs_cost, &iocg->abs_vdebt);
Tejun Heo	54c52e1	2020-04-13 12:27:55 -0400	[diff] [blame^]	1735	if (iocg_kick_delay(iocg, &now))
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1736	blkcg_schedule_throttle(rqos->q,
				1737	(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1738	return;
				1739	}
				1740
				1741	/*
				1742	* Append self to the waitq and schedule the wakeup timer if we're
				1743	* the first waiter. The timer duration is calculated based on the
				1744	* current vrate. vtime and hweight changes can make it too short
				1745	* or too long. Each wait entry records the absolute cost it's
				1746	* waiting for to allow re-evaluation using a custom wait entry.
				1747	*
				1748	* If too short, the timer simply reschedules itself. If too long,
				1749	* the period timer will notice and trigger wakeups.
				1750	*
				1751	* All waiters are on iocg->waitq and the wait states are
				1752	* synchronized using waitq.lock.
				1753	*/
				1754	spin_lock_irq(&iocg->waitq.lock);
				1755
				1756	/*
				1757	* We activated above but w/o any synchronization. Deactivation is
				1758	* synchronized with waitq.lock and we won't get deactivated as
				1759	* long as we're waiting, so we're good if we're activated here.
				1760	* In the unlikely case that we are deactivated, just issue the IO.
				1761	*/
				1762	if (unlikely(list_empty(&iocg->active_list))) {
				1763	spin_unlock_irq(&iocg->waitq.lock);
				1764	iocg_commit_bio(iocg, bio, cost);
				1765	return;
				1766	}
				1767
				1768	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
				1769	wait.wait.private = current;
				1770	wait.bio = bio;
				1771	wait.abs_cost = abs_cost;
				1772	wait.committed = false; /* will be set true by waker */
				1773
				1774	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
				1775	iocg_kick_waitq(iocg, &now);
				1776
				1777	spin_unlock_irq(&iocg->waitq.lock);
				1778
				1779	while (true) {
				1780	set_current_state(TASK_UNINTERRUPTIBLE);
				1781	if (wait.committed)
				1782	break;
				1783	io_schedule();
				1784	}
				1785
				1786	/* waker already committed us, proceed */
				1787	finish_wait(&iocg->waitq, &wait.wait);
				1788	}
				1789
				1790	static void ioc_rqos_merge(struct rq_qos rqos, struct request rq,
				1791	struct bio *bio)
				1792	{
				1793	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1794	struct ioc *ioc = iocg->ioc;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1795	sector_t bio_end = bio_end_sector(bio);
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1796	struct ioc_now now;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1797	u32 hw_inuse;
				1798	u64 abs_cost, cost;
				1799
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1800	/* bypass if disabled or for root cgroup */
				1801	if (!ioc->enabled \|\| !iocg->level)
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1802	return;
				1803
				1804	abs_cost = calc_vtime_cost(bio, iocg, true);
				1805	if (!abs_cost)
				1806	return;
				1807
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1808	ioc_now(ioc, &now);
				1809	current_hweight(iocg, NULL, &hw_inuse);
				1810	cost = abs_cost_to_cost(abs_cost, hw_inuse);
				1811
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1812	/* update cursor if backmerging into the request at the cursor */
				1813	if (blk_rq_pos(rq) < bio_end &&
				1814	blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
				1815	iocg->cursor = bio_end;
				1816
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1817	/*
				1818	* Charge if there's enough vtime budget and the existing request
				1819	* has cost assigned. Otherwise, account it as debt. See debt
				1820	* handling in ioc_rqos_throttle() for details.
				1821	*/
				1822	if (rq->bio && rq->bio->bi_iocost_cost &&
				1823	time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
				1824	iocg_commit_bio(iocg, bio, cost);
				1825	else
				1826	atomic64_add(abs_cost, &iocg->abs_vdebt);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1827	}
				1828
				1829	static void ioc_rqos_done_bio(struct rq_qos rqos, struct bio bio)
				1830	{
				1831	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
				1832
				1833	if (iocg && bio->bi_iocost_cost)
				1834	atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
				1835	}
				1836
				1837	static void ioc_rqos_done(struct rq_qos rqos, struct request rq)
				1838	{
				1839	struct ioc *ioc = rqos_to_ioc(rqos);
				1840	u64 on_q_ns, rq_wait_ns;
				1841	int pidx, rw;
				1842
				1843	if (!ioc->enabled \|\| !rq->alloc_time_ns \|\| !rq->start_time_ns)
				1844	return;
				1845
				1846	switch (req_op(rq) & REQ_OP_MASK) {
				1847	case REQ_OP_READ:
				1848	pidx = QOS_RLAT;
				1849	rw = READ;
				1850	break;
				1851	case REQ_OP_WRITE:
				1852	pidx = QOS_WLAT;
				1853	rw = WRITE;
				1854	break;
				1855	default:
				1856	return;
				1857	}
				1858
				1859	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
				1860	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
				1861
				1862	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
				1863	this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
				1864	else
				1865	this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
				1866
				1867	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
				1868	}
				1869
				1870	static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
				1871	{
				1872	struct ioc *ioc = rqos_to_ioc(rqos);
				1873
				1874	spin_lock_irq(&ioc->lock);
				1875	ioc_refresh_params(ioc, false);
				1876	spin_unlock_irq(&ioc->lock);
				1877	}
				1878
				1879	static void ioc_rqos_exit(struct rq_qos *rqos)
				1880	{
				1881	struct ioc *ioc = rqos_to_ioc(rqos);
				1882
				1883	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
				1884
				1885	spin_lock_irq(&ioc->lock);
				1886	ioc->running = IOC_STOP;
				1887	spin_unlock_irq(&ioc->lock);
				1888
				1889	del_timer_sync(&ioc->timer);
				1890	free_percpu(ioc->pcpu_stat);
				1891	kfree(ioc);
				1892	}
				1893
				1894	static struct rq_qos_ops ioc_rqos_ops = {
				1895	.throttle = ioc_rqos_throttle,
				1896	.merge = ioc_rqos_merge,
				1897	.done_bio = ioc_rqos_done_bio,
				1898	.done = ioc_rqos_done,
				1899	.queue_depth_changed = ioc_rqos_queue_depth_changed,
				1900	.exit = ioc_rqos_exit,
				1901	};
				1902
				1903	static int blk_iocost_init(struct request_queue *q)
				1904	{
				1905	struct ioc *ioc;
				1906	struct rq_qos *rqos;
				1907	int ret;
				1908
				1909	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
				1910	if (!ioc)
				1911	return -ENOMEM;
				1912
				1913	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
				1914	if (!ioc->pcpu_stat) {
				1915	kfree(ioc);
				1916	return -ENOMEM;
				1917	}
				1918
				1919	rqos = &ioc->rqos;
				1920	rqos->id = RQ_QOS_COST;
				1921	rqos->ops = &ioc_rqos_ops;
				1922	rqos->q = q;
				1923
				1924	spin_lock_init(&ioc->lock);
				1925	timer_setup(&ioc->timer, ioc_timer_fn, 0);
				1926	INIT_LIST_HEAD(&ioc->active_iocgs);
				1927
				1928	ioc->running = IOC_IDLE;
				1929	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
				1930	seqcount_init(&ioc->period_seqcount);
				1931	ioc->period_at = ktime_to_us(ktime_get());
				1932	atomic64_set(&ioc->cur_period, 0);
				1933	atomic_set(&ioc->hweight_gen, 0);
				1934
				1935	spin_lock_irq(&ioc->lock);
				1936	ioc->autop_idx = AUTOP_INVALID;
				1937	ioc_refresh_params(ioc, true);
				1938	spin_unlock_irq(&ioc->lock);
				1939
				1940	rq_qos_add(q, rqos);
				1941	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
				1942	if (ret) {
				1943	rq_qos_del(q, rqos);
Tejun Heo	3532e72	2019-08-29 08:53:06 -0700	[diff] [blame]	1944	free_percpu(ioc->pcpu_stat);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1945	kfree(ioc);
				1946	return ret;
				1947	}
				1948	return 0;
				1949	}
				1950
				1951	static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
				1952	{
				1953	struct ioc_cgrp *iocc;
				1954
				1955	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heo	e916ad2	2019-08-30 06:10:58 -0700	[diff] [blame]	1956	if (!iocc)
				1957	return NULL;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1958
Tejun Heo	e916ad2	2019-08-30 06:10:58 -0700	[diff] [blame]	1959	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1960	return &iocc->cpd;
				1961	}
				1962
				1963	static void ioc_cpd_free(struct blkcg_policy_data *cpd)
				1964	{
				1965	kfree(container_of(cpd, struct ioc_cgrp, cpd));
				1966	}
				1967
				1968	static struct blkg_policy_data ioc_pd_alloc(gfp_t gfp, struct request_queue q,
				1969	struct blkcg *blkcg)
				1970	{
				1971	int levels = blkcg->css.cgroup->level + 1;
				1972	struct ioc_gq *iocg;
				1973
				1974	iocg = kzalloc_node(sizeof(iocg) + levels sizeof(iocg->ancestors[0]),
				1975	gfp, q->node);
				1976	if (!iocg)
				1977	return NULL;
				1978
				1979	return &iocg->pd;
				1980	}
				1981
				1982	static void ioc_pd_init(struct blkg_policy_data *pd)
				1983	{
				1984	struct ioc_gq *iocg = pd_to_iocg(pd);
				1985	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
				1986	struct ioc *ioc = q_to_ioc(blkg->q);
				1987	struct ioc_now now;
				1988	struct blkcg_gq *tblkg;
				1989	unsigned long flags;
				1990
				1991	ioc_now(ioc, &now);
				1992
				1993	iocg->ioc = ioc;
				1994	atomic64_set(&iocg->vtime, now.vnow);
				1995	atomic64_set(&iocg->done_vtime, now.vnow);
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1996	atomic64_set(&iocg->abs_vdebt, 0);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1997	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
				1998	INIT_LIST_HEAD(&iocg->active_list);
				1999	iocg->hweight_active = HWEIGHT_WHOLE;
				2000	iocg->hweight_inuse = HWEIGHT_WHOLE;
				2001
				2002	init_waitqueue_head(&iocg->waitq);
				2003	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
				2004	iocg->waitq_timer.function = iocg_waitq_timer_fn;
				2005	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
				2006	iocg->delay_timer.function = iocg_delay_timer_fn;
				2007
				2008	iocg->level = blkg->blkcg->css.cgroup->level;
				2009
				2010	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
				2011	struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
				2012	iocg->ancestors[tiocg->level] = tiocg;
				2013	}
				2014
				2015	spin_lock_irqsave(&ioc->lock, flags);
				2016	weight_updated(iocg);
				2017	spin_unlock_irqrestore(&ioc->lock, flags);
				2018	}
				2019
				2020	static void ioc_pd_free(struct blkg_policy_data *pd)
				2021	{
				2022	struct ioc_gq *iocg = pd_to_iocg(pd);
				2023	struct ioc *ioc = iocg->ioc;
				2024
				2025	if (ioc) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2026	spin_lock(&ioc->lock);
				2027	if (!list_empty(&iocg->active_list)) {
				2028	propagate_active_weight(iocg, 0, 0);
				2029	list_del_init(&iocg->active_list);
				2030	}
				2031	spin_unlock(&ioc->lock);
Tejun Heo	e036c4c	2019-09-10 09:15:25 -0700	[diff] [blame]	2032
				2033	hrtimer_cancel(&iocg->waitq_timer);
				2034	hrtimer_cancel(&iocg->delay_timer);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2035	}
				2036	kfree(iocg);
				2037	}
				2038
				2039	static u64 ioc_weight_prfill(struct seq_file sf, struct blkg_policy_data pd,
				2040	int off)
				2041	{
				2042	const char *dname = blkg_dev_name(pd->blkg);
				2043	struct ioc_gq *iocg = pd_to_iocg(pd);
				2044
				2045	if (dname && iocg->cfg_weight)
				2046	seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
				2047	return 0;
				2048	}
				2049
				2050
				2051	static int ioc_weight_show(struct seq_file sf, void v)
				2052	{
				2053	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2054	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
				2055
				2056	seq_printf(sf, "default %u\n", iocc->dfl_weight);
				2057	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
				2058	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2059	return 0;
				2060	}
				2061
				2062	static ssize_t ioc_weight_write(struct kernfs_open_file of, char buf,
				2063	size_t nbytes, loff_t off)
				2064	{
				2065	struct blkcg *blkcg = css_to_blkcg(of_css(of));
				2066	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
				2067	struct blkg_conf_ctx ctx;
				2068	struct ioc_gq *iocg;
				2069	u32 v;
				2070	int ret;
				2071
				2072	if (!strchr(buf, ':')) {
				2073	struct blkcg_gq *blkg;
				2074
				2075	if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
				2076	return -EINVAL;
				2077
				2078	if (v < CGROUP_WEIGHT_MIN \|\| v > CGROUP_WEIGHT_MAX)
				2079	return -EINVAL;
				2080
				2081	spin_lock(&blkcg->lock);
				2082	iocc->dfl_weight = v;
				2083	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				2084	struct ioc_gq *iocg = blkg_to_iocg(blkg);
				2085
				2086	if (iocg) {
				2087	spin_lock_irq(&iocg->ioc->lock);
				2088	weight_updated(iocg);
				2089	spin_unlock_irq(&iocg->ioc->lock);
				2090	}
				2091	}
				2092	spin_unlock(&blkcg->lock);
				2093
				2094	return nbytes;
				2095	}
				2096
				2097	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
				2098	if (ret)
				2099	return ret;
				2100
				2101	iocg = blkg_to_iocg(ctx.blkg);
				2102
				2103	if (!strncmp(ctx.body, "default", 7)) {
				2104	v = 0;
				2105	} else {
				2106	if (!sscanf(ctx.body, "%u", &v))
				2107	goto einval;
				2108	if (v < CGROUP_WEIGHT_MIN \|\| v > CGROUP_WEIGHT_MAX)
				2109	goto einval;
				2110	}
				2111
Dan Carpenter	41591a5	2019-10-31 13:53:41 +0300	[diff] [blame]	2112	spin_lock(&iocg->ioc->lock);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2113	iocg->cfg_weight = v;
				2114	weight_updated(iocg);
Dan Carpenter	41591a5	2019-10-31 13:53:41 +0300	[diff] [blame]	2115	spin_unlock(&iocg->ioc->lock);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2116
				2117	blkg_conf_finish(&ctx);
				2118	return nbytes;
				2119
				2120	einval:
				2121	blkg_conf_finish(&ctx);
				2122	return -EINVAL;
				2123	}
				2124
				2125	static u64 ioc_qos_prfill(struct seq_file sf, struct blkg_policy_data pd,
				2126	int off)
				2127	{
				2128	const char *dname = blkg_dev_name(pd->blkg);
				2129	struct ioc *ioc = pd_to_iocg(pd)->ioc;
				2130
				2131	if (!dname)
				2132	return 0;
				2133
				2134	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
				2135	dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
				2136	ioc->params.qos[QOS_RPPM] / 10000,
				2137	ioc->params.qos[QOS_RPPM] % 10000 / 100,
				2138	ioc->params.qos[QOS_RLAT],
				2139	ioc->params.qos[QOS_WPPM] / 10000,
				2140	ioc->params.qos[QOS_WPPM] % 10000 / 100,
				2141	ioc->params.qos[QOS_WLAT],
				2142	ioc->params.qos[QOS_MIN] / 10000,
				2143	ioc->params.qos[QOS_MIN] % 10000 / 100,
				2144	ioc->params.qos[QOS_MAX] / 10000,
				2145	ioc->params.qos[QOS_MAX] % 10000 / 100);
				2146	return 0;
				2147	}
				2148
				2149	static int ioc_qos_show(struct seq_file sf, void v)
				2150	{
				2151	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2152
				2153	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
				2154	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2155	return 0;
				2156	}
				2157
				2158	static const match_table_t qos_ctrl_tokens = {
				2159	{ QOS_ENABLE, "enable=%u" },
				2160	{ QOS_CTRL, "ctrl=%s" },
				2161	{ NR_QOS_CTRL_PARAMS, NULL },
				2162	};
				2163
				2164	static const match_table_t qos_tokens = {
				2165	{ QOS_RPPM, "rpct=%s" },
				2166	{ QOS_RLAT, "rlat=%u" },
				2167	{ QOS_WPPM, "wpct=%s" },
				2168	{ QOS_WLAT, "wlat=%u" },
				2169	{ QOS_MIN, "min=%s" },
				2170	{ QOS_MAX, "max=%s" },
				2171	{ NR_QOS_PARAMS, NULL },
				2172	};
				2173
				2174	static ssize_t ioc_qos_write(struct kernfs_open_file of, char input,
				2175	size_t nbytes, loff_t off)
				2176	{
				2177	struct gendisk *disk;
				2178	struct ioc *ioc;
				2179	u32 qos[NR_QOS_PARAMS];
				2180	bool enable, user;
				2181	char *p;
				2182	int ret;
				2183
				2184	disk = blkcg_conf_get_disk(&input);
				2185	if (IS_ERR(disk))
				2186	return PTR_ERR(disk);
				2187
				2188	ioc = q_to_ioc(disk->queue);
				2189	if (!ioc) {
				2190	ret = blk_iocost_init(disk->queue);
				2191	if (ret)
				2192	goto err;
				2193	ioc = q_to_ioc(disk->queue);
				2194	}
				2195
				2196	spin_lock_irq(&ioc->lock);
				2197	memcpy(qos, ioc->params.qos, sizeof(qos));
				2198	enable = ioc->enabled;
				2199	user = ioc->user_qos_params;
				2200	spin_unlock_irq(&ioc->lock);
				2201
				2202	while ((p = strsep(&input, " \t\n"))) {
				2203	substring_t args[MAX_OPT_ARGS];
				2204	char buf[32];
				2205	int tok;
				2206	s64 v;
				2207
				2208	if (!*p)
				2209	continue;
				2210
				2211	switch (match_token(p, qos_ctrl_tokens, args)) {
				2212	case QOS_ENABLE:
				2213	match_u64(&args[0], &v);
				2214	enable = v;
				2215	continue;
				2216	case QOS_CTRL:
				2217	match_strlcpy(buf, &args[0], sizeof(buf));
				2218	if (!strcmp(buf, "auto"))
				2219	user = false;
				2220	else if (!strcmp(buf, "user"))
				2221	user = true;
				2222	else
				2223	goto einval;
				2224	continue;
				2225	}
				2226
				2227	tok = match_token(p, qos_tokens, args);
				2228	switch (tok) {
				2229	case QOS_RPPM:
				2230	case QOS_WPPM:
				2231	if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
				2232	sizeof(buf))
				2233	goto einval;
				2234	if (cgroup_parse_float(buf, 2, &v))
				2235	goto einval;
				2236	if (v < 0 \|\| v > 10000)
				2237	goto einval;
				2238	qos[tok] = v * 100;
				2239	break;
				2240	case QOS_RLAT:
				2241	case QOS_WLAT:
				2242	if (match_u64(&args[0], &v))
				2243	goto einval;
				2244	qos[tok] = v;
				2245	break;
				2246	case QOS_MIN:
				2247	case QOS_MAX:
				2248	if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
				2249	sizeof(buf))
				2250	goto einval;
				2251	if (cgroup_parse_float(buf, 2, &v))
				2252	goto einval;
				2253	if (v < 0)
				2254	goto einval;
				2255	qos[tok] = clamp_t(s64, v * 100,
				2256	VRATE_MIN_PPM, VRATE_MAX_PPM);
				2257	break;
				2258	default:
				2259	goto einval;
				2260	}
				2261	user = true;
				2262	}
				2263
				2264	if (qos[QOS_MIN] > qos[QOS_MAX])
				2265	goto einval;
				2266
				2267	spin_lock_irq(&ioc->lock);
				2268
				2269	if (enable) {
				2270	blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
				2271	ioc->enabled = true;
				2272	} else {
				2273	blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
				2274	ioc->enabled = false;
				2275	}
				2276
				2277	if (user) {
				2278	memcpy(ioc->params.qos, qos, sizeof(qos));
				2279	ioc->user_qos_params = true;
				2280	} else {
				2281	ioc->user_qos_params = false;
				2282	}
				2283
				2284	ioc_refresh_params(ioc, true);
				2285	spin_unlock_irq(&ioc->lock);
				2286
				2287	put_disk_and_module(disk);
				2288	return nbytes;
				2289	einval:
				2290	ret = -EINVAL;
				2291	err:
				2292	put_disk_and_module(disk);
				2293	return ret;
				2294	}
				2295
				2296	static u64 ioc_cost_model_prfill(struct seq_file *sf,
				2297	struct blkg_policy_data *pd, int off)
				2298	{
				2299	const char *dname = blkg_dev_name(pd->blkg);
				2300	struct ioc *ioc = pd_to_iocg(pd)->ioc;
				2301	u64 *u = ioc->params.i_lcoefs;
				2302
				2303	if (!dname)
				2304	return 0;
				2305
				2306	seq_printf(sf, "%s ctrl=%s model=linear "
				2307	"rbps=%llu rseqiops=%llu rrandiops=%llu "
				2308	"wbps=%llu wseqiops=%llu wrandiops=%llu\n",
				2309	dname, ioc->user_cost_model ? "user" : "auto",
				2310	u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
				2311	u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
				2312	return 0;
				2313	}
				2314
				2315	static int ioc_cost_model_show(struct seq_file sf, void v)
				2316	{
				2317	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2318
				2319	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
				2320	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2321	return 0;
				2322	}
				2323
				2324	static const match_table_t cost_ctrl_tokens = {
				2325	{ COST_CTRL, "ctrl=%s" },
				2326	{ COST_MODEL, "model=%s" },
				2327	{ NR_COST_CTRL_PARAMS, NULL },
				2328	};
				2329
				2330	static const match_table_t i_lcoef_tokens = {
				2331	{ I_LCOEF_RBPS, "rbps=%u" },
				2332	{ I_LCOEF_RSEQIOPS, "rseqiops=%u" },
				2333	{ I_LCOEF_RRANDIOPS, "rrandiops=%u" },
				2334	{ I_LCOEF_WBPS, "wbps=%u" },
				2335	{ I_LCOEF_WSEQIOPS, "wseqiops=%u" },
				2336	{ I_LCOEF_WRANDIOPS, "wrandiops=%u" },
				2337	{ NR_I_LCOEFS, NULL },
				2338	};
				2339
				2340	static ssize_t ioc_cost_model_write(struct kernfs_open_file of, char input,
				2341	size_t nbytes, loff_t off)
				2342	{
				2343	struct gendisk *disk;
				2344	struct ioc *ioc;
				2345	u64 u[NR_I_LCOEFS];
				2346	bool user;
				2347	char *p;
				2348	int ret;
				2349
				2350	disk = blkcg_conf_get_disk(&input);
				2351	if (IS_ERR(disk))
				2352	return PTR_ERR(disk);
				2353
				2354	ioc = q_to_ioc(disk->queue);
				2355	if (!ioc) {
				2356	ret = blk_iocost_init(disk->queue);
				2357	if (ret)
				2358	goto err;
				2359	ioc = q_to_ioc(disk->queue);
				2360	}
				2361
				2362	spin_lock_irq(&ioc->lock);
				2363	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
				2364	user = ioc->user_cost_model;
				2365	spin_unlock_irq(&ioc->lock);
				2366
				2367	while ((p = strsep(&input, " \t\n"))) {
				2368	substring_t args[MAX_OPT_ARGS];
				2369	char buf[32];
				2370	int tok;
				2371	u64 v;
				2372
				2373	if (!*p)
				2374	continue;
				2375
				2376	switch (match_token(p, cost_ctrl_tokens, args)) {
				2377	case COST_CTRL:
				2378	match_strlcpy(buf, &args[0], sizeof(buf));
				2379	if (!strcmp(buf, "auto"))
				2380	user = false;
				2381	else if (!strcmp(buf, "user"))
				2382	user = true;
				2383	else
				2384	goto einval;
				2385	continue;
				2386	case COST_MODEL:
				2387	match_strlcpy(buf, &args[0], sizeof(buf));
				2388	if (strcmp(buf, "linear"))
				2389	goto einval;
				2390	continue;
				2391	}
				2392
				2393	tok = match_token(p, i_lcoef_tokens, args);
				2394	if (tok == NR_I_LCOEFS)
				2395	goto einval;
				2396	if (match_u64(&args[0], &v))
				2397	goto einval;
				2398	u[tok] = v;
				2399	user = true;
				2400	}
				2401
				2402	spin_lock_irq(&ioc->lock);
				2403	if (user) {
				2404	memcpy(ioc->params.i_lcoefs, u, sizeof(u));
				2405	ioc->user_cost_model = true;
				2406	} else {
				2407	ioc->user_cost_model = false;
				2408	}
				2409	ioc_refresh_params(ioc, true);
				2410	spin_unlock_irq(&ioc->lock);
				2411
				2412	put_disk_and_module(disk);
				2413	return nbytes;
				2414
				2415	einval:
				2416	ret = -EINVAL;
				2417	err:
				2418	put_disk_and_module(disk);
				2419	return ret;
				2420	}
				2421
				2422	static struct cftype ioc_files[] = {
				2423	{
				2424	.name = "weight",
				2425	.flags = CFTYPE_NOT_ON_ROOT,
				2426	.seq_show = ioc_weight_show,
				2427	.write = ioc_weight_write,
				2428	},
				2429	{
				2430	.name = "cost.qos",
				2431	.flags = CFTYPE_ONLY_ON_ROOT,
				2432	.seq_show = ioc_qos_show,
				2433	.write = ioc_qos_write,
				2434	},
				2435	{
				2436	.name = "cost.model",
				2437	.flags = CFTYPE_ONLY_ON_ROOT,
				2438	.seq_show = ioc_cost_model_show,
				2439	.write = ioc_cost_model_write,
				2440	},
				2441	{}
				2442	};
				2443
				2444	static struct blkcg_policy blkcg_policy_iocost = {
				2445	.dfl_cftypes = ioc_files,
				2446	.cpd_alloc_fn = ioc_cpd_alloc,
				2447	.cpd_free_fn = ioc_cpd_free,
				2448	.pd_alloc_fn = ioc_pd_alloc,
				2449	.pd_init_fn = ioc_pd_init,
				2450	.pd_free_fn = ioc_pd_free,
				2451	};
				2452
				2453	static int __init ioc_init(void)
				2454	{
				2455	return blkcg_policy_register(&blkcg_policy_iocost);
				2456	}
				2457
				2458	static void __exit ioc_exit(void)
				2459	{
				2460	return blkcg_policy_unregister(&blkcg_policy_iocost);
				2461	}
				2462
				2463	module_init(ioc_init);
				2464	module_exit(ioc_exit);