/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
42 * paramters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
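 *
 * For example (illustrative, using this file's SSD defaults and an
 * assumed MAJ:MIN of 8:16), a hand-tuned linear model could be installed
 * by writing a line like the following (as a single line) to
 * /sys/fs/cgroup/io.cost.model:
 *
 *  8:16 ctrl=user model=linear rbps=488636629 rseqiops=8932
 *      rrandiops=8518 wbps=427891549 wseqiops=28755 wrandiops=21940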
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
 * up to 1 (WEIGHT_ONE).
 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
 * Device busyness is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
 * are executed, solely depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
 * the higher the bandwidth loss. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
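 *
 * For example (illustrative syntax), a 95th percentile completion
 * latency target of 25ms for both reads and writes on device 8:16 could
 * be expressed by writing the following single line to
 * /sys/fs/cgroup/io.cost.qos:
 *
 *  8:16 enable=1 ctrl=user rpct=95.00 rlat=25000 wpct=95.00 wlat=25000
 *      min=50.00 max=150.00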
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
 * implemented by adjusting vrate dynamically. However, working out who
 * can donate how much and who should take back how much requires hweight
 * propagation anyway, so it is easier to implement and understand as a
 * separate mechanism.
 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn-based monitoring script -
 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
 * https://github.com/osandov/drgn. The output looks like the following.
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
 * - delay : Deferred issuer delay induction level and duration
172 * - usages : Usage history
 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
Tejun Heo5e124f72020-09-01 14:52:33 -0400182#include <asm/local.h>
183#include <asm/local64.h>
Tejun Heo7caa4712019-08-28 15:05:58 -0700184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
#else /* CONFIG_TRACEPOINTS */
#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
#endif /* CONFIG_TRACEPOINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
220 * A cgroup's vtime can run 50% behind the device vtime, which
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
224 MARGIN_PCT = 50,
225 INUSE_MARGIN_PCT = 10,
226
227 /* Have some play in waitq timer operations */
228 WAITQ_TIMER_MARGIN_PCT = 5,
229
230 /*
231 * vtime can wrap well within a reasonable uptime when vrate is
232 * consistently raised. Don't trust recorded cgroup vtime if the
233 * period counter indicates that it's older than 5mins.
234 */
235 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
236
237 /*
238 * Remember the past three non-zero usages and use the max for
239 * surplus calculation. Three slots guarantee that we remember one
240 * full period usage from the last active stretch even after
241 * partial deactivation and re-activation periods. Don't start
242 * giving away weight before collecting two data points to prevent
243 * hweight adjustments based on one partial activation period.
244 */
245 NR_USAGE_SLOTS = 3,
246 MIN_VALID_USAGES = 2,
247
248 /* 1/64k is granular enough and can easily be handled w/ u32 */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400249 WEIGHT_ONE = 1 << 16,
Tejun Heo7caa4712019-08-28 15:05:58 -0700250
251 /*
252 * As vtime is used to calculate the cost of each IO, it needs to
253 * be fairly high precision. For example, it should be able to
254 * represent the cost of a single page worth of discard with
	 * sufficient accuracy. At the same time, it should be able to
256 * represent reasonably long enough durations to be useful and
257 * convenient during operation.
258 *
259 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
260 * granularity and days of wrap-around time even at extreme vrates.
261 */
262 VTIME_PER_SEC_SHIFT = 37,
263 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
264 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
Tejun Heocd006502020-04-13 12:27:56 -0400265 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
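	/*
	 * Illustrative check of the above: 2^37 / 10^9 is ~137 vtime
	 * units per nanosecond, and at 100% vrate a u64 vtime wraps only
	 * after 2^(64 - 37) seconds, roughly four years.
	 */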
Tejun Heo7caa4712019-08-28 15:05:58 -0700266
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
276
	/* unbusy hysteresis */
278 UNBUSY_THR_PCT = 75,
279
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
282
283 /*
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
286 */
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400288 SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */
Tejun Heo7caa4712019-08-28 15:05:58 -0700290
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294 /*
295 * Count IO size in 4k pages. The 12bit shift helps keeping
296 * size-proportional components of cost calculation in closer
297 * numbers of digits to per-IO cost components.
298 */
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303 /* if apart further than 16M, consider randio for linear model */
304 LCOEF_RANDIO_PAGES = 4096,
305};
306
307enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311};
312
313/* io.cost.qos controls including per-dev enable of the whole controller */
314enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318};
319
320/* io.cost.qos params */
321enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329};
330
331/* io.cost.model controls */
332enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336};
337
338/* builtin linear cost model coefficients */
339enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347};
348
349enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357};
358
359enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365};
366
367struct ioc_gq;
368
369struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375};
376
377struct ioc_missed {
Tejun Heo5e124f72020-09-01 14:52:33 -0400378 local_t nr_met;
379 local_t nr_missed;
Tejun Heo7caa4712019-08-28 15:05:58 -0700380 u32 last_met;
381 u32 last_missed;
382};
383
384struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
386
Tejun Heo5e124f72020-09-01 14:52:33 -0400387 local64_t rq_wait_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700388 u64 last_rq_wait_ns;
389};
390
391/* per device */
392struct ioc {
393 struct rq_qos rqos;
394
395 bool enabled;
396
397 struct ioc_params params;
398 u32 period_us;
399 u32 margin_us;
400 u64 vrate_min;
401 u64 vrate_max;
402
403 spinlock_t lock;
404 struct timer_list timer;
405 struct list_head active_iocgs; /* active cgroups */
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
407
408 enum ioc_running running;
409 atomic64_t vtime_rate;
410
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +0200411 seqcount_spinlock_t period_seqcount;
Tejun Heoce955702020-09-01 14:52:40 -0400412 u64 period_at; /* wallclock starttime */
Tejun Heo7caa4712019-08-28 15:05:58 -0700413 u64 period_at_vtime; /* vtime starttime */
414
415 atomic64_t cur_period; /* inc'd each period */
416 int busy_level; /* saturation history */
417
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen; /* for lazy hweights */
421
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
424 int autop_idx;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
427};
428
429/* per device-cgroup pair */
430struct ioc_gq {
431 struct blkg_policy_data pd;
432 struct ioc *ioc;
433
434 /*
	 * An iocg can get its weight from two sources - an explicit
436 * per-device-cgroup configuration or the default weight of the
437 * cgroup. `cfg_weight` is the explicit per-device-cgroup
	 * configuration. `weight` is the effective weight considering
	 * both sources.
440 *
441 * When an idle cgroup becomes active its `active` goes from 0 to
442 * `weight`. `inuse` is the surplus adjusted active weight.
443 * `active` and `inuse` are used to calculate `hweight_active` and
444 * `hweight_inuse`.
445 *
446 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 * surplus adjustments.
448 */
449 u32 cfg_weight;
450 u32 weight;
451 u32 active;
452 u32 inuse;
453 u32 last_inuse;
454
455 sector_t cursor; /* to detect randio */
456
457 /*
458 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 * issued. If lagging behind device vtime, the delta represents
	 * the currently available IO budget. If running ahead, the
461 * overage.
462 *
463 * `vtime_done` is the same but progressed on completion rather
464 * than issue. The delta behind `vtime` represents the cost of
465 * currently in-flight IOs.
466 *
467 * `last_vtime` is used to remember `vtime` at the end of the last
468 * period to calculate utilization.
469 */
470 atomic64_t vtime;
471 atomic64_t done_vtime;
Tejun Heo0b80f982020-05-04 19:27:54 -0400472 u64 abs_vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -0700473 u64 last_vtime;
474
475 /*
476 * The period this iocg was last active in. Used for deactivation
477 * and invalidating `vtime`.
478 */
479 atomic64_t active_period;
480 struct list_head active_list;
481
Tejun Heo00410f12020-09-01 14:52:34 -0400482 /* see __propagate_weights() and current_hweight() for details */
Tejun Heo7caa4712019-08-28 15:05:58 -0700483 u64 child_active_sum;
484 u64 child_inuse_sum;
485 int hweight_gen;
486 u32 hweight_active;
487 u32 hweight_inuse;
488 bool has_surplus;
489
490 struct wait_queue_head waitq;
491 struct hrtimer waitq_timer;
492 struct hrtimer delay_timer;
493
Tejun Heofe20cdb52020-09-01 14:52:38 -0400494 /* usage is recorded as fractions of WEIGHT_ONE */
Tejun Heo7caa4712019-08-28 15:05:58 -0700495 int usage_idx;
496 u32 usages[NR_USAGE_SLOTS];
497
498 /* this iocg's depth in the hierarchy and ancestors including self */
499 int level;
500 struct ioc_gq *ancestors[];
501};
502
503/* per cgroup */
504struct ioc_cgrp {
505 struct blkcg_policy_data cpd;
506 unsigned int dfl_weight;
507};
508
509struct ioc_now {
510 u64 now_ns;
Tejun Heoce955702020-09-01 14:52:40 -0400511 u64 now;
Tejun Heo7caa4712019-08-28 15:05:58 -0700512 u64 vnow;
513 u64 vrate;
514};
515
516struct iocg_wait {
517 struct wait_queue_entry wait;
518 struct bio *bio;
519 u64 abs_cost;
520 bool committed;
521};
522
523struct iocg_wake_ctx {
524 struct ioc_gq *iocg;
525 u32 hw_inuse;
526 s64 vbudget;
527};
528
529static const struct ioc_params autop[] = {
530 [AUTOP_HDD] = {
531 .qos = {
Tejun Heo7afccca2019-09-25 16:03:35 -0700532 [QOS_RLAT] = 250000, /* 250ms */
533 [QOS_WLAT] = 250000,
Tejun Heo7caa4712019-08-28 15:05:58 -0700534 [QOS_MIN] = VRATE_MIN_PPM,
535 [QOS_MAX] = VRATE_MAX_PPM,
536 },
537 .i_lcoefs = {
538 [I_LCOEF_RBPS] = 174019176,
539 [I_LCOEF_RSEQIOPS] = 41708,
540 [I_LCOEF_RRANDIOPS] = 370,
541 [I_LCOEF_WBPS] = 178075866,
542 [I_LCOEF_WSEQIOPS] = 42705,
543 [I_LCOEF_WRANDIOPS] = 378,
544 },
545 },
546 [AUTOP_SSD_QD1] = {
547 .qos = {
548 [QOS_RLAT] = 25000, /* 25ms */
549 [QOS_WLAT] = 25000,
550 [QOS_MIN] = VRATE_MIN_PPM,
551 [QOS_MAX] = VRATE_MAX_PPM,
552 },
553 .i_lcoefs = {
554 [I_LCOEF_RBPS] = 245855193,
555 [I_LCOEF_RSEQIOPS] = 61575,
556 [I_LCOEF_RRANDIOPS] = 6946,
557 [I_LCOEF_WBPS] = 141365009,
558 [I_LCOEF_WSEQIOPS] = 33716,
559 [I_LCOEF_WRANDIOPS] = 26796,
560 },
561 },
562 [AUTOP_SSD_DFL] = {
563 .qos = {
564 [QOS_RLAT] = 25000, /* 25ms */
565 [QOS_WLAT] = 25000,
566 [QOS_MIN] = VRATE_MIN_PPM,
567 [QOS_MAX] = VRATE_MAX_PPM,
568 },
569 .i_lcoefs = {
570 [I_LCOEF_RBPS] = 488636629,
571 [I_LCOEF_RSEQIOPS] = 8932,
572 [I_LCOEF_RRANDIOPS] = 8518,
573 [I_LCOEF_WBPS] = 427891549,
574 [I_LCOEF_WSEQIOPS] = 28755,
575 [I_LCOEF_WRANDIOPS] = 21940,
576 },
577 .too_fast_vrate_pct = 500,
578 },
579 [AUTOP_SSD_FAST] = {
580 .qos = {
581 [QOS_RLAT] = 5000, /* 5ms */
582 [QOS_WLAT] = 5000,
583 [QOS_MIN] = VRATE_MIN_PPM,
584 [QOS_MAX] = VRATE_MAX_PPM,
585 },
586 .i_lcoefs = {
587 [I_LCOEF_RBPS] = 3102524156LLU,
588 [I_LCOEF_RSEQIOPS] = 724816,
589 [I_LCOEF_RRANDIOPS] = 778122,
590 [I_LCOEF_WBPS] = 1742780862LLU,
591 [I_LCOEF_WSEQIOPS] = 425702,
592 [I_LCOEF_WRANDIOPS] = 443193,
593 },
594 .too_slow_vrate_pct = 10,
595 },
596};
597
598/*
599 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
600 * vtime credit shortage and down on device saturation.
601 */
602static u32 vrate_adj_pct[] =
603 { 0, 0, 0, 0,
604 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
606 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
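/*
 * Illustrative example: at busy_level 40, vrate_adj_pct[40] is 4, so the
 * period timer scales vrate by 96% for that period (or by 104% at
 * busy_level -40 when the device looks underutilized).
 */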
607
608static struct blkcg_policy blkcg_policy_iocost;
609
610/* accessors and helpers */
611static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612{
613 return container_of(rqos, struct ioc, rqos);
614}
615
616static struct ioc *q_to_ioc(struct request_queue *q)
617{
618 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
619}
620
621static const char *q_name(struct request_queue *q)
622{
623 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
624 return kobject_name(q->kobj.parent);
625 else
626 return "<unknown>";
627}
628
629static const char __maybe_unused *ioc_name(struct ioc *ioc)
630{
631 return q_name(ioc->rqos.q);
632}
633
634static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635{
636 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
637}
638
639static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640{
641 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
642}
643
644static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645{
646 return pd_to_blkg(&iocg->pd);
647}
648
649static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650{
651 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
652 struct ioc_cgrp, cpd);
653}
654
655/*
656 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo36a52482019-09-04 12:45:52 -0700657 * weight, the more expensive each IO. Must round up.
Tejun Heo7caa4712019-08-28 15:05:58 -0700658 */
659static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400661 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -0700662}
663
Tejun Heo36a52482019-09-04 12:45:52 -0700664/*
665 * The inverse of abs_cost_to_cost(). Must round up.
666 */
667static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400669 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
Tejun Heo36a52482019-09-04 12:45:52 -0700670}
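/*
 * Illustrative numbers: with hw_inuse at 50% (WEIGHT_ONE / 2),
 * abs_cost_to_cost(100, hw_inuse) == 200 and cost_to_abs_cost(200,
 * hw_inuse) == 100 - vtime is charged at the inverse of the
 * hierarchical share.
 */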
671
Tejun Heo7caa4712019-08-28 15:05:58 -0700672static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
673{
674 bio->bi_iocost_cost = cost;
675 atomic64_add(cost, &iocg->vtime);
676}
677
678#define CREATE_TRACE_POINTS
679#include <trace/events/iocost.h>
680
681/* latency Qos params changed, update period_us and all the dependent params */
682static void ioc_refresh_period_us(struct ioc *ioc)
683{
684 u32 ppm, lat, multi, period_us;
685
686 lockdep_assert_held(&ioc->lock);
687
688 /* pick the higher latency target */
689 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
690 ppm = ioc->params.qos[QOS_RPPM];
691 lat = ioc->params.qos[QOS_RLAT];
692 } else {
693 ppm = ioc->params.qos[QOS_WPPM];
694 lat = ioc->params.qos[QOS_WLAT];
695 }
696
697 /*
698 * We want the period to be long enough to contain a healthy number
699 * of IOs while short enough for granular control. Define it as a
700 * multiple of the latency target. Ideally, the multiplier should
701 * be scaled according to the percentile so that it would nominally
702 * contain a certain number of requests. Let's be simpler and
703 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
704 */
705 if (ppm)
706 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
707 else
708 multi = 2;
709 period_us = multi * lat;
710 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
711
712 /* calculate dependent params */
713 ioc->period_us = period_us;
714 ioc->margin_us = period_us * MARGIN_PCT / 100;
715 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
716 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
717}
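/*
 * Worked example (illustrative numbers): with a 95th percentile read
 * latency target of 25ms (ppm = 950000, lat = 25000), multi becomes
 * max((MILLION - 950000) / 50000, 2) == 2 and period_us ends up at
 * 50000us, comfortably within [MIN_PERIOD, MAX_PERIOD].
 */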
718
719static int ioc_autop_idx(struct ioc *ioc)
720{
721 int idx = ioc->autop_idx;
722 const struct ioc_params *p = &autop[idx];
723 u32 vrate_pct;
724 u64 now_ns;
725
726 /* rotational? */
727 if (!blk_queue_nonrot(ioc->rqos.q))
728 return AUTOP_HDD;
729
730 /* handle SATA SSDs w/ broken NCQ */
731 if (blk_queue_depth(ioc->rqos.q) == 1)
732 return AUTOP_SSD_QD1;
733
734 /* use one of the normal ssd sets */
735 if (idx < AUTOP_SSD_DFL)
736 return AUTOP_SSD_DFL;
737
738 /* if user is overriding anything, maintain what was there */
739 if (ioc->user_qos_params || ioc->user_cost_model)
740 return idx;
741
742 /* step up/down based on the vrate */
743 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
744 VTIME_PER_USEC);
745 now_ns = ktime_get_ns();
746
747 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
748 if (!ioc->autop_too_fast_at)
749 ioc->autop_too_fast_at = now_ns;
750 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
751 return idx + 1;
752 } else {
753 ioc->autop_too_fast_at = 0;
754 }
755
756 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
757 if (!ioc->autop_too_slow_at)
758 ioc->autop_too_slow_at = now_ns;
759 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
760 return idx - 1;
761 } else {
762 ioc->autop_too_slow_at = 0;
763 }
764
765 return idx;
766}
767
768/*
 * Take the following as inputs
770 *
771 * @bps maximum sequential throughput
772 * @seqiops maximum sequential 4k iops
773 * @randiops maximum random 4k iops
774 *
775 * and calculate the linear model cost coefficients.
776 *
777 * *@page per-page cost 1s / (@bps / 4096)
778 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
780 */
781static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
782 u64 *page, u64 *seqio, u64 *randio)
783{
784 u64 v;
785
786 *page = *seqio = *randio = 0;
787
788 if (bps)
789 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
790 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
791
792 if (seqiops) {
793 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
794 if (v > *page)
795 *seqio = v - *page;
796 }
797
798 if (randiops) {
799 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
800 if (v > *page)
801 *randio = v - *page;
802 }
803}
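/*
 * Rough illustration (assumed numbers): a device sustaining ~400MB/s of
 * sequential bandwidth (about 102400 4k pages/s) gets
 * *page = VTIME_PER_SEC / 102400, i.e. ~1.34e6 vtime units per page;
 * *seqio and *randio then only carry whatever per-IO cost remains above
 * that per-page baseline.
 */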
804
805static void ioc_refresh_lcoefs(struct ioc *ioc)
806{
807 u64 *u = ioc->params.i_lcoefs;
808 u64 *c = ioc->params.lcoefs;
809
810 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
811 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
812 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
813 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
814}
815
816static bool ioc_refresh_params(struct ioc *ioc, bool force)
817{
818 const struct ioc_params *p;
819 int idx;
820
821 lockdep_assert_held(&ioc->lock);
822
823 idx = ioc_autop_idx(ioc);
824 p = &autop[idx];
825
826 if (idx == ioc->autop_idx && !force)
827 return false;
828
829 if (idx != ioc->autop_idx)
830 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
831
832 ioc->autop_idx = idx;
833 ioc->autop_too_fast_at = 0;
834 ioc->autop_too_slow_at = 0;
835
836 if (!ioc->user_qos_params)
837 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
838 if (!ioc->user_cost_model)
839 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
840
841 ioc_refresh_period_us(ioc);
842 ioc_refresh_lcoefs(ioc);
843
844 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
845 VTIME_PER_USEC, MILLION);
846 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
847 VTIME_PER_USEC, MILLION);
848
849 return true;
850}
851
852/* take a snapshot of the current [v]time and vrate */
853static void ioc_now(struct ioc *ioc, struct ioc_now *now)
854{
855 unsigned seq;
856
857 now->now_ns = ktime_get();
858 now->now = ktime_to_us(now->now_ns);
859 now->vrate = atomic64_read(&ioc->vtime_rate);
860
861 /*
862 * The current vtime is
863 *
864 * vtime at period start + (wallclock time since the start) * vrate
865 *
866 * As a consistent snapshot of `period_at_vtime` and `period_at` is
867 * needed, they're seqcount protected.
868 */
869 do {
870 seq = read_seqcount_begin(&ioc->period_seqcount);
871 now->vnow = ioc->period_at_vtime +
872 (now->now - ioc->period_at) * now->vrate;
873 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
874}
875
876static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
877{
Tejun Heo7caa4712019-08-28 15:05:58 -0700878 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
879
880 write_seqcount_begin(&ioc->period_seqcount);
881 ioc->period_at = now->now;
882 ioc->period_at_vtime = now->vnow;
883 write_seqcount_end(&ioc->period_seqcount);
884
885 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
886 add_timer(&ioc->timer);
887}
888
889/*
890 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
891 * weight sums and propagate upwards accordingly.
892 */
Tejun Heo00410f12020-09-01 14:52:34 -0400893static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700894{
895 struct ioc *ioc = iocg->ioc;
896 int lvl;
897
898 lockdep_assert_held(&ioc->lock);
899
Tejun Heodb84a722020-09-01 14:52:35 -0400900 inuse = clamp_t(u32, inuse, 1, active);
901
902 if (active == iocg->active && inuse == iocg->inuse)
903 return;
Tejun Heo7caa4712019-08-28 15:05:58 -0700904
905 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
906 struct ioc_gq *parent = iocg->ancestors[lvl];
907 struct ioc_gq *child = iocg->ancestors[lvl + 1];
908 u32 parent_active = 0, parent_inuse = 0;
909
910 /* update the level sums */
911 parent->child_active_sum += (s32)(active - child->active);
912 parent->child_inuse_sum += (s32)(inuse - child->inuse);
		/* apply the updates */
914 child->active = active;
915 child->inuse = inuse;
916
917 /*
918 * The delta between inuse and active sums indicates that
919 * that much of weight is being given away. Parent's inuse
920 * and active should reflect the ratio.
921 */
922 if (parent->child_active_sum) {
923 parent_active = parent->weight;
924 parent_inuse = DIV64_U64_ROUND_UP(
925 parent_active * parent->child_inuse_sum,
926 parent->child_active_sum);
927 }
928
929 /* do we need to keep walking up? */
930 if (parent_active == parent->active &&
931 parent_inuse == parent->inuse)
932 break;
933
934 active = parent_active;
935 inuse = parent_inuse;
936 }
937
938 ioc->weights_updated = true;
939}
940
Tejun Heo00410f12020-09-01 14:52:34 -0400941static void commit_weights(struct ioc *ioc)
Tejun Heo7caa4712019-08-28 15:05:58 -0700942{
943 lockdep_assert_held(&ioc->lock);
944
945 if (ioc->weights_updated) {
946 /* paired with rmb in current_hweight(), see there */
947 smp_wmb();
948 atomic_inc(&ioc->hweight_gen);
949 ioc->weights_updated = false;
950 }
951}
952
Tejun Heo00410f12020-09-01 14:52:34 -0400953static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700954{
Tejun Heo00410f12020-09-01 14:52:34 -0400955 __propagate_weights(iocg, active, inuse);
956 commit_weights(iocg->ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -0700957}
958
959static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
960{
961 struct ioc *ioc = iocg->ioc;
962 int lvl;
963 u32 hwa, hwi;
964 int ioc_gen;
965
966 /* hot path - if uptodate, use cached */
967 ioc_gen = atomic_read(&ioc->hweight_gen);
968 if (ioc_gen == iocg->hweight_gen)
969 goto out;
970
971 /*
Tejun Heo00410f12020-09-01 14:52:34 -0400972 * Paired with wmb in commit_weights(). If we saw the updated
973 * hweight_gen, all the weight updates from __propagate_weights() are
974 * visible too.
Tejun Heo7caa4712019-08-28 15:05:58 -0700975 *
976 * We can race with weight updates during calculation and get it
977 * wrong. However, hweight_gen would have changed and a future
978 * reader will recalculate and we're guaranteed to discard the
979 * wrong result soon.
980 */
981 smp_rmb();
982
Tejun Heofe20cdb52020-09-01 14:52:38 -0400983 hwa = hwi = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -0700984 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
985 struct ioc_gq *parent = iocg->ancestors[lvl];
986 struct ioc_gq *child = iocg->ancestors[lvl + 1];
Tejun Heobd0adb92020-09-01 14:52:39 -0400987 u64 active_sum = READ_ONCE(parent->child_active_sum);
988 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -0700989 u32 active = READ_ONCE(child->active);
990 u32 inuse = READ_ONCE(child->inuse);
991
992 /* we can race with deactivations and either may read as zero */
993 if (!active_sum || !inuse_sum)
994 continue;
995
Tejun Heobd0adb92020-09-01 14:52:39 -0400996 active_sum = max_t(u64, active, active_sum);
997 hwa = div64_u64((u64)hwa * active, active_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -0700998
Tejun Heobd0adb92020-09-01 14:52:39 -0400999 inuse_sum = max_t(u64, inuse, inuse_sum);
1000 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001001 }
1002
1003 iocg->hweight_active = max_t(u32, hwa, 1);
1004 iocg->hweight_inuse = max_t(u32, hwi, 1);
1005 iocg->hweight_gen = ioc_gen;
1006out:
1007 if (hw_activep)
1008 *hw_activep = iocg->hweight_active;
1009 if (hw_inusep)
1010 *hw_inusep = iocg->hweight_inuse;
1011}
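/*
 * Tying this back to the hierarchy example in the header comment: for
 * A0 (active weight 100 under A:100 vs B:300, sibling A1:100), the walk
 * yields hwa = WEIGHT_ONE * 100/400 * 100/200, i.e. 12.5% of WEIGHT_ONE,
 * matching the 12.5% share described there.
 */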
1012
1013static void weight_updated(struct ioc_gq *iocg)
1014{
1015 struct ioc *ioc = iocg->ioc;
1016 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1017 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1018 u32 weight;
1019
1020 lockdep_assert_held(&ioc->lock);
1021
1022 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1023 if (weight != iocg->weight && iocg->active)
Tejun Heo00410f12020-09-01 14:52:34 -04001024 propagate_weights(iocg, weight,
Tejun Heobd0adb92020-09-01 14:52:39 -04001025 DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1026 iocg->weight));
Tejun Heo7caa4712019-08-28 15:05:58 -07001027 iocg->weight = weight;
1028}
1029
1030static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1031{
1032 struct ioc *ioc = iocg->ioc;
1033 u64 last_period, cur_period, max_period_delta;
1034 u64 vtime, vmargin, vmin;
1035 int i;
1036
1037 /*
	 * If we seem to be already active, just update the stamp to tell
	 * the timer that we're still active. We don't mind occasional races.
1040 */
1041 if (!list_empty(&iocg->active_list)) {
1042 ioc_now(ioc, now);
1043 cur_period = atomic64_read(&ioc->cur_period);
1044 if (atomic64_read(&iocg->active_period) != cur_period)
1045 atomic64_set(&iocg->active_period, cur_period);
1046 return true;
1047 }
1048
1049 /* racy check on internal node IOs, treat as root level IOs */
1050 if (iocg->child_active_sum)
1051 return false;
1052
1053 spin_lock_irq(&ioc->lock);
1054
1055 ioc_now(ioc, now);
1056
1057 /* update period */
1058 cur_period = atomic64_read(&ioc->cur_period);
1059 last_period = atomic64_read(&iocg->active_period);
1060 atomic64_set(&iocg->active_period, cur_period);
1061
1062 /* already activated or breaking leaf-only constraint? */
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001063 if (!list_empty(&iocg->active_list))
1064 goto succeed_unlock;
1065 for (i = iocg->level - 1; i > 0; i--)
1066 if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo7caa4712019-08-28 15:05:58 -07001067 goto fail_unlock;
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001068
Tejun Heo7caa4712019-08-28 15:05:58 -07001069 if (iocg->child_active_sum)
1070 goto fail_unlock;
1071
1072 /*
1073 * vtime may wrap when vrate is raised substantially due to
1074 * underestimated IO costs. Look at the period and ignore its
1075 * vtime if the iocg has been idle for too long. Also, cap the
1076 * budget it can start with to the margin.
1077 */
1078 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1079 vtime = atomic64_read(&iocg->vtime);
1080 vmargin = ioc->margin_us * now->vrate;
1081 vmin = now->vnow - vmargin;
1082
1083 if (last_period + max_period_delta < cur_period ||
1084 time_before64(vtime, vmin)) {
1085 atomic64_add(vmin - vtime, &iocg->vtime);
1086 atomic64_add(vmin - vtime, &iocg->done_vtime);
1087 vtime = vmin;
1088 }
1089
1090 /*
1091 * Activate, propagate weight and start period timer if not
1092 * running. Reset hweight_gen to avoid accidental match from
1093 * wrapping.
1094 */
1095 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1096 list_add(&iocg->active_list, &ioc->active_iocgs);
Tejun Heo00410f12020-09-01 14:52:34 -04001097 propagate_weights(iocg, iocg->weight,
1098 iocg->last_inuse ?: iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001099
1100 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1101 last_period, cur_period, vtime);
1102
1103 iocg->last_vtime = vtime;
1104
1105 if (ioc->running == IOC_IDLE) {
1106 ioc->running = IOC_RUNNING;
1107 ioc_start_period(ioc, now);
1108 }
1109
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001110succeed_unlock:
Tejun Heo7caa4712019-08-28 15:05:58 -07001111 spin_unlock_irq(&ioc->lock);
1112 return true;
1113
1114fail_unlock:
1115 spin_unlock_irq(&ioc->lock);
1116 return false;
1117}
1118
Tejun Heo6ef20f72020-09-01 14:52:36 -04001119static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1120{
1121 struct ioc *ioc = iocg->ioc;
1122 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1123 u64 vtime = atomic64_read(&iocg->vtime);
1124 u64 vmargin = ioc->margin_us * now->vrate;
1125 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1126 u64 delta_ns, expires, oexpires;
1127 u32 hw_inuse;
1128
1129 lockdep_assert_held(&iocg->waitq.lock);
1130
1131 /* debt-adjust vtime */
1132 current_hweight(iocg, NULL, &hw_inuse);
1133 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1134
1135 /*
1136 * Clear or maintain depending on the overage. Non-zero vdebt is what
1137 * guarantees that @iocg is online and future iocg_kick_delay() will
1138 * clear use_delay. Don't leave it on when there's no vdebt.
1139 */
1140 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1141 blkcg_clear_delay(blkg);
1142 return false;
1143 }
1144 if (!atomic_read(&blkg->use_delay) &&
1145 time_before_eq64(vtime, now->vnow + vmargin))
1146 return false;
1147
1148 /* use delay */
1149 delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1150 now->vrate) * NSEC_PER_USEC;
1151 blkcg_set_delay(blkg, delta_ns);
1152 expires = now->now_ns + delta_ns;
1153
1154 /* if already active and close enough, don't bother */
1155 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1156 if (hrtimer_is_queued(&iocg->delay_timer) &&
1157 abs(oexpires - expires) <= margin_ns / 4)
1158 return true;
1159
1160 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1161 margin_ns / 4, HRTIMER_MODE_ABS);
1162 return true;
1163}
1164
1165static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1166{
1167 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1168 struct ioc_now now;
1169 unsigned long flags;
1170
1171 spin_lock_irqsave(&iocg->waitq.lock, flags);
1172 ioc_now(iocg->ioc, &now);
1173 iocg_kick_delay(iocg, &now);
1174 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1175
1176 return HRTIMER_NORESTART;
1177}
1178
Tejun Heo7caa4712019-08-28 15:05:58 -07001179static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1180 int flags, void *key)
1181{
1182 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1183 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1184 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1185
1186 ctx->vbudget -= cost;
1187
1188 if (ctx->vbudget < 0)
1189 return -1;
1190
1191 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1192
1193 /*
1194 * autoremove_wake_function() removes the wait entry only when it
1195 * actually changed the task state. We want the wait always
1196 * removed. Remove explicitly and use default_wake_function().
1197 */
1198 list_del_init(&wq_entry->entry);
1199 wait->committed = true;
1200
1201 default_wake_function(wq_entry, mode, flags, key);
1202 return 0;
1203}
1204
1205static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1206{
1207 struct ioc *ioc = iocg->ioc;
1208 struct iocg_wake_ctx ctx = { .iocg = iocg };
1209 u64 margin_ns = (u64)(ioc->period_us *
1210 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
Tejun Heo0b80f982020-05-04 19:27:54 -04001211 u64 vdebt, vshortage, expires, oexpires;
Tejun Heo36a52482019-09-04 12:45:52 -07001212 s64 vbudget;
1213 u32 hw_inuse;
Tejun Heo7caa4712019-08-28 15:05:58 -07001214
1215 lockdep_assert_held(&iocg->waitq.lock);
1216
Tejun Heo36a52482019-09-04 12:45:52 -07001217 current_hweight(iocg, NULL, &hw_inuse);
1218 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1219
1220 /* pay off debt */
Tejun Heo0b80f982020-05-04 19:27:54 -04001221 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
Tejun Heo36a52482019-09-04 12:45:52 -07001222 if (vdebt && vbudget > 0) {
1223 u64 delta = min_t(u64, vbudget, vdebt);
1224 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
Tejun Heo0b80f982020-05-04 19:27:54 -04001225 iocg->abs_vdebt);
Tejun Heo36a52482019-09-04 12:45:52 -07001226
1227 atomic64_add(delta, &iocg->vtime);
1228 atomic64_add(delta, &iocg->done_vtime);
Tejun Heo0b80f982020-05-04 19:27:54 -04001229 iocg->abs_vdebt -= abs_delta;
Tejun Heo7b84b492020-09-01 14:52:37 -04001230
1231 iocg_kick_delay(iocg, now);
Tejun Heo36a52482019-09-04 12:45:52 -07001232 }
1233
Tejun Heo7caa4712019-08-28 15:05:58 -07001234 /*
1235 * Wake up the ones which are due and see how much vtime we'll need
1236 * for the next one.
1237 */
Tejun Heo36a52482019-09-04 12:45:52 -07001238 ctx.hw_inuse = hw_inuse;
1239 ctx.vbudget = vbudget - vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -07001240 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1241 if (!waitqueue_active(&iocg->waitq))
1242 return;
1243 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1244 return;
1245
1246 /* determine next wakeup, add a quarter margin to guarantee chunking */
1247 vshortage = -ctx.vbudget;
1248 expires = now->now_ns +
1249 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1250 expires += margin_ns / 4;
1251
1252 /* if already active and close enough, don't bother */
1253 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1254 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1255 abs(oexpires - expires) <= margin_ns / 4)
1256 return;
1257
1258 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1259 margin_ns / 4, HRTIMER_MODE_ABS);
1260}
1261
1262static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1263{
1264 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1265 struct ioc_now now;
1266 unsigned long flags;
1267
1268 ioc_now(iocg->ioc, &now);
1269
1270 spin_lock_irqsave(&iocg->waitq.lock, flags);
1271 iocg_kick_waitq(iocg, &now);
1272 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1273
1274 return HRTIMER_NORESTART;
1275}
1276
Tejun Heo7caa4712019-08-28 15:05:58 -07001277static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1278{
1279 u32 nr_met[2] = { };
1280 u32 nr_missed[2] = { };
1281 u64 rq_wait_ns = 0;
1282 int cpu, rw;
1283
1284 for_each_online_cpu(cpu) {
1285 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1286 u64 this_rq_wait_ns;
1287
1288 for (rw = READ; rw <= WRITE; rw++) {
Tejun Heo5e124f72020-09-01 14:52:33 -04001289 u32 this_met = local_read(&stat->missed[rw].nr_met);
1290 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001291
1292 nr_met[rw] += this_met - stat->missed[rw].last_met;
1293 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1294 stat->missed[rw].last_met = this_met;
1295 stat->missed[rw].last_missed = this_missed;
1296 }
1297
Tejun Heo5e124f72020-09-01 14:52:33 -04001298 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
Tejun Heo7caa4712019-08-28 15:05:58 -07001299 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1300 stat->last_rq_wait_ns = this_rq_wait_ns;
1301 }
1302
1303 for (rw = READ; rw <= WRITE; rw++) {
1304 if (nr_met[rw] + nr_missed[rw])
1305 missed_ppm_ar[rw] =
1306 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1307 nr_met[rw] + nr_missed[rw]);
1308 else
1309 missed_ppm_ar[rw] = 0;
1310 }
1311
1312 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1313 ioc->period_us * NSEC_PER_USEC);
1314}
1315
1316/* was iocg idle this period? */
1317static bool iocg_is_idle(struct ioc_gq *iocg)
1318{
1319 struct ioc *ioc = iocg->ioc;
1320
1321 /* did something get issued this period? */
1322 if (atomic64_read(&iocg->active_period) ==
1323 atomic64_read(&ioc->cur_period))
1324 return false;
1325
1326 /* is something in flight? */
Tejun Heodcd65892020-03-10 13:07:46 -04001327 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo7caa4712019-08-28 15:05:58 -07001328 return false;
1329
1330 return true;
1331}
1332
1333/* returns usage with margin added if surplus is large enough */
1334static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1335{
1336 /* add margin */
1337 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1338 usage += SURPLUS_SCALE_ABS;
1339
1340 /* don't bother if the surplus is too small */
1341 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1342 return 0;
1343
1344 return usage;
1345}
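/*
 * Illustrative numbers: with usage at 20% and hw_inuse at 50% of
 * WEIGHT_ONE, the margined usage is 20% * 1.25 + 2% = 27%; as that is
 * more than 3% below 50%, 27% is returned and the difference can be
 * donated.
 */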
1346
1347static void ioc_timer_fn(struct timer_list *timer)
1348{
1349 struct ioc *ioc = container_of(timer, struct ioc, timer);
1350 struct ioc_gq *iocg, *tiocg;
1351 struct ioc_now now;
1352 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1353 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1354 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1355 u32 missed_ppm[2], rq_wait_pct;
1356 u64 period_vtime;
Tejun Heo25d41e42019-09-25 16:02:07 -07001357 int prev_busy_level, i;
Tejun Heo7caa4712019-08-28 15:05:58 -07001358
1359 /* how were the latencies during the period? */
1360 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1361
1362 /* take care of active iocgs */
1363 spin_lock_irq(&ioc->lock);
1364
1365 ioc_now(ioc, &now);
1366
1367 period_vtime = now.vnow - ioc->period_at_vtime;
1368 if (WARN_ON_ONCE(!period_vtime)) {
1369 spin_unlock_irq(&ioc->lock);
1370 return;
1371 }
1372
1373 /*
1374 * Waiters determine the sleep durations based on the vrate they
1375 * saw at the time of sleep. If vrate has increased, some waiters
1376 * could be sleeping for too long. Wake up tardy waiters which
1377 * should have woken up in the last period and expire idle iocgs.
1378 */
1379 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Chengming Zhoud9012a52020-07-30 17:03:21 +08001380 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001381 !iocg_is_idle(iocg))
Tejun Heo7caa4712019-08-28 15:05:58 -07001382 continue;
1383
1384 spin_lock(&iocg->waitq.lock);
1385
Tejun Heo0b80f982020-05-04 19:27:54 -04001386 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001387 /* might be oversleeping vtime / hweight changes, kick */
1388 iocg_kick_waitq(iocg, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07001389 } else if (iocg_is_idle(iocg)) {
1390 /* no waiter and idle, deactivate */
1391 iocg->last_inuse = iocg->inuse;
Tejun Heo00410f12020-09-01 14:52:34 -04001392 __propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07001393 list_del_init(&iocg->active_list);
1394 }
1395
1396 spin_unlock(&iocg->waitq.lock);
1397 }
Tejun Heo00410f12020-09-01 14:52:34 -04001398 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001399
1400 /* calc usages and see whether some weights need to be moved around */
1401 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1402 u64 vdone, vtime, vusage, vmargin, vmin;
1403 u32 hw_active, hw_inuse, usage;
1404
1405 /*
1406 * Collect unused and wind vtime closer to vnow to prevent
1407 * iocgs from accumulating a large amount of budget.
1408 */
1409 vdone = atomic64_read(&iocg->done_vtime);
1410 vtime = atomic64_read(&iocg->vtime);
1411 current_hweight(iocg, &hw_active, &hw_inuse);
1412
1413 /*
1414 * Latency QoS detection doesn't account for IOs which are
1415 * in-flight for longer than a period. Detect them by
1416 * comparing vdone against period start. If lagging behind
1417 * IOs from past periods, don't increase vrate.
1418 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001419 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1420 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001421 time_after64(vtime, vdone) &&
1422 time_after64(vtime, now.vnow -
1423 MAX_LAGGING_PERIODS * period_vtime) &&
1424 time_before64(vdone, now.vnow - period_vtime))
1425 nr_lagging++;
1426
1427 if (waitqueue_active(&iocg->waitq))
1428 vusage = now.vnow - iocg->last_vtime;
1429 else if (time_before64(iocg->last_vtime, vtime))
1430 vusage = vtime - iocg->last_vtime;
1431 else
1432 vusage = 0;
1433
1434 iocg->last_vtime += vusage;
1435 /*
1436 * Factor in in-flight vtime into vusage to avoid
1437 * high-latency completions appearing as idle. This should
		 * be done after the above ->last_vtime adjustment.
1439 */
1440 vusage = max(vusage, vtime - vdone);
1441
1442 /* calculate hweight based usage ratio and record */
1443 if (vusage) {
1444 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1445 period_vtime);
1446 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1447 iocg->usages[iocg->usage_idx] = usage;
1448 } else {
1449 usage = 0;
1450 }
1451
1452 /* see whether there's surplus vtime */
1453 vmargin = ioc->margin_us * now.vrate;
1454 vmin = now.vnow - vmargin;
1455
1456 iocg->has_surplus = false;
1457
1458 if (!waitqueue_active(&iocg->waitq) &&
1459 time_before64(vtime, vmin)) {
1460 u64 delta = vmin - vtime;
1461
1462 /* throw away surplus vtime */
1463 atomic64_add(delta, &iocg->vtime);
1464 atomic64_add(delta, &iocg->done_vtime);
1465 iocg->last_vtime += delta;
1466 /* if usage is sufficiently low, maybe it can donate */
1467 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1468 iocg->has_surplus = true;
1469 nr_surpluses++;
1470 }
1471 } else if (hw_inuse < hw_active) {
1472 u32 new_hwi, new_inuse;
1473
1474 /* was donating but might need to take back some */
1475 if (waitqueue_active(&iocg->waitq)) {
1476 new_hwi = hw_active;
1477 } else {
1478 new_hwi = max(hw_inuse,
1479 usage * SURPLUS_SCALE_PCT / 100 +
1480 SURPLUS_SCALE_ABS);
1481 }
1482
1483 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1484 hw_inuse);
1485 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1486
1487 if (new_inuse > iocg->inuse) {
1488 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1489 iocg->inuse, new_inuse,
1490 hw_inuse, new_hwi);
Tejun Heo00410f12020-09-01 14:52:34 -04001491 __propagate_weights(iocg, iocg->weight,
1492 new_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -07001493 }
1494 } else {
			/* genuinely out of vtime */
1496 nr_shortages++;
1497 }
1498 }
1499
1500 if (!nr_shortages || !nr_surpluses)
1501 goto skip_surplus_transfers;
1502
1503 /* there are both shortages and surpluses, transfer surpluses */
1504 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1505 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1506 int nr_valid = 0;
1507
1508 if (!iocg->has_surplus)
1509 continue;
1510
1511 /* base the decision on max historical usage */
1512 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1513 if (iocg->usages[i]) {
1514 usage = max(usage, iocg->usages[i]);
1515 nr_valid++;
1516 }
1517 }
1518 if (nr_valid < MIN_VALID_USAGES)
1519 continue;
1520
1521 current_hweight(iocg, &hw_active, &hw_inuse);
1522 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1523 if (!new_hwi)
1524 continue;
1525
1526 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1527 hw_inuse);
1528 if (new_inuse < iocg->inuse) {
1529 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1530 iocg->inuse, new_inuse,
1531 hw_inuse, new_hwi);
Tejun Heo00410f12020-09-01 14:52:34 -04001532 __propagate_weights(iocg, iocg->weight, new_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -07001533 }
1534 }
1535skip_surplus_transfers:
Tejun Heo00410f12020-09-01 14:52:34 -04001536 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001537
1538 /*
1539 * If q is getting clogged or we're missing too much, we're issuing
1540 * too much IO and should lower vtime rate. If we're not missing
1541 * and experiencing shortages but not surpluses, we're too stingy
1542 * and should increase vtime rate.
1543 */
Tejun Heo25d41e42019-09-25 16:02:07 -07001544 prev_busy_level = ioc->busy_level;
Tejun Heo7caa4712019-08-28 15:05:58 -07001545 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1546 missed_ppm[READ] > ppm_rthr ||
1547 missed_ppm[WRITE] > ppm_wthr) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001548 /* clearly missing QoS targets, slow down vrate */
Tejun Heo7caa4712019-08-28 15:05:58 -07001549 ioc->busy_level = max(ioc->busy_level, 0);
1550 ioc->busy_level++;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001551 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001552 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1553 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001554 /* QoS targets are being met with >25% margin */
1555 if (nr_shortages) {
1556 /*
1557 * We're throttling while the device has spare
1558 * capacity. If vrate was being slowed down, stop.
1559 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001560 ioc->busy_level = min(ioc->busy_level, 0);
Tejun Heo81ca6272019-10-14 17:18:11 -07001561
1562 /*
1563 * If there are IOs spanning multiple periods, wait
1564 * them out before pushing the device harder. If
1565 * there are surpluses, let redistribution work it
1566 * out first.
1567 */
1568 if (!nr_lagging && !nr_surpluses)
Tejun Heo7cd806a2019-09-25 16:03:09 -07001569 ioc->busy_level--;
Tejun Heo81ca6272019-10-14 17:18:11 -07001570 } else {
1571 /*
1572 * Nobody is being throttled and the users aren't
1573 * issuing enough IOs to saturate the device. We
1574 * simply don't know how close the device is to
1575 * saturation. Coast.
1576 */
1577 ioc->busy_level = 0;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001578 }
Tejun Heo7caa4712019-08-28 15:05:58 -07001579 } else {
		/* inside the hysteresis margin, we're good */
Tejun Heo7caa4712019-08-28 15:05:58 -07001581 ioc->busy_level = 0;
1582 }
1583
1584 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1585
Tejun Heo7cd806a2019-09-25 16:03:09 -07001586 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001587 u64 vrate = atomic64_read(&ioc->vtime_rate);
1588 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1589
1590 /* rq_wait signal is always reliable, ignore user vrate_min */
1591 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1592 vrate_min = VRATE_MIN;
1593
1594 /*
1595 * If vrate is out of bounds, apply clamp gradually as the
1596 * bounds can change abruptly. Otherwise, apply busy_level
1597 * based adjustment.
1598 */
1599 if (vrate < vrate_min) {
1600 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1601 100);
1602 vrate = min(vrate, vrate_min);
1603 } else if (vrate > vrate_max) {
1604 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1605 100);
1606 vrate = max(vrate, vrate_max);
1607 } else {
1608 int idx = min_t(int, abs(ioc->busy_level),
1609 ARRAY_SIZE(vrate_adj_pct) - 1);
1610 u32 adj_pct = vrate_adj_pct[idx];
1611
1612 if (ioc->busy_level > 0)
1613 adj_pct = 100 - adj_pct;
1614 else
1615 adj_pct = 100 + adj_pct;
1616
1617 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1618 vrate_min, vrate_max);
1619 }
1620
Waiman Longd6c8e942020-04-21 09:07:55 -04001621 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
Tejun Heo7caa4712019-08-28 15:05:58 -07001622 nr_lagging, nr_shortages,
1623 nr_surpluses);
1624
1625 atomic64_set(&ioc->vtime_rate, vrate);
1626 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1627 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
Tejun Heo25d41e42019-09-25 16:02:07 -07001628 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1629 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
Waiman Longd6c8e942020-04-21 09:07:55 -04001630 missed_ppm, rq_wait_pct, nr_lagging,
Tejun Heo25d41e42019-09-25 16:02:07 -07001631 nr_shortages, nr_surpluses);
Tejun Heo7caa4712019-08-28 15:05:58 -07001632 }
1633
1634 ioc_refresh_params(ioc, false);
1635
1636 /*
1637	 * This period is done. Move on to the next one. If nothing's
1638 * going on with the device, stop the timer.
1639 */
1640 atomic64_inc(&ioc->cur_period);
1641
1642 if (ioc->running != IOC_STOP) {
1643 if (!list_empty(&ioc->active_iocgs)) {
1644 ioc_start_period(ioc, &now);
1645 } else {
1646 ioc->busy_level = 0;
1647 ioc->running = IOC_IDLE;
1648 }
1649 }
1650
1651 spin_unlock_irq(&ioc->lock);
1652}
1653
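/*
 * Builtin linear cost model - a rough summary of what the function below
 * computes.  An IO is charged a base cost depending on whether it's
 * sequential or random (decided by how far its start sector is from
 * iocg->cursor, the end of the last IO) plus a size cost proportional to
 * its length in pages:
 *
 *	cost = coef_{seq|rand}io + nr_pages * coef_page
 *
 * Merges skip the base cost as they don't issue a new IO of their own.
 */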
1654static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1655 bool is_merge, u64 *costp)
1656{
1657 struct ioc *ioc = iocg->ioc;
1658 u64 coef_seqio, coef_randio, coef_page;
1659 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1660 u64 seek_pages = 0;
1661 u64 cost = 0;
1662
1663 switch (bio_op(bio)) {
1664 case REQ_OP_READ:
1665 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1666 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1667 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1668 break;
1669 case REQ_OP_WRITE:
1670 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1671 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1672 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1673 break;
1674 default:
1675 goto out;
1676 }
1677
1678 if (iocg->cursor) {
1679 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1680 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1681 }
1682
1683 if (!is_merge) {
1684 if (seek_pages > LCOEF_RANDIO_PAGES) {
1685 cost += coef_randio;
1686 } else {
1687 cost += coef_seqio;
1688 }
1689 }
1690 cost += pages * coef_page;
1691out:
1692 *costp = cost;
1693}
1694
1695static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1696{
1697 u64 cost;
1698
1699 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1700 return cost;
1701}
1702
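/*
 * Size-only cost charged at completion time.  Unlike the issue path above,
 * only the per-page linear coefficient is applied, based on the sectors
 * that were actually transferred (blk_rq_stats_sectors()).  ioc_rqos_done()
 * uses it to discount the expected transfer time when judging whether a
 * request met its latency target.
 */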
Tejun Heocd006502020-04-13 12:27:56 -04001703static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1704 u64 *costp)
1705{
1706 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1707
1708 switch (req_op(rq)) {
1709 case REQ_OP_READ:
1710 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1711 break;
1712 case REQ_OP_WRITE:
1713 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1714 break;
1715 default:
1716 *costp = 0;
1717 }
1718}
1719
1720static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1721{
1722 u64 cost;
1723
1724 calc_size_vtime_cost_builtin(rq, ioc, &cost);
1725 return cost;
1726}
1727
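/*
 * Issue-side throttling - a short roadmap of the function below.  The bio's
 * absolute cost is scaled by the cgroup's hweight_inuse into device vtime.
 * If the cgroup is within its vtime budget and nobody is already waiting,
 * the bio is charged and issued immediately.  Otherwise the cost is either
 * recorded as debt (for IOs which can't block - root blkg issue or pending
 * fatal signal) or the task sleeps on iocg->waitq until the waker commits
 * the charge.
 */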
Tejun Heo7caa4712019-08-28 15:05:58 -07001728static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1729{
1730 struct blkcg_gq *blkg = bio->bi_blkg;
1731 struct ioc *ioc = rqos_to_ioc(rqos);
1732 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1733 struct ioc_now now;
1734 struct iocg_wait wait;
1735 u32 hw_active, hw_inuse;
1736 u64 abs_cost, cost, vtime;
1737
1738 /* bypass IOs if disabled or for root cgroup */
1739 if (!ioc->enabled || !iocg->level)
1740 return;
1741
1742 /* always activate so that even 0 cost IOs get protected to some level */
1743 if (!iocg_activate(iocg, &now))
1744 return;
1745
1746 /* calculate the absolute vtime cost */
1747 abs_cost = calc_vtime_cost(bio, iocg, false);
1748 if (!abs_cost)
1749 return;
1750
1751 iocg->cursor = bio_end_sector(bio);
1752
1753 vtime = atomic64_read(&iocg->vtime);
1754 current_hweight(iocg, &hw_active, &hw_inuse);
1755
1756 if (hw_inuse < hw_active &&
1757 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1758 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1759 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1760 spin_lock_irq(&ioc->lock);
Tejun Heo00410f12020-09-01 14:52:34 -04001761 propagate_weights(iocg, iocg->weight, iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001762 spin_unlock_irq(&ioc->lock);
1763 current_hweight(iocg, &hw_active, &hw_inuse);
1764 }
1765
1766 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1767
1768 /*
1769 * If no one's waiting and within budget, issue right away. The
1770 * tests are racy but the races aren't systemic - we only miss once
1771 * in a while which is fine.
1772 */
Tejun Heo0b80f982020-05-04 19:27:54 -04001773 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001774 time_before_eq64(vtime + cost, now.vnow)) {
1775 iocg_commit_bio(iocg, bio, cost);
1776 return;
1777 }
1778
Tejun Heo36a52482019-09-04 12:45:52 -07001779 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04001780 * We activated above but w/o any synchronization. Deactivation is
1781 * synchronized with waitq.lock and we won't get deactivated as long
1782	 * as we're waiting or have debt, so we're good if we're activated
1783 * here. In the unlikely case that we aren't, just issue the IO.
1784 */
1785 spin_lock_irq(&iocg->waitq.lock);
1786
1787 if (unlikely(list_empty(&iocg->active_list))) {
1788 spin_unlock_irq(&iocg->waitq.lock);
1789 iocg_commit_bio(iocg, bio, cost);
1790 return;
1791 }
1792
1793 /*
1794 * We're over budget. If @bio has to be issued regardless, remember
1795 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1796 * off the debt before waking more IOs.
1797 *
Tejun Heo36a52482019-09-04 12:45:52 -07001798 * This way, the debt is continuously paid off each period with the
Tejun Heo0b80f982020-05-04 19:27:54 -04001799 * actual budget available to the cgroup. If we just wound vtime, we
1800 * would incorrectly use the current hw_inuse for the entire amount
1801 * which, for example, can lead to the cgroup staying blocked for a
1802 * long time even with substantially raised hw_inuse.
1803 *
1804 * An iocg with vdebt should stay online so that the timer can keep
1805	 * deducting its vdebt and [de]activating the use_delay mechanism
1806 * accordingly. We don't want to race against the timer trying to
1807 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1808 * penalizing the cgroup and its descendants.
Tejun Heo36a52482019-09-04 12:45:52 -07001809 */
Tejun Heo7caa4712019-08-28 15:05:58 -07001810 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
Tejun Heo0b80f982020-05-04 19:27:54 -04001811 iocg->abs_vdebt += abs_cost;
Tejun Heo54c52e12020-04-13 12:27:55 -04001812 if (iocg_kick_delay(iocg, &now))
Tejun Heod7bd15a2019-12-16 13:34:00 -08001813 blkcg_schedule_throttle(rqos->q,
1814 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heo0b80f982020-05-04 19:27:54 -04001815 spin_unlock_irq(&iocg->waitq.lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07001816 return;
1817 }
1818
1819 /*
1820 * Append self to the waitq and schedule the wakeup timer if we're
1821 * the first waiter. The timer duration is calculated based on the
1822 * current vrate. vtime and hweight changes can make it too short
1823 * or too long. Each wait entry records the absolute cost it's
1824 * waiting for to allow re-evaluation using a custom wait entry.
1825 *
1826 * If too short, the timer simply reschedules itself. If too long,
1827 * the period timer will notice and trigger wakeups.
1828 *
1829 * All waiters are on iocg->waitq and the wait states are
1830 * synchronized using waitq.lock.
1831 */
Tejun Heo7caa4712019-08-28 15:05:58 -07001832 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1833 wait.wait.private = current;
1834 wait.bio = bio;
1835 wait.abs_cost = abs_cost;
1836 wait.committed = false; /* will be set true by waker */
1837
1838 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1839 iocg_kick_waitq(iocg, &now);
1840
1841 spin_unlock_irq(&iocg->waitq.lock);
1842
1843 while (true) {
1844 set_current_state(TASK_UNINTERRUPTIBLE);
1845 if (wait.committed)
1846 break;
1847 io_schedule();
1848 }
1849
1850 /* waker already committed us, proceed */
1851 finish_wait(&iocg->waitq, &wait.wait);
1852}
1853
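/*
 * Merges don't issue a new IO of their own, so only the size cost is
 * charged here (calc_vtime_cost() with is_merge == true skips the seq/rand
 * base cost).  If there's budget and the existing request already carries a
 * cost, the merge is charged immediately; otherwise it's accounted as debt
 * while the cgroup is online.
 */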
1854static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1855 struct bio *bio)
1856{
1857 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heoe1518f62019-09-04 12:45:53 -07001858 struct ioc *ioc = iocg->ioc;
Tejun Heo7caa4712019-08-28 15:05:58 -07001859 sector_t bio_end = bio_end_sector(bio);
Tejun Heoe1518f62019-09-04 12:45:53 -07001860 struct ioc_now now;
Tejun Heo7caa4712019-08-28 15:05:58 -07001861 u32 hw_inuse;
1862 u64 abs_cost, cost;
Tejun Heo0b80f982020-05-04 19:27:54 -04001863 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07001864
Tejun Heoe1518f62019-09-04 12:45:53 -07001865 /* bypass if disabled or for root cgroup */
1866 if (!ioc->enabled || !iocg->level)
Tejun Heo7caa4712019-08-28 15:05:58 -07001867 return;
1868
1869 abs_cost = calc_vtime_cost(bio, iocg, true);
1870 if (!abs_cost)
1871 return;
1872
Tejun Heoe1518f62019-09-04 12:45:53 -07001873 ioc_now(ioc, &now);
1874 current_hweight(iocg, NULL, &hw_inuse);
1875 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1876
Tejun Heo7caa4712019-08-28 15:05:58 -07001877 /* update cursor if backmerging into the request at the cursor */
1878 if (blk_rq_pos(rq) < bio_end &&
1879 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1880 iocg->cursor = bio_end;
1881
Tejun Heoe1518f62019-09-04 12:45:53 -07001882 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04001883 * Charge if there's enough vtime budget and the existing request has
1884 * cost assigned.
Tejun Heoe1518f62019-09-04 12:45:53 -07001885 */
1886 if (rq->bio && rq->bio->bi_iocost_cost &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001887 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
Tejun Heoe1518f62019-09-04 12:45:53 -07001888 iocg_commit_bio(iocg, bio, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04001889 return;
1890 }
1891
1892 /*
1893 * Otherwise, account it as debt if @iocg is online, which it should
1894 * be for the vast majority of cases. See debt handling in
1895 * ioc_rqos_throttle() for details.
1896 */
1897 spin_lock_irqsave(&iocg->waitq.lock, flags);
1898 if (likely(!list_empty(&iocg->active_list))) {
1899 iocg->abs_vdebt += abs_cost;
Jens Axboe873f1c82020-05-09 16:13:58 -06001900 iocg_kick_delay(iocg, &now);
Tejun Heo0b80f982020-05-04 19:27:54 -04001901 } else {
1902 iocg_commit_bio(iocg, bio, cost);
1903 }
1904 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07001905}
1906
1907static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1908{
1909 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1910
1911 if (iocg && bio->bi_iocost_cost)
1912 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1913}
1914
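/*
 * Completion accounting.  Per-cpu counters track, separately for reads and
 * writes, whether each request met its latency target once the expected
 * transfer time is discounted, plus the total time spent waiting for
 * request allocation.  ioc_timer_fn() folds these into missed_ppm and
 * rq_wait_pct to drive the vrate adjustments above.
 */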
1915static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1916{
1917 struct ioc *ioc = rqos_to_ioc(rqos);
Tejun Heo5e124f72020-09-01 14:52:33 -04001918 struct ioc_pcpu_stat *ccs;
Tejun Heocd006502020-04-13 12:27:56 -04001919 u64 on_q_ns, rq_wait_ns, size_nsec;
Tejun Heo7caa4712019-08-28 15:05:58 -07001920 int pidx, rw;
1921
1922 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1923 return;
1924
1925 switch (req_op(rq) & REQ_OP_MASK) {
1926 case REQ_OP_READ:
1927 pidx = QOS_RLAT;
1928 rw = READ;
1929 break;
1930 case REQ_OP_WRITE:
1931 pidx = QOS_WLAT;
1932 rw = WRITE;
1933 break;
1934 default:
1935 return;
1936 }
1937
1938 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1939 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
Tejun Heocd006502020-04-13 12:27:56 -04001940 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
Tejun Heo7caa4712019-08-28 15:05:58 -07001941
Tejun Heo5e124f72020-09-01 14:52:33 -04001942 ccs = get_cpu_ptr(ioc->pcpu_stat);
1943
Tejun Heocd006502020-04-13 12:27:56 -04001944 if (on_q_ns <= size_nsec ||
1945 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
Tejun Heo5e124f72020-09-01 14:52:33 -04001946 local_inc(&ccs->missed[rw].nr_met);
Tejun Heo7caa4712019-08-28 15:05:58 -07001947 else
Tejun Heo5e124f72020-09-01 14:52:33 -04001948 local_inc(&ccs->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001949
Tejun Heo5e124f72020-09-01 14:52:33 -04001950 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
1951
1952 put_cpu_ptr(ccs);
Tejun Heo7caa4712019-08-28 15:05:58 -07001953}
1954
1955static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1956{
1957 struct ioc *ioc = rqos_to_ioc(rqos);
1958
1959 spin_lock_irq(&ioc->lock);
1960 ioc_refresh_params(ioc, false);
1961 spin_unlock_irq(&ioc->lock);
1962}
1963
1964static void ioc_rqos_exit(struct rq_qos *rqos)
1965{
1966 struct ioc *ioc = rqos_to_ioc(rqos);
1967
1968 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1969
1970 spin_lock_irq(&ioc->lock);
1971 ioc->running = IOC_STOP;
1972 spin_unlock_irq(&ioc->lock);
1973
1974 del_timer_sync(&ioc->timer);
1975 free_percpu(ioc->pcpu_stat);
1976 kfree(ioc);
1977}
1978
1979static struct rq_qos_ops ioc_rqos_ops = {
1980 .throttle = ioc_rqos_throttle,
1981 .merge = ioc_rqos_merge,
1982 .done_bio = ioc_rqos_done_bio,
1983 .done = ioc_rqos_done,
1984 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1985 .exit = ioc_rqos_exit,
1986};
1987
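/*
 * Lazy per-queue initialization, called the first time io.cost.qos or
 * io.cost.model is written for a device.  Allocates the ioc and per-cpu
 * stats, registers the rq_qos ops and activates the iocost blkcg policy on
 * the queue.  The controller itself stays disabled until "enable=1" is
 * written to io.cost.qos.
 */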
1988static int blk_iocost_init(struct request_queue *q)
1989{
1990 struct ioc *ioc;
1991 struct rq_qos *rqos;
Tejun Heo5e124f72020-09-01 14:52:33 -04001992 int i, cpu, ret;
Tejun Heo7caa4712019-08-28 15:05:58 -07001993
1994 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1995 if (!ioc)
1996 return -ENOMEM;
1997
1998 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1999 if (!ioc->pcpu_stat) {
2000 kfree(ioc);
2001 return -ENOMEM;
2002 }
2003
Tejun Heo5e124f72020-09-01 14:52:33 -04002004 for_each_possible_cpu(cpu) {
2005 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2006
2007 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2008 local_set(&ccs->missed[i].nr_met, 0);
2009 local_set(&ccs->missed[i].nr_missed, 0);
2010 }
2011 local64_set(&ccs->rq_wait_ns, 0);
2012 }
2013
Tejun Heo7caa4712019-08-28 15:05:58 -07002014 rqos = &ioc->rqos;
2015 rqos->id = RQ_QOS_COST;
2016 rqos->ops = &ioc_rqos_ops;
2017 rqos->q = q;
2018
2019 spin_lock_init(&ioc->lock);
2020 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2021 INIT_LIST_HEAD(&ioc->active_iocgs);
2022
2023 ioc->running = IOC_IDLE;
2024 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +02002025 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002026 ioc->period_at = ktime_to_us(ktime_get());
2027 atomic64_set(&ioc->cur_period, 0);
2028 atomic_set(&ioc->hweight_gen, 0);
2029
2030 spin_lock_irq(&ioc->lock);
2031 ioc->autop_idx = AUTOP_INVALID;
2032 ioc_refresh_params(ioc, true);
2033 spin_unlock_irq(&ioc->lock);
2034
2035 rq_qos_add(q, rqos);
2036 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2037 if (ret) {
2038 rq_qos_del(q, rqos);
Tejun Heo3532e722019-08-29 08:53:06 -07002039 free_percpu(ioc->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002040 kfree(ioc);
2041 return ret;
2042 }
2043 return 0;
2044}
2045
2046static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2047{
2048 struct ioc_cgrp *iocc;
2049
2050 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heoe916ad22019-08-30 06:10:58 -07002051 if (!iocc)
2052 return NULL;
Tejun Heo7caa4712019-08-28 15:05:58 -07002053
Tejun Heobd0adb92020-09-01 14:52:39 -04002054 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002055 return &iocc->cpd;
2056}
2057
2058static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2059{
2060 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2061}
2062
2063static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2064 struct blkcg *blkcg)
2065{
2066 int levels = blkcg->css.cgroup->level + 1;
2067 struct ioc_gq *iocg;
2068
Gustavo A. R. Silvaf61d6e22020-06-19 18:08:30 -05002069 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
Tejun Heo7caa4712019-08-28 15:05:58 -07002070 if (!iocg)
2071 return NULL;
2072
2073 return &iocg->pd;
2074}
2075
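/*
 * Per-cgroup (iocg) initialization.  The iocg starts with vtime and
 * done_vtime caught up to the current device vtime, full hierarchical
 * weights, an empty waitq, and its ancestors[] table filled in so hweight
 * propagation can walk the hierarchy without extra lookups.
 */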
2076static void ioc_pd_init(struct blkg_policy_data *pd)
2077{
2078 struct ioc_gq *iocg = pd_to_iocg(pd);
2079 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2080 struct ioc *ioc = q_to_ioc(blkg->q);
2081 struct ioc_now now;
2082 struct blkcg_gq *tblkg;
2083 unsigned long flags;
2084
2085 ioc_now(ioc, &now);
2086
2087 iocg->ioc = ioc;
2088 atomic64_set(&iocg->vtime, now.vnow);
2089 atomic64_set(&iocg->done_vtime, now.vnow);
2090 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2091 INIT_LIST_HEAD(&iocg->active_list);
Tejun Heofe20cdb52020-09-01 14:52:38 -04002092 iocg->hweight_active = WEIGHT_ONE;
2093 iocg->hweight_inuse = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002094
2095 init_waitqueue_head(&iocg->waitq);
2096 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2097 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2098 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2099 iocg->delay_timer.function = iocg_delay_timer_fn;
2100
2101 iocg->level = blkg->blkcg->css.cgroup->level;
2102
2103 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2104 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2105 iocg->ancestors[tiocg->level] = tiocg;
2106 }
2107
2108 spin_lock_irqsave(&ioc->lock, flags);
2109 weight_updated(iocg);
2110 spin_unlock_irqrestore(&ioc->lock, flags);
2111}
2112
2113static void ioc_pd_free(struct blkg_policy_data *pd)
2114{
2115 struct ioc_gq *iocg = pd_to_iocg(pd);
2116 struct ioc *ioc = iocg->ioc;
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002117 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002118
2119 if (ioc) {
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002120 spin_lock_irqsave(&ioc->lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002121 if (!list_empty(&iocg->active_list)) {
Tejun Heo00410f12020-09-01 14:52:34 -04002122 propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07002123 list_del_init(&iocg->active_list);
2124 }
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002125 spin_unlock_irqrestore(&ioc->lock, flags);
Tejun Heoe036c4c2019-09-10 09:15:25 -07002126
2127 hrtimer_cancel(&iocg->waitq_timer);
2128 hrtimer_cancel(&iocg->delay_timer);
Tejun Heo7caa4712019-08-28 15:05:58 -07002129 }
2130 kfree(iocg);
2131}
2132
2133static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2134 int off)
2135{
2136 const char *dname = blkg_dev_name(pd->blkg);
2137 struct ioc_gq *iocg = pd_to_iocg(pd);
2138
2139 if (dname && iocg->cfg_weight)
Tejun Heobd0adb92020-09-01 14:52:39 -04002140 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002141 return 0;
2142}
2143
2144
2145static int ioc_weight_show(struct seq_file *sf, void *v)
2146{
2147 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2148 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2149
Tejun Heobd0adb92020-09-01 14:52:39 -04002150 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002151 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2152 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2153 return 0;
2154}
2155
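/*
 * io.weight accepts either a cgroup-wide default ("default $WEIGHT" or just
 * "$WEIGHT") or a per-device override ("$MAJ:$MIN $WEIGHT", or
 * "$MAJ:$MIN default" to drop the override).  A hypothetical example - the
 * device numbers are illustrative only:
 *
 *	# echo "default 100" > io.weight
 *	# echo "8:16 200" > io.weight
 */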
2156static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2157 size_t nbytes, loff_t off)
2158{
2159 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2160 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2161 struct blkg_conf_ctx ctx;
2162 struct ioc_gq *iocg;
2163 u32 v;
2164 int ret;
2165
2166 if (!strchr(buf, ':')) {
2167 struct blkcg_gq *blkg;
2168
2169 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2170 return -EINVAL;
2171
2172 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2173 return -EINVAL;
2174
2175 spin_lock(&blkcg->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002176 iocc->dfl_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002177 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2178 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2179
2180 if (iocg) {
2181 spin_lock_irq(&iocg->ioc->lock);
2182 weight_updated(iocg);
2183 spin_unlock_irq(&iocg->ioc->lock);
2184 }
2185 }
2186 spin_unlock(&blkcg->lock);
2187
2188 return nbytes;
2189 }
2190
2191 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2192 if (ret)
2193 return ret;
2194
2195 iocg = blkg_to_iocg(ctx.blkg);
2196
2197 if (!strncmp(ctx.body, "default", 7)) {
2198 v = 0;
2199 } else {
2200 if (!sscanf(ctx.body, "%u", &v))
2201 goto einval;
2202 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2203 goto einval;
2204 }
2205
Dan Carpenter41591a52019-10-31 13:53:41 +03002206 spin_lock(&iocg->ioc->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002207 iocg->cfg_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002208 weight_updated(iocg);
Dan Carpenter41591a52019-10-31 13:53:41 +03002209 spin_unlock(&iocg->ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002210
2211 blkg_conf_finish(&ctx);
2212 return nbytes;
2213
2214einval:
2215 blkg_conf_finish(&ctx);
2216 return -EINVAL;
2217}
2218
2219static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2220 int off)
2221{
2222 const char *dname = blkg_dev_name(pd->blkg);
2223 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2224
2225 if (!dname)
2226 return 0;
2227
2228 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2229 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2230 ioc->params.qos[QOS_RPPM] / 10000,
2231 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2232 ioc->params.qos[QOS_RLAT],
2233 ioc->params.qos[QOS_WPPM] / 10000,
2234 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2235 ioc->params.qos[QOS_WLAT],
2236 ioc->params.qos[QOS_MIN] / 10000,
2237 ioc->params.qos[QOS_MIN] % 10000 / 100,
2238 ioc->params.qos[QOS_MAX] / 10000,
2239 ioc->params.qos[QOS_MAX] % 10000 / 100);
2240 return 0;
2241}
2242
2243static int ioc_qos_show(struct seq_file *sf, void *v)
2244{
2245 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2246
2247 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2248 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2249 return 0;
2250}
2251
2252static const match_table_t qos_ctrl_tokens = {
2253 { QOS_ENABLE, "enable=%u" },
2254 { QOS_CTRL, "ctrl=%s" },
2255 { NR_QOS_CTRL_PARAMS, NULL },
2256};
2257
2258static const match_table_t qos_tokens = {
2259 { QOS_RPPM, "rpct=%s" },
2260 { QOS_RLAT, "rlat=%u" },
2261 { QOS_WPPM, "wpct=%s" },
2262 { QOS_WLAT, "wlat=%u" },
2263 { QOS_MIN, "min=%s" },
2264 { QOS_MAX, "max=%s" },
2265 { NR_QOS_PARAMS, NULL },
2266};
2267
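/*
 * io.cost.qos takes a device followed by any of the key=value pairs from
 * the two token tables above.  A hypothetical example enabling the
 * controller with user-set latency targets (device numbers and values are
 * illustrative only):
 *
 *	# echo "8:16 enable=1 rpct=95.00 rlat=10000 wlat=20000" > io.cost.qos
 */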
2268static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2269 size_t nbytes, loff_t off)
2270{
2271 struct gendisk *disk;
2272 struct ioc *ioc;
2273 u32 qos[NR_QOS_PARAMS];
2274 bool enable, user;
2275 char *p;
2276 int ret;
2277
2278 disk = blkcg_conf_get_disk(&input);
2279 if (IS_ERR(disk))
2280 return PTR_ERR(disk);
2281
2282 ioc = q_to_ioc(disk->queue);
2283 if (!ioc) {
2284 ret = blk_iocost_init(disk->queue);
2285 if (ret)
2286 goto err;
2287 ioc = q_to_ioc(disk->queue);
2288 }
2289
2290 spin_lock_irq(&ioc->lock);
2291 memcpy(qos, ioc->params.qos, sizeof(qos));
2292 enable = ioc->enabled;
2293 user = ioc->user_qos_params;
2294 spin_unlock_irq(&ioc->lock);
2295
2296 while ((p = strsep(&input, " \t\n"))) {
2297 substring_t args[MAX_OPT_ARGS];
2298 char buf[32];
2299 int tok;
2300 s64 v;
2301
2302 if (!*p)
2303 continue;
2304
2305 switch (match_token(p, qos_ctrl_tokens, args)) {
2306 case QOS_ENABLE:
2307 match_u64(&args[0], &v);
2308 enable = v;
2309 continue;
2310 case QOS_CTRL:
2311 match_strlcpy(buf, &args[0], sizeof(buf));
2312 if (!strcmp(buf, "auto"))
2313 user = false;
2314 else if (!strcmp(buf, "user"))
2315 user = true;
2316 else
2317 goto einval;
2318 continue;
2319 }
2320
2321 tok = match_token(p, qos_tokens, args);
2322 switch (tok) {
2323 case QOS_RPPM:
2324 case QOS_WPPM:
2325 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2326 sizeof(buf))
2327 goto einval;
2328 if (cgroup_parse_float(buf, 2, &v))
2329 goto einval;
2330 if (v < 0 || v > 10000)
2331 goto einval;
2332 qos[tok] = v * 100;
2333 break;
2334 case QOS_RLAT:
2335 case QOS_WLAT:
2336 if (match_u64(&args[0], &v))
2337 goto einval;
2338 qos[tok] = v;
2339 break;
2340 case QOS_MIN:
2341 case QOS_MAX:
2342 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2343 sizeof(buf))
2344 goto einval;
2345 if (cgroup_parse_float(buf, 2, &v))
2346 goto einval;
2347 if (v < 0)
2348 goto einval;
2349 qos[tok] = clamp_t(s64, v * 100,
2350 VRATE_MIN_PPM, VRATE_MAX_PPM);
2351 break;
2352 default:
2353 goto einval;
2354 }
2355 user = true;
2356 }
2357
2358 if (qos[QOS_MIN] > qos[QOS_MAX])
2359 goto einval;
2360
2361 spin_lock_irq(&ioc->lock);
2362
2363 if (enable) {
Tejun Heocd006502020-04-13 12:27:56 -04002364 blk_stat_enable_accounting(ioc->rqos.q);
Tejun Heo7caa4712019-08-28 15:05:58 -07002365 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2366 ioc->enabled = true;
2367 } else {
2368 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2369 ioc->enabled = false;
2370 }
2371
2372 if (user) {
2373 memcpy(ioc->params.qos, qos, sizeof(qos));
2374 ioc->user_qos_params = true;
2375 } else {
2376 ioc->user_qos_params = false;
2377 }
2378
2379 ioc_refresh_params(ioc, true);
2380 spin_unlock_irq(&ioc->lock);
2381
2382 put_disk_and_module(disk);
2383 return nbytes;
2384einval:
2385 ret = -EINVAL;
2386err:
2387 put_disk_and_module(disk);
2388 return ret;
2389}
2390
2391static u64 ioc_cost_model_prfill(struct seq_file *sf,
2392 struct blkg_policy_data *pd, int off)
2393{
2394 const char *dname = blkg_dev_name(pd->blkg);
2395 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2396 u64 *u = ioc->params.i_lcoefs;
2397
2398 if (!dname)
2399 return 0;
2400
2401 seq_printf(sf, "%s ctrl=%s model=linear "
2402 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2403 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2404 dname, ioc->user_cost_model ? "user" : "auto",
2405 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2406 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2407 return 0;
2408}
2409
2410static int ioc_cost_model_show(struct seq_file *sf, void *v)
2411{
2412 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2413
2414 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2415 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2416 return 0;
2417}
2418
2419static const match_table_t cost_ctrl_tokens = {
2420 { COST_CTRL, "ctrl=%s" },
2421 { COST_MODEL, "model=%s" },
2422 { NR_COST_CTRL_PARAMS, NULL },
2423};
2424
2425static const match_table_t i_lcoef_tokens = {
2426 { I_LCOEF_RBPS, "rbps=%u" },
2427 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2428 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2429 { I_LCOEF_WBPS, "wbps=%u" },
2430 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2431 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2432 { NR_I_LCOEFS, NULL },
2433};
2434
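/*
 * io.cost.model takes a device followed by ctrl=, model= and the linear
 * coefficients from the token table above.  A hypothetical example (device
 * number and coefficients are illustrative only; the w* keys take the same
 * form):
 *
 *	# echo "8:16 rbps=200000000 rseqiops=50000 rrandiops=10000" \
 *		> io.cost.model
 */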
2435static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2436 size_t nbytes, loff_t off)
2437{
2438 struct gendisk *disk;
2439 struct ioc *ioc;
2440 u64 u[NR_I_LCOEFS];
2441 bool user;
2442 char *p;
2443 int ret;
2444
2445 disk = blkcg_conf_get_disk(&input);
2446 if (IS_ERR(disk))
2447 return PTR_ERR(disk);
2448
2449 ioc = q_to_ioc(disk->queue);
2450 if (!ioc) {
2451 ret = blk_iocost_init(disk->queue);
2452 if (ret)
2453 goto err;
2454 ioc = q_to_ioc(disk->queue);
2455 }
2456
2457 spin_lock_irq(&ioc->lock);
2458 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2459 user = ioc->user_cost_model;
2460 spin_unlock_irq(&ioc->lock);
2461
2462 while ((p = strsep(&input, " \t\n"))) {
2463 substring_t args[MAX_OPT_ARGS];
2464 char buf[32];
2465 int tok;
2466 u64 v;
2467
2468 if (!*p)
2469 continue;
2470
2471 switch (match_token(p, cost_ctrl_tokens, args)) {
2472 case COST_CTRL:
2473 match_strlcpy(buf, &args[0], sizeof(buf));
2474 if (!strcmp(buf, "auto"))
2475 user = false;
2476 else if (!strcmp(buf, "user"))
2477 user = true;
2478 else
2479 goto einval;
2480 continue;
2481 case COST_MODEL:
2482 match_strlcpy(buf, &args[0], sizeof(buf));
2483 if (strcmp(buf, "linear"))
2484 goto einval;
2485 continue;
2486 }
2487
2488 tok = match_token(p, i_lcoef_tokens, args);
2489 if (tok == NR_I_LCOEFS)
2490 goto einval;
2491 if (match_u64(&args[0], &v))
2492 goto einval;
2493 u[tok] = v;
2494 user = true;
2495 }
2496
2497 spin_lock_irq(&ioc->lock);
2498 if (user) {
2499 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2500 ioc->user_cost_model = true;
2501 } else {
2502 ioc->user_cost_model = false;
2503 }
2504 ioc_refresh_params(ioc, true);
2505 spin_unlock_irq(&ioc->lock);
2506
2507 put_disk_and_module(disk);
2508 return nbytes;
2509
2510einval:
2511 ret = -EINVAL;
2512err:
2513 put_disk_and_module(disk);
2514 return ret;
2515}
2516
2517static struct cftype ioc_files[] = {
2518 {
2519 .name = "weight",
2520 .flags = CFTYPE_NOT_ON_ROOT,
2521 .seq_show = ioc_weight_show,
2522 .write = ioc_weight_write,
2523 },
2524 {
2525 .name = "cost.qos",
2526 .flags = CFTYPE_ONLY_ON_ROOT,
2527 .seq_show = ioc_qos_show,
2528 .write = ioc_qos_write,
2529 },
2530 {
2531 .name = "cost.model",
2532 .flags = CFTYPE_ONLY_ON_ROOT,
2533 .seq_show = ioc_cost_model_show,
2534 .write = ioc_cost_model_write,
2535 },
2536 {}
2537};
2538
2539static struct blkcg_policy blkcg_policy_iocost = {
2540 .dfl_cftypes = ioc_files,
2541 .cpd_alloc_fn = ioc_cpd_alloc,
2542 .cpd_free_fn = ioc_cpd_free,
2543 .pd_alloc_fn = ioc_pd_alloc,
2544 .pd_init_fn = ioc_pd_init,
2545 .pd_free_fn = ioc_pd_free,
2546};
2547
2548static int __init ioc_init(void)
2549{
2550 return blkcg_policy_register(&blkcg_policy_iocost);
2551}
2552
2553static void __exit ioc_exit(void)
2554{
2555 return blkcg_policy_unregister(&blkcg_policy_iocost);
2556}
2557
2558module_init(ioc_init);
2559module_exit(ioc_exit);