Blame - block/blk-iocost.c - SHIFTPHONES/kernel/common

blob: 3ab0c1c704b68f0a9d5574d2a76ea252f3cc950d [file] [log] [blame]

Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0
				2	*
				3	* IO cost model based controller.
				4	*
				5	* Copyright (C) 2019 Tejun Heo <tj@kernel.org>
				6	* Copyright (C) 2019 Andy Newell <newella@fb.com>
				7	* Copyright (C) 2019 Facebook
				8	*
				9	* One challenge of controlling IO resources is the lack of trivially
				10	* observable cost metric. This is distinguished from CPU and memory where
				11	* wallclock time and the number of bytes can serve as accurate enough
				12	* approximations.
				13	*
				14	* Bandwidth and iops are the most commonly used metrics for IO devices but
				15	* depending on the type and specifics of the device, different IO patterns
				16	* easily lead to multiple orders of magnitude variations rendering them
				17	* useless for the purpose of IO capacity distribution. While on-device
				18	* time, with a lot of crutches, could serve as a useful approximation for
				19	* non-queued rotational devices, this is no longer viable with modern
				20	* devices, even the rotational ones.
				21	*
				22	* While there is no cost metric we can trivially observe, it isn't a
				23	* complete mystery. For example, on a rotational device, seek cost
				24	* dominates while a contiguous transfer contributes a smaller amount
				25	* proportional to the size. If we can characterize at least the relative
				26	* costs of these different types of IOs, it should be possible to
				27	* implement a reasonable work-conserving proportional IO resource
				28	* distribution.
				29	*
				30	* 1. IO Cost Model
				31	*
				32	* IO cost model estimates the cost of an IO given its basic parameters and
				33	* history (e.g. the end sector of the last IO). The cost is measured in
				34	* device time. If a given IO is estimated to cost 10ms, the device should
				35	* be able to process ~100 of those IOs in a second.
				36	*
				37	* Currently, there's only one builtin cost model - linear. Each IO is
				38	* classified as sequential or random and given a base cost accordingly.
				39	* On top of that, a size cost proportional to the length of the IO is
				40	* added. While simple, this model captures the operational
				41	* characteristics of a wide variety of devices well enough. Default
				42	* parameters for several different classes of devices are provided and the
				43	* parameters can be configured from userspace via
				44	* /sys/fs/cgroup/io.cost.model.
				45	*
				46	* If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
				47	* device-specific coefficients.
				48	*
				49	* 2. Control Strategy
				50	*
				51	* The device virtual time (vtime) is used as the primary control metric.
				52	* The control strategy is composed of the following three parts.
				53	*
				54	* 2-1. Vtime Distribution
				55	*
				56	* When a cgroup becomes active in terms of IOs, its hierarchical share is
				57	* calculated. Please consider the following hierarchy where the numbers
				58	* inside parentheses denote the configured weights.
				59	*
				60	*           root
				61	*         /       \
				62	*    A (w:100)  B (w:300)
				63	*    /       \
				64	*  A0 (w:100)  A1 (w:100)
				65	*
				66	* If B is idle and only A0 and A1 are actively issuing IOs, as the two are
				67	* of equal weight, each gets 50% share. If then B starts issuing IOs, B
				68	* gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
				69	* 12.5% each. The distribution mechanism only cares about these flattened
				70	* shares. They're called hweights (hierarchical weights) and always add
				71	* up to 1 (HWEIGHT_WHOLE).
				72	*
				73	* A given cgroup's vtime runs slower in inverse proportion to its hweight.
				74	* For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
				75	* against the device vtime - an IO which takes 10ms on the underlying
				76	* device is considered to take 80ms on A0.
				77	*
				78	* This constitutes the basis of IO capacity distribution. Each cgroup's
				79	* vtime is running at a rate determined by its hweight. A cgroup tracks
				80	* the vtime consumed by past IOs and can issue a new IO iff doing so
				81	* wouldn't outrun the current device vtime. Otherwise, the IO is
				82	* suspended until the vtime has progressed enough to cover it.
				83	*
				84	* 2-2. Vrate Adjustment
				85	*
				86	* It's unrealistic to expect the cost model to be perfect. There are too
				87	* many devices and even on the same device the overall performance
				88	* fluctuates depending on numerous factors such as IO mixture and device
				89	* internal garbage collection. The controller needs to adapt dynamically.
				90	*
				91	* This is achieved by adjusting the overall IO rate according to how busy
				92	* the device is. If the device becomes overloaded, we're sending down too
				93	* many IOs and should generally slow down. If there are waiting issuers
				94	* but the device isn't saturated, we're issuing too few and should
				95	* generally speed up.
				96	*
				97	* To slow down, we lower the vrate - the rate at which the device vtime
				98	* passes compared to the wall clock. For example, if the vtime is running
				99	* at the vrate of 75%, all cgroups added up would only be able to issue
				100	* 750ms worth of IOs per second, and vice-versa for speeding up.
				101	*
				102	* Device business is determined using two criteria - rq wait and
				103	* completion latencies.
				104	*
				105	* When a device gets saturated, the on-device and then the request queues
				106	* fill up and a bio which is ready to be issued has to wait for a request
				107	* to become available. When this delay becomes noticeable, it's a clear
				108	* indication that the device is saturated and we lower the vrate. This
				109	* saturation signal is fairly conservative as it only triggers when both
				110	* hardware and software queues are filled up, and is used as the default
				111	* busy signal.
				112	*
				113	* As devices can have deep queues and be unfair in how the queued commands
				114	* are executed, solely depending on rq wait may not result in satisfactory
				115	* control quality. For a better control quality, completion latency QoS
				116	* parameters can be configured so that the device is considered saturated
				117	* if N'th percentile completion latency rises above the set point.
				118	*
				119	* The completion latency requirements are a function of both the
				120	* underlying device characteristics and the desired IO latency quality of
				121	* service. There is an inherent trade-off - the tighter the latency QoS,
				122	* the higher the bandwidth lossage. Latency QoS is disabled by default
				123	* and can be set through /sys/fs/cgroup/io.cost.qos.
				124	*
				125	* 2-3. Work Conservation
				126	*
				127	* Imagine two cgroups A and B with equal weights. A is issuing a small IO
				128	* periodically while B is sending out enough parallel IOs to saturate the
				129	* device on its own. Let's say A's usage amounts to 100ms worth of IO
				130	* cost per second, i.e., 10% of the device capacity. The naive
				131	* distribution of half and half would lead to 60% utilization of the
				132	* device, a significant reduction in the total amount of work done
				133	* compared to free-for-all competition. This is too high a cost to pay
				134	* for IO control.
				135	*
				136	* To conserve the total amount of work done, we keep track of how much
				137	* each active cgroup is actually using and yield part of its weight if
				138	* there are other cgroups which can make use of it. In the above case,
				139	* A's weight will be lowered so that it hovers above the actual usage and
				140	* B would be able to use the rest.
				141	*
				142	* As we don't want to penalize a cgroup for donating its weight, the
				143	* surplus weight adjustment factors in a margin and has an immediate
				144	* snapback mechanism in case the cgroup needs more IO vtime for itself.
				145	*
				146	* Note that adjusting down surplus weights has the same effects as
				147	* accelerating vtime for other cgroups and work conservation can also be
				148	* implemented by adjusting vrate dynamically. However, squaring who can
				149	* donate and should take back how much requires hweight propagations
				150	* anyway making it easier to implement and understand as a separate
				151	* mechanism.
Tejun Heo	6954ff1	2019-08-28 15:05:59 -0700	[diff] [blame]	152	*
				153	* 3. Monitoring
				154	*
				155	* Instead of debugfs or other clumsy monitoring mechanisms, this
				156	* controller uses a drgn based monitoring script -
				157	* tools/cgroup/iocost_monitor.py. For details on drgn, please see
				158	* https://github.com/osandov/drgn. The output looks like the following.
				159	*
				160	* sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
Tejun Heo	7c1ee70	2019-09-04 12:45:56 -0700	[diff] [blame]	161	* active weight hweight% inflt% dbt delay usages%
				162	* test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
				163	* test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
Tejun Heo	6954ff1	2019-08-28 15:05:59 -0700	[diff] [blame]	164	*
				165	* - per : Timer period
				166	* - cur_per : Internal wall and device vtime clock
				167	* - vrate : Device virtual time rate against wall clock
				168	* - weight : Surplus-adjusted and configured weights
				169	* - hweight : Surplus-adjusted and configured hierarchical weights
				170	* - inflt : The percentage of in-flight IO cost at the end of last period
				171	* - del_ms : Deferred issuer delay induction level and duration
				172	* - usages : Usage history
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	173	*/
				174
				175	#include <linux/kernel.h>
				176	#include <linux/module.h>
				177	#include <linux/timer.h>
				178	#include <linux/time64.h>
				179	#include <linux/parser.h>
				180	#include <linux/sched/signal.h>
				181	#include <linux/blk-cgroup.h>
				182	#include "blk-rq-qos.h"
				183	#include "blk-stat.h"
				184	#include "blk-wbt.h"
				185
				186	#ifdef CONFIG_TRACEPOINTS
				187
				188	/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
				189	#define TRACE_IOCG_PATH_LEN 1024
				190	static DEFINE_SPINLOCK(trace_iocg_path_lock);
				191	static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
				192
				193	#define TRACE_IOCG_PATH(type, iocg, ...) \
				194	do { \
				195	unsigned long flags; \
				196	if (trace_iocost_##type##_enabled()) { \
				197	spin_lock_irqsave(&trace_iocg_path_lock, flags); \
				198	cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
				199	trace_iocg_path, TRACE_IOCG_PATH_LEN); \
				200	trace_iocost_##type(iocg, trace_iocg_path, \
				201	##__VA_ARGS__); \
				202	spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
				203	} \
				204	} while (0)
				205
				206	#else /* CONFIG_TRACE_POINTS */
				207	#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
				208	#endif /* CONFIG_TRACE_POINTS */
				209
				210	enum {
				211	MILLION = 1000000,
				212
				213	/* timer period is calculated from latency requirements, bound it */
				214	MIN_PERIOD = USEC_PER_MSEC,
				215	MAX_PERIOD = USEC_PER_SEC,
				216
				217	/*
				218	* A cgroup's vtime can run 50% behind the device vtime, which
				219	* serves as its IO credit buffer. Surplus weight adjustment is
				220	* immediately canceled if the vtime margin runs below 10%.
				221	*/
				222	MARGIN_PCT = 50,
				223	INUSE_MARGIN_PCT = 10,
				224
				225	/* Have some play in waitq timer operations */
				226	WAITQ_TIMER_MARGIN_PCT = 5,
				227
				228	/*
				229	* vtime can wrap well within a reasonable uptime when vrate is
				230	* consistently raised. Don't trust recorded cgroup vtime if the
				231	* period counter indicates that it's older than 5mins.
				232	*/
				233	VTIME_VALID_DUR = 300 * USEC_PER_SEC,
				234
				235	/*
				236	* Remember the past three non-zero usages and use the max for
				237	* surplus calculation. Three slots guarantee that we remember one
				238	* full period usage from the last active stretch even after
				239	* partial deactivation and re-activation periods. Don't start
				240	* giving away weight before collecting two data points to prevent
				241	* hweight adjustments based on one partial activation period.
				242	*/
				243	NR_USAGE_SLOTS = 3,
				244	MIN_VALID_USAGES = 2,
				245
				246	/* 1/64k is granular enough and can easily be handled w/ u32 */
				247	HWEIGHT_WHOLE = 1 << 16,
				248
				249	/*
				250	* As vtime is used to calculate the cost of each IO, it needs to
				251	* be fairly high precision. For example, it should be able to
				252	* represent the cost of a single page worth of discard with
				253	* suffificient accuracy. At the same time, it should be able to
				254	* represent reasonably long enough durations to be useful and
				255	* convenient during operation.
				256	*
				257	* 1s worth of vtime is 2^37. This gives us both sub-nanosecond
				258	* granularity and days of wrap-around time even at extreme vrates.
				259	*/
				260	VTIME_PER_SEC_SHIFT = 37,
				261	VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
				262	VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
				263
				264	/* bound vrate adjustments within two orders of magnitude */
				265	VRATE_MIN_PPM = 10000, /* 1% */
				266	VRATE_MAX_PPM = 100000000, /* 10000% */
				267
				268	VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
				269	VRATE_CLAMP_ADJ_PCT = 4,
				270
				271	/* if IOs end up waiting for requests, issue less */
				272	RQ_WAIT_BUSY_PCT = 5,
				273
				274	/* unbusy hysterisis */
				275	UNBUSY_THR_PCT = 75,
				276
				277	/* don't let cmds which take a very long time pin lagging for too long */
				278	MAX_LAGGING_PERIODS = 10,
				279
				280	/*
				281	* If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
				282	* donate the surplus.
				283	*/
				284	SURPLUS_SCALE_PCT = 125, /* * 125% */
				285	SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
				286	SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
				287
				288	/* switch iff the conditions are met for longer than this */
				289	AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
				290
				291	/*
				292	* Count IO size in 4k pages. The 12bit shift helps keeping
				293	* size-proportional components of cost calculation in closer
				294	* numbers of digits to per-IO cost components.
				295	*/
				296	IOC_PAGE_SHIFT = 12,
				297	IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
				298	IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
				299
				300	/* if apart further than 16M, consider randio for linear model */
				301	LCOEF_RANDIO_PAGES = 4096,
				302	};
				303
				304	enum ioc_running {
				305	IOC_IDLE,
				306	IOC_RUNNING,
				307	IOC_STOP,
				308	};
				309
				310	/* io.cost.qos controls including per-dev enable of the whole controller */
				311	enum {
				312	QOS_ENABLE,
				313	QOS_CTRL,
				314	NR_QOS_CTRL_PARAMS,
				315	};
				316
				317	/* io.cost.qos params */
				318	enum {
				319	QOS_RPPM,
				320	QOS_RLAT,
				321	QOS_WPPM,
				322	QOS_WLAT,
				323	QOS_MIN,
				324	QOS_MAX,
				325	NR_QOS_PARAMS,
				326	};
				327
				328	/* io.cost.model controls */
				329	enum {
				330	COST_CTRL,
				331	COST_MODEL,
				332	NR_COST_CTRL_PARAMS,
				333	};
				334
				335	/* builtin linear cost model coefficients */
				336	enum {
				337	I_LCOEF_RBPS,
				338	I_LCOEF_RSEQIOPS,
				339	I_LCOEF_RRANDIOPS,
				340	I_LCOEF_WBPS,
				341	I_LCOEF_WSEQIOPS,
				342	I_LCOEF_WRANDIOPS,
				343	NR_I_LCOEFS,
				344	};
				345
				346	enum {
				347	LCOEF_RPAGE,
				348	LCOEF_RSEQIO,
				349	LCOEF_RRANDIO,
				350	LCOEF_WPAGE,
				351	LCOEF_WSEQIO,
				352	LCOEF_WRANDIO,
				353	NR_LCOEFS,
				354	};
				355
				356	enum {
				357	AUTOP_INVALID,
				358	AUTOP_HDD,
				359	AUTOP_SSD_QD1,
				360	AUTOP_SSD_DFL,
				361	AUTOP_SSD_FAST,
				362	};
				363
				364	struct ioc_gq;
				365
				366	struct ioc_params {
				367	u32 qos[NR_QOS_PARAMS];
				368	u64 i_lcoefs[NR_I_LCOEFS];
				369	u64 lcoefs[NR_LCOEFS];
				370	u32 too_fast_vrate_pct;
				371	u32 too_slow_vrate_pct;
				372	};
				373
				374	struct ioc_missed {
				375	u32 nr_met;
				376	u32 nr_missed;
				377	u32 last_met;
				378	u32 last_missed;
				379	};
				380
				381	struct ioc_pcpu_stat {
				382	struct ioc_missed missed[2];
				383
				384	u64 rq_wait_ns;
				385	u64 last_rq_wait_ns;
				386	};
				387
				388	/* per device */
				389	struct ioc {
				390	struct rq_qos rqos;
				391
				392	bool enabled;
				393
				394	struct ioc_params params;
				395	u32 period_us;
				396	u32 margin_us;
				397	u64 vrate_min;
				398	u64 vrate_max;
				399
				400	spinlock_t lock;
				401	struct timer_list timer;
				402	struct list_head active_iocgs; /* active cgroups */
				403	struct ioc_pcpu_stat __percpu *pcpu_stat;
				404
				405	enum ioc_running running;
				406	atomic64_t vtime_rate;
				407
				408	seqcount_t period_seqcount;
				409	u32 period_at; /* wallclock starttime */
				410	u64 period_at_vtime; /* vtime starttime */
				411
				412	atomic64_t cur_period; /* inc'd each period */
				413	int busy_level; /* saturation history */
				414
				415	u64 inuse_margin_vtime;
				416	bool weights_updated;
				417	atomic_t hweight_gen; /* for lazy hweights */
				418
				419	u64 autop_too_fast_at;
				420	u64 autop_too_slow_at;
				421	int autop_idx;
				422	bool user_qos_params:1;
				423	bool user_cost_model:1;
				424	};
				425
				426	/* per device-cgroup pair */
				427	struct ioc_gq {
				428	struct blkg_policy_data pd;
				429	struct ioc *ioc;
				430
				431	/*
				432	* A iocg can get its weight from two sources - an explicit
				433	* per-device-cgroup configuration or the default weight of the
				434	* cgroup. `cfg_weight` is the explicit per-device-cgroup
				435	* configuration. `weight` is the effective considering both
				436	* sources.
				437	*
				438	* When an idle cgroup becomes active its `active` goes from 0 to
				439	* `weight`. `inuse` is the surplus adjusted active weight.
				440	* `active` and `inuse` are used to calculate `hweight_active` and
				441	* `hweight_inuse`.
				442	*
				443	* `last_inuse` remembers `inuse` while an iocg is idle to persist
				444	* surplus adjustments.
				445	*/
				446	u32 cfg_weight;
				447	u32 weight;
				448	u32 active;
				449	u32 inuse;
				450	u32 last_inuse;
				451
				452	sector_t cursor; /* to detect randio */
				453
				454	/*
				455	* `vtime` is this iocg's vtime cursor which progresses as IOs are
				456	* issued. If lagging behind device vtime, the delta represents
				457	* the currently available IO budget. If runnning ahead, the
				458	* overage.
				459	*
				460	* `vtime_done` is the same but progressed on completion rather
				461	* than issue. The delta behind `vtime` represents the cost of
				462	* currently in-flight IOs.
				463	*
				464	* `last_vtime` is used to remember `vtime` at the end of the last
				465	* period to calculate utilization.
				466	*/
				467	atomic64_t vtime;
				468	atomic64_t done_vtime;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	469	atomic64_t abs_vdebt;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	470	u64 last_vtime;
				471
				472	/*
				473	* The period this iocg was last active in. Used for deactivation
				474	* and invalidating `vtime`.
				475	*/
				476	atomic64_t active_period;
				477	struct list_head active_list;
				478
				479	/* see __propagate_active_weight() and current_hweight() for details */
				480	u64 child_active_sum;
				481	u64 child_inuse_sum;
				482	int hweight_gen;
				483	u32 hweight_active;
				484	u32 hweight_inuse;
				485	bool has_surplus;
				486
				487	struct wait_queue_head waitq;
				488	struct hrtimer waitq_timer;
				489	struct hrtimer delay_timer;
				490
				491	/* usage is recorded as fractions of HWEIGHT_WHOLE */
				492	int usage_idx;
				493	u32 usages[NR_USAGE_SLOTS];
				494
				495	/* this iocg's depth in the hierarchy and ancestors including self */
				496	int level;
				497	struct ioc_gq *ancestors[];
				498	};
				499
				500	/* per cgroup */
				501	struct ioc_cgrp {
				502	struct blkcg_policy_data cpd;
				503	unsigned int dfl_weight;
				504	};
				505
				506	struct ioc_now {
				507	u64 now_ns;
				508	u32 now;
				509	u64 vnow;
				510	u64 vrate;
				511	};
				512
				513	struct iocg_wait {
				514	struct wait_queue_entry wait;
				515	struct bio *bio;
				516	u64 abs_cost;
				517	bool committed;
				518	};
				519
				520	struct iocg_wake_ctx {
				521	struct ioc_gq *iocg;
				522	u32 hw_inuse;
				523	s64 vbudget;
				524	};
				525
				526	static const struct ioc_params autop[] = {
				527	[AUTOP_HDD] = {
				528	.qos = {
Tejun Heo	7afccca	2019-09-25 16:03:35 -0700	[diff] [blame]	529	[QOS_RLAT] = 250000, /* 250ms */
				530	[QOS_WLAT] = 250000,
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	531	[QOS_MIN] = VRATE_MIN_PPM,
				532	[QOS_MAX] = VRATE_MAX_PPM,
				533	},
				534	.i_lcoefs = {
				535	[I_LCOEF_RBPS] = 174019176,
				536	[I_LCOEF_RSEQIOPS] = 41708,
				537	[I_LCOEF_RRANDIOPS] = 370,
				538	[I_LCOEF_WBPS] = 178075866,
				539	[I_LCOEF_WSEQIOPS] = 42705,
				540	[I_LCOEF_WRANDIOPS] = 378,
				541	},
				542	},
				543	[AUTOP_SSD_QD1] = {
				544	.qos = {
				545	[QOS_RLAT] = 25000, /* 25ms */
				546	[QOS_WLAT] = 25000,
				547	[QOS_MIN] = VRATE_MIN_PPM,
				548	[QOS_MAX] = VRATE_MAX_PPM,
				549	},
				550	.i_lcoefs = {
				551	[I_LCOEF_RBPS] = 245855193,
				552	[I_LCOEF_RSEQIOPS] = 61575,
				553	[I_LCOEF_RRANDIOPS] = 6946,
				554	[I_LCOEF_WBPS] = 141365009,
				555	[I_LCOEF_WSEQIOPS] = 33716,
				556	[I_LCOEF_WRANDIOPS] = 26796,
				557	},
				558	},
				559	[AUTOP_SSD_DFL] = {
				560	.qos = {
				561	[QOS_RLAT] = 25000, /* 25ms */
				562	[QOS_WLAT] = 25000,
				563	[QOS_MIN] = VRATE_MIN_PPM,
				564	[QOS_MAX] = VRATE_MAX_PPM,
				565	},
				566	.i_lcoefs = {
				567	[I_LCOEF_RBPS] = 488636629,
				568	[I_LCOEF_RSEQIOPS] = 8932,
				569	[I_LCOEF_RRANDIOPS] = 8518,
				570	[I_LCOEF_WBPS] = 427891549,
				571	[I_LCOEF_WSEQIOPS] = 28755,
				572	[I_LCOEF_WRANDIOPS] = 21940,
				573	},
				574	.too_fast_vrate_pct = 500,
				575	},
				576	[AUTOP_SSD_FAST] = {
				577	.qos = {
				578	[QOS_RLAT] = 5000, /* 5ms */
				579	[QOS_WLAT] = 5000,
				580	[QOS_MIN] = VRATE_MIN_PPM,
				581	[QOS_MAX] = VRATE_MAX_PPM,
				582	},
				583	.i_lcoefs = {
				584	[I_LCOEF_RBPS] = 3102524156LLU,
				585	[I_LCOEF_RSEQIOPS] = 724816,
				586	[I_LCOEF_RRANDIOPS] = 778122,
				587	[I_LCOEF_WBPS] = 1742780862LLU,
				588	[I_LCOEF_WSEQIOPS] = 425702,
				589	[I_LCOEF_WRANDIOPS] = 443193,
				590	},
				591	.too_slow_vrate_pct = 10,
				592	},
				593	};
				594
				595	/*
				596	* vrate adjust percentages indexed by ioc->busy_level. We adjust up on
				597	* vtime credit shortage and down on device saturation.
				598	*/
				599	static u32 vrate_adj_pct[] =
				600	{ 0, 0, 0, 0,
				601	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				602	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				603	4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
				604
				605	static struct blkcg_policy blkcg_policy_iocost;
				606
				607	/* accessors and helpers */
				608	static struct ioc rqos_to_ioc(struct rq_qos rqos)
				609	{
				610	return container_of(rqos, struct ioc, rqos);
				611	}
				612
				613	static struct ioc q_to_ioc(struct request_queue q)
				614	{
				615	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
				616	}
				617
				618	static const char q_name(struct request_queue q)
				619	{
				620	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
				621	return kobject_name(q->kobj.parent);
				622	else
				623	return "<unknown>";
				624	}
				625
				626	static const char __maybe_unused ioc_name(struct ioc ioc)
				627	{
				628	return q_name(ioc->rqos.q);
				629	}
				630
				631	static struct ioc_gq pd_to_iocg(struct blkg_policy_data pd)
				632	{
				633	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
				634	}
				635
				636	static struct ioc_gq blkg_to_iocg(struct blkcg_gq blkg)
				637	{
				638	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
				639	}
				640
				641	static struct blkcg_gq iocg_to_blkg(struct ioc_gq iocg)
				642	{
				643	return pd_to_blkg(&iocg->pd);
				644	}
				645
				646	static struct ioc_cgrp blkcg_to_iocc(struct blkcg blkcg)
				647	{
				648	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
				649	struct ioc_cgrp, cpd);
				650	}
				651
				652	/*
				653	* Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	654	* weight, the more expensive each IO. Must round up.
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	655	*/
				656	static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
				657	{
				658	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
				659	}
				660
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	661	/*
				662	* The inverse of abs_cost_to_cost(). Must round up.
				663	*/
				664	static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
				665	{
				666	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
				667	}
				668
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	669	static void iocg_commit_bio(struct ioc_gq iocg, struct bio bio, u64 cost)
				670	{
				671	bio->bi_iocost_cost = cost;
				672	atomic64_add(cost, &iocg->vtime);
				673	}
				674
				675	#define CREATE_TRACE_POINTS
				676	#include <trace/events/iocost.h>
				677
				678	/* latency Qos params changed, update period_us and all the dependent params */
				679	static void ioc_refresh_period_us(struct ioc *ioc)
				680	{
				681	u32 ppm, lat, multi, period_us;
				682
				683	lockdep_assert_held(&ioc->lock);
				684
				685	/* pick the higher latency target */
				686	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
				687	ppm = ioc->params.qos[QOS_RPPM];
				688	lat = ioc->params.qos[QOS_RLAT];
				689	} else {
				690	ppm = ioc->params.qos[QOS_WPPM];
				691	lat = ioc->params.qos[QOS_WLAT];
				692	}
				693
				694	/*
				695	* We want the period to be long enough to contain a healthy number
				696	* of IOs while short enough for granular control. Define it as a
				697	* multiple of the latency target. Ideally, the multiplier should
				698	* be scaled according to the percentile so that it would nominally
				699	* contain a certain number of requests. Let's be simpler and
				700	* scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
				701	*/
				702	if (ppm)
				703	multi = max_t(u32, (MILLION - ppm) / 50000, 2);
				704	else
				705	multi = 2;
				706	period_us = multi * lat;
				707	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
				708
				709	/* calculate dependent params */
				710	ioc->period_us = period_us;
				711	ioc->margin_us = period_us * MARGIN_PCT / 100;
				712	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
				713	period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
				714	}
				715
				716	static int ioc_autop_idx(struct ioc *ioc)
				717	{
				718	int idx = ioc->autop_idx;
				719	const struct ioc_params *p = &autop[idx];
				720	u32 vrate_pct;
				721	u64 now_ns;
				722
				723	/* rotational? */
				724	if (!blk_queue_nonrot(ioc->rqos.q))
				725	return AUTOP_HDD;
				726
				727	/* handle SATA SSDs w/ broken NCQ */
				728	if (blk_queue_depth(ioc->rqos.q) == 1)
				729	return AUTOP_SSD_QD1;
				730
				731	/* use one of the normal ssd sets */
				732	if (idx < AUTOP_SSD_DFL)
				733	return AUTOP_SSD_DFL;
				734
				735	/* if user is overriding anything, maintain what was there */
				736	if (ioc->user_qos_params \|\| ioc->user_cost_model)
				737	return idx;
				738
				739	/* step up/down based on the vrate */
				740	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
				741	VTIME_PER_USEC);
				742	now_ns = ktime_get_ns();
				743
				744	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
				745	if (!ioc->autop_too_fast_at)
				746	ioc->autop_too_fast_at = now_ns;
				747	if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
				748	return idx + 1;
				749	} else {
				750	ioc->autop_too_fast_at = 0;
				751	}
				752
				753	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
				754	if (!ioc->autop_too_slow_at)
				755	ioc->autop_too_slow_at = now_ns;
				756	if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
				757	return idx - 1;
				758	} else {
				759	ioc->autop_too_slow_at = 0;
				760	}
				761
				762	return idx;
				763	}
				764
				765	/*
				766	* Take the followings as input
				767	*
				768	* @bps maximum sequential throughput
				769	* @seqiops maximum sequential 4k iops
				770	* @randiops maximum random 4k iops
				771	*
				772	* and calculate the linear model cost coefficients.
				773	*
				774	* *@page per-page cost 1s / (@bps / 4096)
				775	* @seqio base cost of a seq IO max((1s / @seqiops) - @page, 0)
				776	* @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
				777	*/
				778	static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
				779	u64 page, u64 seqio, u64 *randio)
				780	{
				781	u64 v;
				782
				783	page = seqio = *randio = 0;
				784
				785	if (bps)
				786	*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
				787	DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
				788
				789	if (seqiops) {
				790	v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
				791	if (v > *page)
				792	seqio = v - page;
				793	}
				794
				795	if (randiops) {
				796	v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
				797	if (v > *page)
				798	randio = v - page;
				799	}
				800	}
				801
				802	static void ioc_refresh_lcoefs(struct ioc *ioc)
				803	{
				804	u64 *u = ioc->params.i_lcoefs;
				805	u64 *c = ioc->params.lcoefs;
				806
				807	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
				808	&c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
				809	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
				810	&c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
				811	}
				812
				813	static bool ioc_refresh_params(struct ioc *ioc, bool force)
				814	{
				815	const struct ioc_params *p;
				816	int idx;
				817
				818	lockdep_assert_held(&ioc->lock);
				819
				820	idx = ioc_autop_idx(ioc);
				821	p = &autop[idx];
				822
				823	if (idx == ioc->autop_idx && !force)
				824	return false;
				825
				826	if (idx != ioc->autop_idx)
				827	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
				828
				829	ioc->autop_idx = idx;
				830	ioc->autop_too_fast_at = 0;
				831	ioc->autop_too_slow_at = 0;
				832
				833	if (!ioc->user_qos_params)
				834	memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
				835	if (!ioc->user_cost_model)
				836	memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
				837
				838	ioc_refresh_period_us(ioc);
				839	ioc_refresh_lcoefs(ioc);
				840
				841	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
				842	VTIME_PER_USEC, MILLION);
				843	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
				844	VTIME_PER_USEC, MILLION);
				845
				846	return true;
				847	}
				848
				849	/* take a snapshot of the current [v]time and vrate */
				850	static void ioc_now(struct ioc ioc, struct ioc_now now)
				851	{
				852	unsigned seq;
				853
				854	now->now_ns = ktime_get();
				855	now->now = ktime_to_us(now->now_ns);
				856	now->vrate = atomic64_read(&ioc->vtime_rate);
				857
				858	/*
				859	* The current vtime is
				860	*
				861	* vtime at period start + (wallclock time since the start) * vrate
				862	*
				863	* As a consistent snapshot of `period_at_vtime` and `period_at` is
				864	* needed, they're seqcount protected.
				865	*/
				866	do {
				867	seq = read_seqcount_begin(&ioc->period_seqcount);
				868	now->vnow = ioc->period_at_vtime +
				869	(now->now - ioc->period_at) * now->vrate;
				870	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
				871	}
				872
				873	static void ioc_start_period(struct ioc ioc, struct ioc_now now)
				874	{
				875	lockdep_assert_held(&ioc->lock);
				876	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
				877
				878	write_seqcount_begin(&ioc->period_seqcount);
				879	ioc->period_at = now->now;
				880	ioc->period_at_vtime = now->vnow;
				881	write_seqcount_end(&ioc->period_seqcount);
				882
				883	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
				884	add_timer(&ioc->timer);
				885	}
				886
				887	/*
				888	* Update @iocg's `active` and `inuse` to @active and @inuse, update level
				889	* weight sums and propagate upwards accordingly.
				890	*/
				891	static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
				892	{
				893	struct ioc *ioc = iocg->ioc;
				894	int lvl;
				895
				896	lockdep_assert_held(&ioc->lock);
				897
				898	inuse = min(active, inuse);
				899
				900	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
				901	struct ioc_gq *parent = iocg->ancestors[lvl];
				902	struct ioc_gq *child = iocg->ancestors[lvl + 1];
				903	u32 parent_active = 0, parent_inuse = 0;
				904
				905	/* update the level sums */
				906	parent->child_active_sum += (s32)(active - child->active);
				907	parent->child_inuse_sum += (s32)(inuse - child->inuse);
				908	/* apply the udpates */
				909	child->active = active;
				910	child->inuse = inuse;
				911
				912	/*
				913	* The delta between inuse and active sums indicates that
				914	* that much of weight is being given away. Parent's inuse
				915	* and active should reflect the ratio.
				916	*/
				917	if (parent->child_active_sum) {
				918	parent_active = parent->weight;
				919	parent_inuse = DIV64_U64_ROUND_UP(
				920	parent_active * parent->child_inuse_sum,
				921	parent->child_active_sum);
				922	}
				923
				924	/* do we need to keep walking up? */
				925	if (parent_active == parent->active &&
				926	parent_inuse == parent->inuse)
				927	break;
				928
				929	active = parent_active;
				930	inuse = parent_inuse;
				931	}
				932
				933	ioc->weights_updated = true;
				934	}
				935
				936	static void commit_active_weights(struct ioc *ioc)
				937	{
				938	lockdep_assert_held(&ioc->lock);
				939
				940	if (ioc->weights_updated) {
				941	/* paired with rmb in current_hweight(), see there */
				942	smp_wmb();
				943	atomic_inc(&ioc->hweight_gen);
				944	ioc->weights_updated = false;
				945	}
				946	}
				947
				948	static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
				949	{
				950	__propagate_active_weight(iocg, active, inuse);
				951	commit_active_weights(iocg->ioc);
				952	}
				953
				954	static void current_hweight(struct ioc_gq iocg, u32 hw_activep, u32 *hw_inusep)
				955	{
				956	struct ioc *ioc = iocg->ioc;
				957	int lvl;
				958	u32 hwa, hwi;
				959	int ioc_gen;
				960
				961	/* hot path - if uptodate, use cached */
				962	ioc_gen = atomic_read(&ioc->hweight_gen);
				963	if (ioc_gen == iocg->hweight_gen)
				964	goto out;
				965
				966	/*
				967	* Paired with wmb in commit_active_weights(). If we saw the
				968	* updated hweight_gen, all the weight updates from
				969	* __propagate_active_weight() are visible too.
				970	*
				971	* We can race with weight updates during calculation and get it
				972	* wrong. However, hweight_gen would have changed and a future
				973	* reader will recalculate and we're guaranteed to discard the
				974	* wrong result soon.
				975	*/
				976	smp_rmb();
				977
				978	hwa = hwi = HWEIGHT_WHOLE;
				979	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
				980	struct ioc_gq *parent = iocg->ancestors[lvl];
				981	struct ioc_gq *child = iocg->ancestors[lvl + 1];
				982	u32 active_sum = READ_ONCE(parent->child_active_sum);
				983	u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
				984	u32 active = READ_ONCE(child->active);
				985	u32 inuse = READ_ONCE(child->inuse);
				986
				987	/* we can race with deactivations and either may read as zero */
				988	if (!active_sum \|\| !inuse_sum)
				989	continue;
				990
				991	active_sum = max(active, active_sum);
				992	hwa = hwa * active / active_sum; /* max 16bits * 10000 */
				993
				994	inuse_sum = max(inuse, inuse_sum);
				995	hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
				996	}
				997
				998	iocg->hweight_active = max_t(u32, hwa, 1);
				999	iocg->hweight_inuse = max_t(u32, hwi, 1);
				1000	iocg->hweight_gen = ioc_gen;
				1001	out:
				1002	if (hw_activep)
				1003	*hw_activep = iocg->hweight_active;
				1004	if (hw_inusep)
				1005	*hw_inusep = iocg->hweight_inuse;
				1006	}
				1007
				1008	static void weight_updated(struct ioc_gq *iocg)
				1009	{
				1010	struct ioc *ioc = iocg->ioc;
				1011	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
				1012	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
				1013	u32 weight;
				1014
				1015	lockdep_assert_held(&ioc->lock);
				1016
				1017	weight = iocg->cfg_weight ?: iocc->dfl_weight;
				1018	if (weight != iocg->weight && iocg->active)
				1019	propagate_active_weight(iocg, weight,
				1020	DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
				1021	iocg->weight = weight;
				1022	}
				1023
				1024	static bool iocg_activate(struct ioc_gq iocg, struct ioc_now now)
				1025	{
				1026	struct ioc *ioc = iocg->ioc;
				1027	u64 last_period, cur_period, max_period_delta;
				1028	u64 vtime, vmargin, vmin;
				1029	int i;
				1030
				1031	/*
				1032	* If seem to be already active, just update the stamp to tell the
				1033	* timer that we're still active. We don't mind occassional races.
				1034	*/
				1035	if (!list_empty(&iocg->active_list)) {
				1036	ioc_now(ioc, now);
				1037	cur_period = atomic64_read(&ioc->cur_period);
				1038	if (atomic64_read(&iocg->active_period) != cur_period)
				1039	atomic64_set(&iocg->active_period, cur_period);
				1040	return true;
				1041	}
				1042
				1043	/* racy check on internal node IOs, treat as root level IOs */
				1044	if (iocg->child_active_sum)
				1045	return false;
				1046
				1047	spin_lock_irq(&ioc->lock);
				1048
				1049	ioc_now(ioc, now);
				1050
				1051	/* update period */
				1052	cur_period = atomic64_read(&ioc->cur_period);
				1053	last_period = atomic64_read(&iocg->active_period);
				1054	atomic64_set(&iocg->active_period, cur_period);
				1055
				1056	/* already activated or breaking leaf-only constraint? */
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1057	if (!list_empty(&iocg->active_list))
				1058	goto succeed_unlock;
				1059	for (i = iocg->level - 1; i > 0; i--)
				1060	if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1061	goto fail_unlock;
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1062
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1063	if (iocg->child_active_sum)
				1064	goto fail_unlock;
				1065
				1066	/*
				1067	* vtime may wrap when vrate is raised substantially due to
				1068	* underestimated IO costs. Look at the period and ignore its
				1069	* vtime if the iocg has been idle for too long. Also, cap the
				1070	* budget it can start with to the margin.
				1071	*/
				1072	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
				1073	vtime = atomic64_read(&iocg->vtime);
				1074	vmargin = ioc->margin_us * now->vrate;
				1075	vmin = now->vnow - vmargin;
				1076
				1077	if (last_period + max_period_delta < cur_period \|\|
				1078	time_before64(vtime, vmin)) {
				1079	atomic64_add(vmin - vtime, &iocg->vtime);
				1080	atomic64_add(vmin - vtime, &iocg->done_vtime);
				1081	vtime = vmin;
				1082	}
				1083
				1084	/*
				1085	* Activate, propagate weight and start period timer if not
				1086	* running. Reset hweight_gen to avoid accidental match from
				1087	* wrapping.
				1088	*/
				1089	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
				1090	list_add(&iocg->active_list, &ioc->active_iocgs);
				1091	propagate_active_weight(iocg, iocg->weight,
				1092	iocg->last_inuse ?: iocg->weight);
				1093
				1094	TRACE_IOCG_PATH(iocg_activate, iocg, now,
				1095	last_period, cur_period, vtime);
				1096
				1097	iocg->last_vtime = vtime;
				1098
				1099	if (ioc->running == IOC_IDLE) {
				1100	ioc->running = IOC_RUNNING;
				1101	ioc_start_period(ioc, now);
				1102	}
				1103
Jiufei Xue	8b37bc2	2019-11-13 15:21:31 +0800	[diff] [blame]	1104	succeed_unlock:
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1105	spin_unlock_irq(&ioc->lock);
				1106	return true;
				1107
				1108	fail_unlock:
				1109	spin_unlock_irq(&ioc->lock);
				1110	return false;
				1111	}
				1112
				1113	static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
				1114	int flags, void *key)
				1115	{
				1116	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
				1117	struct iocg_wake_ctx ctx = (struct iocg_wake_ctx )key;
				1118	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
				1119
				1120	ctx->vbudget -= cost;
				1121
				1122	if (ctx->vbudget < 0)
				1123	return -1;
				1124
				1125	iocg_commit_bio(ctx->iocg, wait->bio, cost);
				1126
				1127	/*
				1128	* autoremove_wake_function() removes the wait entry only when it
				1129	* actually changed the task state. We want the wait always
				1130	* removed. Remove explicitly and use default_wake_function().
				1131	*/
				1132	list_del_init(&wq_entry->entry);
				1133	wait->committed = true;
				1134
				1135	default_wake_function(wq_entry, mode, flags, key);
				1136	return 0;
				1137	}
				1138
				1139	static void iocg_kick_waitq(struct ioc_gq iocg, struct ioc_now now)
				1140	{
				1141	struct ioc *ioc = iocg->ioc;
				1142	struct iocg_wake_ctx ctx = { .iocg = iocg };
				1143	u64 margin_ns = (u64)(ioc->period_us *
				1144	WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1145	u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
				1146	s64 vbudget;
				1147	u32 hw_inuse;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1148
				1149	lockdep_assert_held(&iocg->waitq.lock);
				1150
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1151	current_hweight(iocg, NULL, &hw_inuse);
				1152	vbudget = now->vnow - atomic64_read(&iocg->vtime);
				1153
				1154	/* pay off debt */
				1155	abs_vdebt = atomic64_read(&iocg->abs_vdebt);
				1156	vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
				1157	if (vdebt && vbudget > 0) {
				1158	u64 delta = min_t(u64, vbudget, vdebt);
				1159	u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
				1160	abs_vdebt);
				1161
				1162	atomic64_add(delta, &iocg->vtime);
				1163	atomic64_add(delta, &iocg->done_vtime);
				1164	atomic64_sub(abs_delta, &iocg->abs_vdebt);
				1165	if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
				1166	atomic64_set(&iocg->abs_vdebt, 0);
				1167	}
				1168
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1169	/*
				1170	* Wake up the ones which are due and see how much vtime we'll need
				1171	* for the next one.
				1172	*/
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1173	ctx.hw_inuse = hw_inuse;
				1174	ctx.vbudget = vbudget - vdebt;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1175	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
				1176	if (!waitqueue_active(&iocg->waitq))
				1177	return;
				1178	if (WARN_ON_ONCE(ctx.vbudget >= 0))
				1179	return;
				1180
				1181	/* determine next wakeup, add a quarter margin to guarantee chunking */
				1182	vshortage = -ctx.vbudget;
				1183	expires = now->now_ns +
				1184	DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
				1185	expires += margin_ns / 4;
				1186
				1187	/* if already active and close enough, don't bother */
				1188	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
				1189	if (hrtimer_is_queued(&iocg->waitq_timer) &&
				1190	abs(oexpires - expires) <= margin_ns / 4)
				1191	return;
				1192
				1193	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
				1194	margin_ns / 4, HRTIMER_MODE_ABS);
				1195	}
				1196
				1197	static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
				1198	{
				1199	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
				1200	struct ioc_now now;
				1201	unsigned long flags;
				1202
				1203	ioc_now(iocg->ioc, &now);
				1204
				1205	spin_lock_irqsave(&iocg->waitq.lock, flags);
				1206	iocg_kick_waitq(iocg, &now);
				1207	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
				1208
				1209	return HRTIMER_NORESTART;
				1210	}
				1211
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1212	static bool iocg_kick_delay(struct ioc_gq iocg, struct ioc_now now, u64 cost)
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1213	{
				1214	struct ioc *ioc = iocg->ioc;
				1215	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
				1216	u64 vtime = atomic64_read(&iocg->vtime);
				1217	u64 vmargin = ioc->margin_us * now->vrate;
				1218	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
				1219	u64 expires, oexpires;
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1220	u32 hw_inuse;
				1221
				1222	/* debt-adjust vtime */
				1223	current_hweight(iocg, NULL, &hw_inuse);
				1224	vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1225
				1226	/* clear or maintain depending on the overage */
				1227	if (time_before_eq64(vtime, now->vnow)) {
				1228	blkcg_clear_delay(blkg);
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1229	return false;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1230	}
				1231	if (!atomic_read(&blkg->use_delay) &&
				1232	time_before_eq64(vtime, now->vnow + vmargin))
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1233	return false;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1234
				1235	/* use delay */
				1236	if (cost) {
				1237	u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
				1238	now->vrate);
				1239	blkcg_add_delay(blkg, now->now_ns, cost_ns);
				1240	}
				1241	blkcg_use_delay(blkg);
				1242
				1243	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
				1244	now->vrate) * NSEC_PER_USEC;
				1245
				1246	/* if already active and close enough, don't bother */
				1247	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
				1248	if (hrtimer_is_queued(&iocg->delay_timer) &&
				1249	abs(oexpires - expires) <= margin_ns / 4)
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1250	return true;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1251
				1252	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
				1253	margin_ns / 4, HRTIMER_MODE_ABS);
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1254	return true;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1255	}
				1256
				1257	static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
				1258	{
				1259	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
				1260	struct ioc_now now;
				1261
				1262	ioc_now(iocg->ioc, &now);
				1263	iocg_kick_delay(iocg, &now, 0);
				1264
				1265	return HRTIMER_NORESTART;
				1266	}
				1267
				1268	static void ioc_lat_stat(struct ioc ioc, u32 missed_ppm_ar, u32 *rq_wait_pct_p)
				1269	{
				1270	u32 nr_met[2] = { };
				1271	u32 nr_missed[2] = { };
				1272	u64 rq_wait_ns = 0;
				1273	int cpu, rw;
				1274
				1275	for_each_online_cpu(cpu) {
				1276	struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
				1277	u64 this_rq_wait_ns;
				1278
				1279	for (rw = READ; rw <= WRITE; rw++) {
				1280	u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
				1281	u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
				1282
				1283	nr_met[rw] += this_met - stat->missed[rw].last_met;
				1284	nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
				1285	stat->missed[rw].last_met = this_met;
				1286	stat->missed[rw].last_missed = this_missed;
				1287	}
				1288
				1289	this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
				1290	rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
				1291	stat->last_rq_wait_ns = this_rq_wait_ns;
				1292	}
				1293
				1294	for (rw = READ; rw <= WRITE; rw++) {
				1295	if (nr_met[rw] + nr_missed[rw])
				1296	missed_ppm_ar[rw] =
				1297	DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
				1298	nr_met[rw] + nr_missed[rw]);
				1299	else
				1300	missed_ppm_ar[rw] = 0;
				1301	}
				1302
				1303	rq_wait_pct_p = div64_u64(rq_wait_ns 100,
				1304	ioc->period_us * NSEC_PER_USEC);
				1305	}
				1306
				1307	/* was iocg idle this period? */
				1308	static bool iocg_is_idle(struct ioc_gq *iocg)
				1309	{
				1310	struct ioc *ioc = iocg->ioc;
				1311
				1312	/* did something get issued this period? */
				1313	if (atomic64_read(&iocg->active_period) ==
				1314	atomic64_read(&ioc->cur_period))
				1315	return false;
				1316
				1317	/* is something in flight? */
Tejun Heo	dcd6589	2020-03-10 13:07:46 -0400	[diff] [blame]	1318	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1319	return false;
				1320
				1321	return true;
				1322	}
				1323
				1324	/* returns usage with margin added if surplus is large enough */
				1325	static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
				1326	{
				1327	/* add margin */
				1328	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
				1329	usage += SURPLUS_SCALE_ABS;
				1330
				1331	/* don't bother if the surplus is too small */
				1332	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
				1333	return 0;
				1334
				1335	return usage;
				1336	}
				1337
				1338	static void ioc_timer_fn(struct timer_list *timer)
				1339	{
				1340	struct ioc *ioc = container_of(timer, struct ioc, timer);
				1341	struct ioc_gq iocg, tiocg;
				1342	struct ioc_now now;
				1343	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
				1344	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
				1345	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
				1346	u32 missed_ppm[2], rq_wait_pct;
				1347	u64 period_vtime;
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1348	int prev_busy_level, i;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1349
				1350	/* how were the latencies during the period? */
				1351	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
				1352
				1353	/* take care of active iocgs */
				1354	spin_lock_irq(&ioc->lock);
				1355
				1356	ioc_now(ioc, &now);
				1357
				1358	period_vtime = now.vnow - ioc->period_at_vtime;
				1359	if (WARN_ON_ONCE(!period_vtime)) {
				1360	spin_unlock_irq(&ioc->lock);
				1361	return;
				1362	}
				1363
				1364	/*
				1365	* Waiters determine the sleep durations based on the vrate they
				1366	* saw at the time of sleep. If vrate has increased, some waiters
				1367	* could be sleeping for too long. Wake up tardy waiters which
				1368	* should have woken up in the last period and expire idle iocgs.
				1369	*/
				1370	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1371	if (!waitqueue_active(&iocg->waitq) &&
				1372	!atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1373	continue;
				1374
				1375	spin_lock(&iocg->waitq.lock);
				1376
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1377	if (waitqueue_active(&iocg->waitq) \|\|
				1378	atomic64_read(&iocg->abs_vdebt)) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1379	/* might be oversleeping vtime / hweight changes, kick */
				1380	iocg_kick_waitq(iocg, &now);
				1381	iocg_kick_delay(iocg, &now, 0);
				1382	} else if (iocg_is_idle(iocg)) {
				1383	/* no waiter and idle, deactivate */
				1384	iocg->last_inuse = iocg->inuse;
				1385	__propagate_active_weight(iocg, 0, 0);
				1386	list_del_init(&iocg->active_list);
				1387	}
				1388
				1389	spin_unlock(&iocg->waitq.lock);
				1390	}
				1391	commit_active_weights(ioc);
				1392
				1393	/* calc usages and see whether some weights need to be moved around */
				1394	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
				1395	u64 vdone, vtime, vusage, vmargin, vmin;
				1396	u32 hw_active, hw_inuse, usage;
				1397
				1398	/*
				1399	* Collect unused and wind vtime closer to vnow to prevent
				1400	* iocgs from accumulating a large amount of budget.
				1401	*/
				1402	vdone = atomic64_read(&iocg->done_vtime);
				1403	vtime = atomic64_read(&iocg->vtime);
				1404	current_hweight(iocg, &hw_active, &hw_inuse);
				1405
				1406	/*
				1407	* Latency QoS detection doesn't account for IOs which are
				1408	* in-flight for longer than a period. Detect them by
				1409	* comparing vdone against period start. If lagging behind
				1410	* IOs from past periods, don't increase vrate.
				1411	*/
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1412	if ((ppm_rthr != MILLION \|\| ppm_wthr != MILLION) &&
				1413	!atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1414	time_after64(vtime, vdone) &&
				1415	time_after64(vtime, now.vnow -
				1416	MAX_LAGGING_PERIODS * period_vtime) &&
				1417	time_before64(vdone, now.vnow - period_vtime))
				1418	nr_lagging++;
				1419
				1420	if (waitqueue_active(&iocg->waitq))
				1421	vusage = now.vnow - iocg->last_vtime;
				1422	else if (time_before64(iocg->last_vtime, vtime))
				1423	vusage = vtime - iocg->last_vtime;
				1424	else
				1425	vusage = 0;
				1426
				1427	iocg->last_vtime += vusage;
				1428	/*
				1429	* Factor in in-flight vtime into vusage to avoid
				1430	* high-latency completions appearing as idle. This should
				1431	* be done after the above ->last_time adjustment.
				1432	*/
				1433	vusage = max(vusage, vtime - vdone);
				1434
				1435	/* calculate hweight based usage ratio and record */
				1436	if (vusage) {
				1437	usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
				1438	period_vtime);
				1439	iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
				1440	iocg->usages[iocg->usage_idx] = usage;
				1441	} else {
				1442	usage = 0;
				1443	}
				1444
				1445	/* see whether there's surplus vtime */
				1446	vmargin = ioc->margin_us * now.vrate;
				1447	vmin = now.vnow - vmargin;
				1448
				1449	iocg->has_surplus = false;
				1450
				1451	if (!waitqueue_active(&iocg->waitq) &&
				1452	time_before64(vtime, vmin)) {
				1453	u64 delta = vmin - vtime;
				1454
				1455	/* throw away surplus vtime */
				1456	atomic64_add(delta, &iocg->vtime);
				1457	atomic64_add(delta, &iocg->done_vtime);
				1458	iocg->last_vtime += delta;
				1459	/* if usage is sufficiently low, maybe it can donate */
				1460	if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
				1461	iocg->has_surplus = true;
				1462	nr_surpluses++;
				1463	}
				1464	} else if (hw_inuse < hw_active) {
				1465	u32 new_hwi, new_inuse;
				1466
				1467	/* was donating but might need to take back some */
				1468	if (waitqueue_active(&iocg->waitq)) {
				1469	new_hwi = hw_active;
				1470	} else {
				1471	new_hwi = max(hw_inuse,
				1472	usage * SURPLUS_SCALE_PCT / 100 +
				1473	SURPLUS_SCALE_ABS);
				1474	}
				1475
				1476	new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
				1477	hw_inuse);
				1478	new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
				1479
				1480	if (new_inuse > iocg->inuse) {
				1481	TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
				1482	iocg->inuse, new_inuse,
				1483	hw_inuse, new_hwi);
				1484	__propagate_active_weight(iocg, iocg->weight,
				1485	new_inuse);
				1486	}
				1487	} else {
				1488	/* genuninely out of vtime */
				1489	nr_shortages++;
				1490	}
				1491	}
				1492
				1493	if (!nr_shortages \|\| !nr_surpluses)
				1494	goto skip_surplus_transfers;
				1495
				1496	/* there are both shortages and surpluses, transfer surpluses */
				1497	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
				1498	u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
				1499	int nr_valid = 0;
				1500
				1501	if (!iocg->has_surplus)
				1502	continue;
				1503
				1504	/* base the decision on max historical usage */
				1505	for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
				1506	if (iocg->usages[i]) {
				1507	usage = max(usage, iocg->usages[i]);
				1508	nr_valid++;
				1509	}
				1510	}
				1511	if (nr_valid < MIN_VALID_USAGES)
				1512	continue;
				1513
				1514	current_hweight(iocg, &hw_active, &hw_inuse);
				1515	new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
				1516	if (!new_hwi)
				1517	continue;
				1518
				1519	new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
				1520	hw_inuse);
				1521	if (new_inuse < iocg->inuse) {
				1522	TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
				1523	iocg->inuse, new_inuse,
				1524	hw_inuse, new_hwi);
				1525	__propagate_active_weight(iocg, iocg->weight, new_inuse);
				1526	}
				1527	}
				1528	skip_surplus_transfers:
				1529	commit_active_weights(ioc);
				1530
				1531	/*
				1532	* If q is getting clogged or we're missing too much, we're issuing
				1533	* too much IO and should lower vtime rate. If we're not missing
				1534	* and experiencing shortages but not surpluses, we're too stingy
				1535	* and should increase vtime rate.
				1536	*/
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1537	prev_busy_level = ioc->busy_level;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1538	if (rq_wait_pct > RQ_WAIT_BUSY_PCT \|\|
				1539	missed_ppm[READ] > ppm_rthr \|\|
				1540	missed_ppm[WRITE] > ppm_wthr) {
				1541	ioc->busy_level = max(ioc->busy_level, 0);
				1542	ioc->busy_level++;
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1543	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1544	missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
				1545	missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1546	/* take action iff there is contention */
				1547	if (nr_shortages && !nr_lagging) {
				1548	ioc->busy_level = min(ioc->busy_level, 0);
				1549	/* redistribute surpluses first */
				1550	if (!nr_surpluses)
				1551	ioc->busy_level--;
				1552	}
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1553	} else {
				1554	ioc->busy_level = 0;
				1555	}
				1556
				1557	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
				1558
Tejun Heo	7cd806a	2019-09-25 16:03:09 -0700	[diff] [blame]	1559	if (ioc->busy_level > 0 \|\| (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1560	u64 vrate = atomic64_read(&ioc->vtime_rate);
				1561	u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
				1562
				1563	/* rq_wait signal is always reliable, ignore user vrate_min */
				1564	if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
				1565	vrate_min = VRATE_MIN;
				1566
				1567	/*
				1568	* If vrate is out of bounds, apply clamp gradually as the
				1569	* bounds can change abruptly. Otherwise, apply busy_level
				1570	* based adjustment.
				1571	*/
				1572	if (vrate < vrate_min) {
				1573	vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
				1574	100);
				1575	vrate = min(vrate, vrate_min);
				1576	} else if (vrate > vrate_max) {
				1577	vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
				1578	100);
				1579	vrate = max(vrate, vrate_max);
				1580	} else {
				1581	int idx = min_t(int, abs(ioc->busy_level),
				1582	ARRAY_SIZE(vrate_adj_pct) - 1);
				1583	u32 adj_pct = vrate_adj_pct[idx];
				1584
				1585	if (ioc->busy_level > 0)
				1586	adj_pct = 100 - adj_pct;
				1587	else
				1588	adj_pct = 100 + adj_pct;
				1589
				1590	vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
				1591	vrate_min, vrate_max);
				1592	}
				1593
Waiman Long	d6c8e94	2020-04-21 09:07:55 -0400	[diff] [blame]	1594	trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1595	nr_lagging, nr_shortages,
				1596	nr_surpluses);
				1597
				1598	atomic64_set(&ioc->vtime_rate, vrate);
				1599	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
				1600	ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1601	} else if (ioc->busy_level != prev_busy_level \|\| nr_lagging) {
				1602	trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
Waiman Long	d6c8e94	2020-04-21 09:07:55 -0400	[diff] [blame]	1603	missed_ppm, rq_wait_pct, nr_lagging,
Tejun Heo	25d41e4	2019-09-25 16:02:07 -0700	[diff] [blame]	1604	nr_shortages, nr_surpluses);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1605	}
				1606
				1607	ioc_refresh_params(ioc, false);
				1608
				1609	/*
				1610	* This period is done. Move onto the next one. If nothing's
				1611	* going on with the device, stop the timer.
				1612	*/
				1613	atomic64_inc(&ioc->cur_period);
				1614
				1615	if (ioc->running != IOC_STOP) {
				1616	if (!list_empty(&ioc->active_iocgs)) {
				1617	ioc_start_period(ioc, &now);
				1618	} else {
				1619	ioc->busy_level = 0;
				1620	ioc->running = IOC_IDLE;
				1621	}
				1622	}
				1623
				1624	spin_unlock_irq(&ioc->lock);
				1625	}
				1626
				1627	static void calc_vtime_cost_builtin(struct bio bio, struct ioc_gq iocg,
				1628	bool is_merge, u64 *costp)
				1629	{
				1630	struct ioc *ioc = iocg->ioc;
				1631	u64 coef_seqio, coef_randio, coef_page;
				1632	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
				1633	u64 seek_pages = 0;
				1634	u64 cost = 0;
				1635
				1636	switch (bio_op(bio)) {
				1637	case REQ_OP_READ:
				1638	coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
				1639	coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
				1640	coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
				1641	break;
				1642	case REQ_OP_WRITE:
				1643	coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
				1644	coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
				1645	coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
				1646	break;
				1647	default:
				1648	goto out;
				1649	}
				1650
				1651	if (iocg->cursor) {
				1652	seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
				1653	seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
				1654	}
				1655
				1656	if (!is_merge) {
				1657	if (seek_pages > LCOEF_RANDIO_PAGES) {
				1658	cost += coef_randio;
				1659	} else {
				1660	cost += coef_seqio;
				1661	}
				1662	}
				1663	cost += pages * coef_page;
				1664	out:
				1665	*costp = cost;
				1666	}
				1667
				1668	static u64 calc_vtime_cost(struct bio bio, struct ioc_gq iocg, bool is_merge)
				1669	{
				1670	u64 cost;
				1671
				1672	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
				1673	return cost;
				1674	}
				1675
				1676	static void ioc_rqos_throttle(struct rq_qos rqos, struct bio bio)
				1677	{
				1678	struct blkcg_gq *blkg = bio->bi_blkg;
				1679	struct ioc *ioc = rqos_to_ioc(rqos);
				1680	struct ioc_gq *iocg = blkg_to_iocg(blkg);
				1681	struct ioc_now now;
				1682	struct iocg_wait wait;
				1683	u32 hw_active, hw_inuse;
				1684	u64 abs_cost, cost, vtime;
				1685
				1686	/* bypass IOs if disabled or for root cgroup */
				1687	if (!ioc->enabled \|\| !iocg->level)
				1688	return;
				1689
				1690	/* always activate so that even 0 cost IOs get protected to some level */
				1691	if (!iocg_activate(iocg, &now))
				1692	return;
				1693
				1694	/* calculate the absolute vtime cost */
				1695	abs_cost = calc_vtime_cost(bio, iocg, false);
				1696	if (!abs_cost)
				1697	return;
				1698
				1699	iocg->cursor = bio_end_sector(bio);
				1700
				1701	vtime = atomic64_read(&iocg->vtime);
				1702	current_hweight(iocg, &hw_active, &hw_inuse);
				1703
				1704	if (hw_inuse < hw_active &&
				1705	time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
				1706	TRACE_IOCG_PATH(inuse_reset, iocg, &now,
				1707	iocg->inuse, iocg->weight, hw_inuse, hw_active);
				1708	spin_lock_irq(&ioc->lock);
				1709	propagate_active_weight(iocg, iocg->weight, iocg->weight);
				1710	spin_unlock_irq(&ioc->lock);
				1711	current_hweight(iocg, &hw_active, &hw_inuse);
				1712	}
				1713
				1714	cost = abs_cost_to_cost(abs_cost, hw_inuse);
				1715
				1716	/*
				1717	* If no one's waiting and within budget, issue right away. The
				1718	* tests are racy but the races aren't systemic - we only miss once
				1719	* in a while which is fine.
				1720	*/
				1721	if (!waitqueue_active(&iocg->waitq) &&
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1722	!atomic64_read(&iocg->abs_vdebt) &&
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1723	time_before_eq64(vtime + cost, now.vnow)) {
				1724	iocg_commit_bio(iocg, bio, cost);
				1725	return;
				1726	}
				1727
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1728	/*
				1729	* We're over budget. If @bio has to be issued regardless,
				1730	* remember the abs_cost instead of advancing vtime.
				1731	* iocg_kick_waitq() will pay off the debt before waking more IOs.
				1732	* This way, the debt is continuously paid off each period with the
				1733	* actual budget available to the cgroup. If we just wound vtime,
				1734	* we would incorrectly use the current hw_inuse for the entire
				1735	* amount which, for example, can lead to the cgroup staying
				1736	* blocked for a long time even with substantially raised hw_inuse.
				1737	*/
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1738	if (bio_issue_as_root_blkg(bio) \|\| fatal_signal_pending(current)) {
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	1739	atomic64_add(abs_cost, &iocg->abs_vdebt);
Tejun Heo	d7bd15a	2019-12-16 13:34:00 -0800	[diff] [blame]	1740	if (iocg_kick_delay(iocg, &now, cost))
				1741	blkcg_schedule_throttle(rqos->q,
				1742	(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1743	return;
				1744	}
				1745
				1746	/*
				1747	* Append self to the waitq and schedule the wakeup timer if we're
				1748	* the first waiter. The timer duration is calculated based on the
				1749	* current vrate. vtime and hweight changes can make it too short
				1750	* or too long. Each wait entry records the absolute cost it's
				1751	* waiting for to allow re-evaluation using a custom wait entry.
				1752	*
				1753	* If too short, the timer simply reschedules itself. If too long,
				1754	* the period timer will notice and trigger wakeups.
				1755	*
				1756	* All waiters are on iocg->waitq and the wait states are
				1757	* synchronized using waitq.lock.
				1758	*/
				1759	spin_lock_irq(&iocg->waitq.lock);
				1760
				1761	/*
				1762	* We activated above but w/o any synchronization. Deactivation is
				1763	* synchronized with waitq.lock and we won't get deactivated as
				1764	* long as we're waiting, so we're good if we're activated here.
				1765	* In the unlikely case that we are deactivated, just issue the IO.
				1766	*/
				1767	if (unlikely(list_empty(&iocg->active_list))) {
				1768	spin_unlock_irq(&iocg->waitq.lock);
				1769	iocg_commit_bio(iocg, bio, cost);
				1770	return;
				1771	}
				1772
				1773	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
				1774	wait.wait.private = current;
				1775	wait.bio = bio;
				1776	wait.abs_cost = abs_cost;
				1777	wait.committed = false; /* will be set true by waker */
				1778
				1779	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
				1780	iocg_kick_waitq(iocg, &now);
				1781
				1782	spin_unlock_irq(&iocg->waitq.lock);
				1783
				1784	while (true) {
				1785	set_current_state(TASK_UNINTERRUPTIBLE);
				1786	if (wait.committed)
				1787	break;
				1788	io_schedule();
				1789	}
				1790
				1791	/* waker already committed us, proceed */
				1792	finish_wait(&iocg->waitq, &wait.wait);
				1793	}
				1794
				1795	static void ioc_rqos_merge(struct rq_qos rqos, struct request rq,
				1796	struct bio *bio)
				1797	{
				1798	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1799	struct ioc *ioc = iocg->ioc;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1800	sector_t bio_end = bio_end_sector(bio);
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1801	struct ioc_now now;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1802	u32 hw_inuse;
				1803	u64 abs_cost, cost;
				1804
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1805	/* bypass if disabled or for root cgroup */
				1806	if (!ioc->enabled \|\| !iocg->level)
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1807	return;
				1808
				1809	abs_cost = calc_vtime_cost(bio, iocg, true);
				1810	if (!abs_cost)
				1811	return;
				1812
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1813	ioc_now(ioc, &now);
				1814	current_hweight(iocg, NULL, &hw_inuse);
				1815	cost = abs_cost_to_cost(abs_cost, hw_inuse);
				1816
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1817	/* update cursor if backmerging into the request at the cursor */
				1818	if (blk_rq_pos(rq) < bio_end &&
				1819	blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
				1820	iocg->cursor = bio_end;
				1821
Tejun Heo	e1518f6	2019-09-04 12:45:53 -0700	[diff] [blame]	1822	/*
				1823	* Charge if there's enough vtime budget and the existing request
				1824	* has cost assigned. Otherwise, account it as debt. See debt
				1825	* handling in ioc_rqos_throttle() for details.
				1826	*/
				1827	if (rq->bio && rq->bio->bi_iocost_cost &&
				1828	time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
				1829	iocg_commit_bio(iocg, bio, cost);
				1830	else
				1831	atomic64_add(abs_cost, &iocg->abs_vdebt);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1832	}
				1833
				1834	static void ioc_rqos_done_bio(struct rq_qos rqos, struct bio bio)
				1835	{
				1836	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
				1837
				1838	if (iocg && bio->bi_iocost_cost)
				1839	atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
				1840	}
				1841
				1842	static void ioc_rqos_done(struct rq_qos rqos, struct request rq)
				1843	{
				1844	struct ioc *ioc = rqos_to_ioc(rqos);
				1845	u64 on_q_ns, rq_wait_ns;
				1846	int pidx, rw;
				1847
				1848	if (!ioc->enabled \|\| !rq->alloc_time_ns \|\| !rq->start_time_ns)
				1849	return;
				1850
				1851	switch (req_op(rq) & REQ_OP_MASK) {
				1852	case REQ_OP_READ:
				1853	pidx = QOS_RLAT;
				1854	rw = READ;
				1855	break;
				1856	case REQ_OP_WRITE:
				1857	pidx = QOS_WLAT;
				1858	rw = WRITE;
				1859	break;
				1860	default:
				1861	return;
				1862	}
				1863
				1864	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
				1865	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
				1866
				1867	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
				1868	this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
				1869	else
				1870	this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
				1871
				1872	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
				1873	}
				1874
				1875	static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
				1876	{
				1877	struct ioc *ioc = rqos_to_ioc(rqos);
				1878
				1879	spin_lock_irq(&ioc->lock);
				1880	ioc_refresh_params(ioc, false);
				1881	spin_unlock_irq(&ioc->lock);
				1882	}
				1883
				1884	static void ioc_rqos_exit(struct rq_qos *rqos)
				1885	{
				1886	struct ioc *ioc = rqos_to_ioc(rqos);
				1887
				1888	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
				1889
				1890	spin_lock_irq(&ioc->lock);
				1891	ioc->running = IOC_STOP;
				1892	spin_unlock_irq(&ioc->lock);
				1893
				1894	del_timer_sync(&ioc->timer);
				1895	free_percpu(ioc->pcpu_stat);
				1896	kfree(ioc);
				1897	}
				1898
				1899	static struct rq_qos_ops ioc_rqos_ops = {
				1900	.throttle = ioc_rqos_throttle,
				1901	.merge = ioc_rqos_merge,
				1902	.done_bio = ioc_rqos_done_bio,
				1903	.done = ioc_rqos_done,
				1904	.queue_depth_changed = ioc_rqos_queue_depth_changed,
				1905	.exit = ioc_rqos_exit,
				1906	};
				1907
				1908	static int blk_iocost_init(struct request_queue *q)
				1909	{
				1910	struct ioc *ioc;
				1911	struct rq_qos *rqos;
				1912	int ret;
				1913
				1914	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
				1915	if (!ioc)
				1916	return -ENOMEM;
				1917
				1918	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
				1919	if (!ioc->pcpu_stat) {
				1920	kfree(ioc);
				1921	return -ENOMEM;
				1922	}
				1923
				1924	rqos = &ioc->rqos;
				1925	rqos->id = RQ_QOS_COST;
				1926	rqos->ops = &ioc_rqos_ops;
				1927	rqos->q = q;
				1928
				1929	spin_lock_init(&ioc->lock);
				1930	timer_setup(&ioc->timer, ioc_timer_fn, 0);
				1931	INIT_LIST_HEAD(&ioc->active_iocgs);
				1932
				1933	ioc->running = IOC_IDLE;
				1934	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
				1935	seqcount_init(&ioc->period_seqcount);
				1936	ioc->period_at = ktime_to_us(ktime_get());
				1937	atomic64_set(&ioc->cur_period, 0);
				1938	atomic_set(&ioc->hweight_gen, 0);
				1939
				1940	spin_lock_irq(&ioc->lock);
				1941	ioc->autop_idx = AUTOP_INVALID;
				1942	ioc_refresh_params(ioc, true);
				1943	spin_unlock_irq(&ioc->lock);
				1944
				1945	rq_qos_add(q, rqos);
				1946	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
				1947	if (ret) {
				1948	rq_qos_del(q, rqos);
Tejun Heo	3532e72	2019-08-29 08:53:06 -0700	[diff] [blame]	1949	free_percpu(ioc->pcpu_stat);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1950	kfree(ioc);
				1951	return ret;
				1952	}
				1953	return 0;
				1954	}
				1955
				1956	static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
				1957	{
				1958	struct ioc_cgrp *iocc;
				1959
				1960	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heo	e916ad2	2019-08-30 06:10:58 -0700	[diff] [blame]	1961	if (!iocc)
				1962	return NULL;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1963
Tejun Heo	e916ad2	2019-08-30 06:10:58 -0700	[diff] [blame]	1964	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	1965	return &iocc->cpd;
				1966	}
				1967
				1968	static void ioc_cpd_free(struct blkcg_policy_data *cpd)
				1969	{
				1970	kfree(container_of(cpd, struct ioc_cgrp, cpd));
				1971	}
				1972
				1973	static struct blkg_policy_data ioc_pd_alloc(gfp_t gfp, struct request_queue q,
				1974	struct blkcg *blkcg)
				1975	{
				1976	int levels = blkcg->css.cgroup->level + 1;
				1977	struct ioc_gq *iocg;
				1978
				1979	iocg = kzalloc_node(sizeof(iocg) + levels sizeof(iocg->ancestors[0]),
				1980	gfp, q->node);
				1981	if (!iocg)
				1982	return NULL;
				1983
				1984	return &iocg->pd;
				1985	}
				1986
				1987	static void ioc_pd_init(struct blkg_policy_data *pd)
				1988	{
				1989	struct ioc_gq *iocg = pd_to_iocg(pd);
				1990	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
				1991	struct ioc *ioc = q_to_ioc(blkg->q);
				1992	struct ioc_now now;
				1993	struct blkcg_gq *tblkg;
				1994	unsigned long flags;
				1995
				1996	ioc_now(ioc, &now);
				1997
				1998	iocg->ioc = ioc;
				1999	atomic64_set(&iocg->vtime, now.vnow);
				2000	atomic64_set(&iocg->done_vtime, now.vnow);
Tejun Heo	36a5248	2019-09-04 12:45:52 -0700	[diff] [blame]	2001	atomic64_set(&iocg->abs_vdebt, 0);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2002	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
				2003	INIT_LIST_HEAD(&iocg->active_list);
				2004	iocg->hweight_active = HWEIGHT_WHOLE;
				2005	iocg->hweight_inuse = HWEIGHT_WHOLE;
				2006
				2007	init_waitqueue_head(&iocg->waitq);
				2008	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
				2009	iocg->waitq_timer.function = iocg_waitq_timer_fn;
				2010	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
				2011	iocg->delay_timer.function = iocg_delay_timer_fn;
				2012
				2013	iocg->level = blkg->blkcg->css.cgroup->level;
				2014
				2015	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
				2016	struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
				2017	iocg->ancestors[tiocg->level] = tiocg;
				2018	}
				2019
				2020	spin_lock_irqsave(&ioc->lock, flags);
				2021	weight_updated(iocg);
				2022	spin_unlock_irqrestore(&ioc->lock, flags);
				2023	}
				2024
				2025	static void ioc_pd_free(struct blkg_policy_data *pd)
				2026	{
				2027	struct ioc_gq *iocg = pd_to_iocg(pd);
				2028	struct ioc *ioc = iocg->ioc;
				2029
				2030	if (ioc) {
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2031	spin_lock(&ioc->lock);
				2032	if (!list_empty(&iocg->active_list)) {
				2033	propagate_active_weight(iocg, 0, 0);
				2034	list_del_init(&iocg->active_list);
				2035	}
				2036	spin_unlock(&ioc->lock);
Tejun Heo	e036c4c	2019-09-10 09:15:25 -0700	[diff] [blame]	2037
				2038	hrtimer_cancel(&iocg->waitq_timer);
				2039	hrtimer_cancel(&iocg->delay_timer);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2040	}
				2041	kfree(iocg);
				2042	}
				2043
				2044	static u64 ioc_weight_prfill(struct seq_file sf, struct blkg_policy_data pd,
				2045	int off)
				2046	{
				2047	const char *dname = blkg_dev_name(pd->blkg);
				2048	struct ioc_gq *iocg = pd_to_iocg(pd);
				2049
				2050	if (dname && iocg->cfg_weight)
				2051	seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
				2052	return 0;
				2053	}
				2054
				2055
				2056	static int ioc_weight_show(struct seq_file sf, void v)
				2057	{
				2058	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2059	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
				2060
				2061	seq_printf(sf, "default %u\n", iocc->dfl_weight);
				2062	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
				2063	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2064	return 0;
				2065	}
				2066
				2067	static ssize_t ioc_weight_write(struct kernfs_open_file of, char buf,
				2068	size_t nbytes, loff_t off)
				2069	{
				2070	struct blkcg *blkcg = css_to_blkcg(of_css(of));
				2071	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
				2072	struct blkg_conf_ctx ctx;
				2073	struct ioc_gq *iocg;
				2074	u32 v;
				2075	int ret;
				2076
				2077	if (!strchr(buf, ':')) {
				2078	struct blkcg_gq *blkg;
				2079
				2080	if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
				2081	return -EINVAL;
				2082
				2083	if (v < CGROUP_WEIGHT_MIN \|\| v > CGROUP_WEIGHT_MAX)
				2084	return -EINVAL;
				2085
				2086	spin_lock(&blkcg->lock);
				2087	iocc->dfl_weight = v;
				2088	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				2089	struct ioc_gq *iocg = blkg_to_iocg(blkg);
				2090
				2091	if (iocg) {
				2092	spin_lock_irq(&iocg->ioc->lock);
				2093	weight_updated(iocg);
				2094	spin_unlock_irq(&iocg->ioc->lock);
				2095	}
				2096	}
				2097	spin_unlock(&blkcg->lock);
				2098
				2099	return nbytes;
				2100	}
				2101
				2102	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
				2103	if (ret)
				2104	return ret;
				2105
				2106	iocg = blkg_to_iocg(ctx.blkg);
				2107
				2108	if (!strncmp(ctx.body, "default", 7)) {
				2109	v = 0;
				2110	} else {
				2111	if (!sscanf(ctx.body, "%u", &v))
				2112	goto einval;
				2113	if (v < CGROUP_WEIGHT_MIN \|\| v > CGROUP_WEIGHT_MAX)
				2114	goto einval;
				2115	}
				2116
Dan Carpenter	41591a5	2019-10-31 13:53:41 +0300	[diff] [blame]	2117	spin_lock(&iocg->ioc->lock);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2118	iocg->cfg_weight = v;
				2119	weight_updated(iocg);
Dan Carpenter	41591a5	2019-10-31 13:53:41 +0300	[diff] [blame]	2120	spin_unlock(&iocg->ioc->lock);
Tejun Heo	7caa471	2019-08-28 15:05:58 -0700	[diff] [blame]	2121
				2122	blkg_conf_finish(&ctx);
				2123	return nbytes;
				2124
				2125	einval:
				2126	blkg_conf_finish(&ctx);
				2127	return -EINVAL;
				2128	}
				2129
				2130	static u64 ioc_qos_prfill(struct seq_file sf, struct blkg_policy_data pd,
				2131	int off)
				2132	{
				2133	const char *dname = blkg_dev_name(pd->blkg);
				2134	struct ioc *ioc = pd_to_iocg(pd)->ioc;
				2135
				2136	if (!dname)
				2137	return 0;
				2138
				2139	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
				2140	dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
				2141	ioc->params.qos[QOS_RPPM] / 10000,
				2142	ioc->params.qos[QOS_RPPM] % 10000 / 100,
				2143	ioc->params.qos[QOS_RLAT],
				2144	ioc->params.qos[QOS_WPPM] / 10000,
				2145	ioc->params.qos[QOS_WPPM] % 10000 / 100,
				2146	ioc->params.qos[QOS_WLAT],
				2147	ioc->params.qos[QOS_MIN] / 10000,
				2148	ioc->params.qos[QOS_MIN] % 10000 / 100,
				2149	ioc->params.qos[QOS_MAX] / 10000,
				2150	ioc->params.qos[QOS_MAX] % 10000 / 100);
				2151	return 0;
				2152	}
				2153
				2154	static int ioc_qos_show(struct seq_file sf, void v)
				2155	{
				2156	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2157
				2158	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
				2159	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2160	return 0;
				2161	}
				2162
				2163	static const match_table_t qos_ctrl_tokens = {
				2164	{ QOS_ENABLE, "enable=%u" },
				2165	{ QOS_CTRL, "ctrl=%s" },
				2166	{ NR_QOS_CTRL_PARAMS, NULL },
				2167	};
				2168
				2169	static const match_table_t qos_tokens = {
				2170	{ QOS_RPPM, "rpct=%s" },
				2171	{ QOS_RLAT, "rlat=%u" },
				2172	{ QOS_WPPM, "wpct=%s" },
				2173	{ QOS_WLAT, "wlat=%u" },
				2174	{ QOS_MIN, "min=%s" },
				2175	{ QOS_MAX, "max=%s" },
				2176	{ NR_QOS_PARAMS, NULL },
				2177	};
				2178
				2179	static ssize_t ioc_qos_write(struct kernfs_open_file of, char input,
				2180	size_t nbytes, loff_t off)
				2181	{
				2182	struct gendisk *disk;
				2183	struct ioc *ioc;
				2184	u32 qos[NR_QOS_PARAMS];
				2185	bool enable, user;
				2186	char *p;
				2187	int ret;
				2188
				2189	disk = blkcg_conf_get_disk(&input);
				2190	if (IS_ERR(disk))
				2191	return PTR_ERR(disk);
				2192
				2193	ioc = q_to_ioc(disk->queue);
				2194	if (!ioc) {
				2195	ret = blk_iocost_init(disk->queue);
				2196	if (ret)
				2197	goto err;
				2198	ioc = q_to_ioc(disk->queue);
				2199	}
				2200
				2201	spin_lock_irq(&ioc->lock);
				2202	memcpy(qos, ioc->params.qos, sizeof(qos));
				2203	enable = ioc->enabled;
				2204	user = ioc->user_qos_params;
				2205	spin_unlock_irq(&ioc->lock);
				2206
				2207	while ((p = strsep(&input, " \t\n"))) {
				2208	substring_t args[MAX_OPT_ARGS];
				2209	char buf[32];
				2210	int tok;
				2211	s64 v;
				2212
				2213	if (!*p)
				2214	continue;
				2215
				2216	switch (match_token(p, qos_ctrl_tokens, args)) {
				2217	case QOS_ENABLE:
				2218	match_u64(&args[0], &v);
				2219	enable = v;
				2220	continue;
				2221	case QOS_CTRL:
				2222	match_strlcpy(buf, &args[0], sizeof(buf));
				2223	if (!strcmp(buf, "auto"))
				2224	user = false;
				2225	else if (!strcmp(buf, "user"))
				2226	user = true;
				2227	else
				2228	goto einval;
				2229	continue;
				2230	}
				2231
				2232	tok = match_token(p, qos_tokens, args);
				2233	switch (tok) {
				2234	case QOS_RPPM:
				2235	case QOS_WPPM:
				2236	if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
				2237	sizeof(buf))
				2238	goto einval;
				2239	if (cgroup_parse_float(buf, 2, &v))
				2240	goto einval;
				2241	if (v < 0 \|\| v > 10000)
				2242	goto einval;
				2243	qos[tok] = v * 100;
				2244	break;
				2245	case QOS_RLAT:
				2246	case QOS_WLAT:
				2247	if (match_u64(&args[0], &v))
				2248	goto einval;
				2249	qos[tok] = v;
				2250	break;
				2251	case QOS_MIN:
				2252	case QOS_MAX:
				2253	if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
				2254	sizeof(buf))
				2255	goto einval;
				2256	if (cgroup_parse_float(buf, 2, &v))
				2257	goto einval;
				2258	if (v < 0)
				2259	goto einval;
				2260	qos[tok] = clamp_t(s64, v * 100,
				2261	VRATE_MIN_PPM, VRATE_MAX_PPM);
				2262	break;
				2263	default:
				2264	goto einval;
				2265	}
				2266	user = true;
				2267	}
				2268
				2269	if (qos[QOS_MIN] > qos[QOS_MAX])
				2270	goto einval;
				2271
				2272	spin_lock_irq(&ioc->lock);
				2273
				2274	if (enable) {
				2275	blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
				2276	ioc->enabled = true;
				2277	} else {
				2278	blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
				2279	ioc->enabled = false;
				2280	}
				2281
				2282	if (user) {
				2283	memcpy(ioc->params.qos, qos, sizeof(qos));
				2284	ioc->user_qos_params = true;
				2285	} else {
				2286	ioc->user_qos_params = false;
				2287	}
				2288
				2289	ioc_refresh_params(ioc, true);
				2290	spin_unlock_irq(&ioc->lock);
				2291
				2292	put_disk_and_module(disk);
				2293	return nbytes;
				2294	einval:
				2295	ret = -EINVAL;
				2296	err:
				2297	put_disk_and_module(disk);
				2298	return ret;
				2299	}
				2300
				2301	static u64 ioc_cost_model_prfill(struct seq_file *sf,
				2302	struct blkg_policy_data *pd, int off)
				2303	{
				2304	const char *dname = blkg_dev_name(pd->blkg);
				2305	struct ioc *ioc = pd_to_iocg(pd)->ioc;
				2306	u64 *u = ioc->params.i_lcoefs;
				2307
				2308	if (!dname)
				2309	return 0;
				2310
				2311	seq_printf(sf, "%s ctrl=%s model=linear "
				2312	"rbps=%llu rseqiops=%llu rrandiops=%llu "
				2313	"wbps=%llu wseqiops=%llu wrandiops=%llu\n",
				2314	dname, ioc->user_cost_model ? "user" : "auto",
				2315	u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
				2316	u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
				2317	return 0;
				2318	}
				2319
				2320	static int ioc_cost_model_show(struct seq_file sf, void v)
				2321	{
				2322	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				2323
				2324	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
				2325	&blkcg_policy_iocost, seq_cft(sf)->private, false);
				2326	return 0;
				2327	}
				2328
				2329	static const match_table_t cost_ctrl_tokens = {
				2330	{ COST_CTRL, "ctrl=%s" },
				2331	{ COST_MODEL, "model=%s" },
				2332	{ NR_COST_CTRL_PARAMS, NULL },
				2333	};
				2334
				2335	static const match_table_t i_lcoef_tokens = {
				2336	{ I_LCOEF_RBPS, "rbps=%u" },
				2337	{ I_LCOEF_RSEQIOPS, "rseqiops=%u" },
				2338	{ I_LCOEF_RRANDIOPS, "rrandiops=%u" },
				2339	{ I_LCOEF_WBPS, "wbps=%u" },
				2340	{ I_LCOEF_WSEQIOPS, "wseqiops=%u" },
				2341	{ I_LCOEF_WRANDIOPS, "wrandiops=%u" },
				2342	{ NR_I_LCOEFS, NULL },
				2343	};
				2344
				2345	static ssize_t ioc_cost_model_write(struct kernfs_open_file of, char input,
				2346	size_t nbytes, loff_t off)
				2347	{
				2348	struct gendisk *disk;
				2349	struct ioc *ioc;
				2350	u64 u[NR_I_LCOEFS];
				2351	bool user;
				2352	char *p;
				2353	int ret;
				2354
				2355	disk = blkcg_conf_get_disk(&input);
				2356	if (IS_ERR(disk))
				2357	return PTR_ERR(disk);
				2358
				2359	ioc = q_to_ioc(disk->queue);
				2360	if (!ioc) {
				2361	ret = blk_iocost_init(disk->queue);
				2362	if (ret)
				2363	goto err;
				2364	ioc = q_to_ioc(disk->queue);
				2365	}
				2366
				2367	spin_lock_irq(&ioc->lock);
				2368	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
				2369	user = ioc->user_cost_model;
				2370	spin_unlock_irq(&ioc->lock);
				2371
				2372	while ((p = strsep(&input, " \t\n"))) {
				2373	substring_t args[MAX_OPT_ARGS];
				2374	char buf[32];
				2375	int tok;
				2376	u64 v;
				2377
				2378	if (!*p)
				2379	continue;
				2380
				2381	switch (match_token(p, cost_ctrl_tokens, args)) {
				2382	case COST_CTRL:
				2383	match_strlcpy(buf, &args[0], sizeof(buf));
				2384	if (!strcmp(buf, "auto"))
				2385	user = false;
				2386	else if (!strcmp(buf, "user"))
				2387	user = true;
				2388	else
				2389	goto einval;
				2390	continue;
				2391	case COST_MODEL:
				2392	match_strlcpy(buf, &args[0], sizeof(buf));
				2393	if (strcmp(buf, "linear"))
				2394	goto einval;
				2395	continue;
				2396	}
				2397
				2398	tok = match_token(p, i_lcoef_tokens, args);
				2399	if (tok == NR_I_LCOEFS)
				2400	goto einval;
				2401	if (match_u64(&args[0], &v))
				2402	goto einval;
				2403	u[tok] = v;
				2404	user = true;
				2405	}
				2406
				2407	spin_lock_irq(&ioc->lock);
				2408	if (user) {
				2409	memcpy(ioc->params.i_lcoefs, u, sizeof(u));
				2410	ioc->user_cost_model = true;
				2411	} else {
				2412	ioc->user_cost_model = false;
				2413	}
				2414	ioc_refresh_params(ioc, true);
				2415	spin_unlock_irq(&ioc->lock);
				2416
				2417	put_disk_and_module(disk);
				2418	return nbytes;
				2419
				2420	einval:
				2421	ret = -EINVAL;
				2422	err:
				2423	put_disk_and_module(disk);
				2424	return ret;
				2425	}
				2426
				2427	static struct cftype ioc_files[] = {
				2428	{
				2429	.name = "weight",
				2430	.flags = CFTYPE_NOT_ON_ROOT,
				2431	.seq_show = ioc_weight_show,
				2432	.write = ioc_weight_write,
				2433	},
				2434	{
				2435	.name = "cost.qos",
				2436	.flags = CFTYPE_ONLY_ON_ROOT,
				2437	.seq_show = ioc_qos_show,
				2438	.write = ioc_qos_write,
				2439	},
				2440	{
				2441	.name = "cost.model",
				2442	.flags = CFTYPE_ONLY_ON_ROOT,
				2443	.seq_show = ioc_cost_model_show,
				2444	.write = ioc_cost_model_write,
				2445	},
				2446	{}
				2447	};
				2448
				2449	static struct blkcg_policy blkcg_policy_iocost = {
				2450	.dfl_cftypes = ioc_files,
				2451	.cpd_alloc_fn = ioc_cpd_alloc,
				2452	.cpd_free_fn = ioc_cpd_free,
				2453	.pd_alloc_fn = ioc_pd_alloc,
				2454	.pd_init_fn = ioc_pd_init,
				2455	.pd_free_fn = ioc_pd_free,
				2456	};
				2457
				2458	static int __init ioc_init(void)
				2459	{
				2460	return blkcg_policy_register(&blkcg_policy_iocost);
				2461	}
				2462
				2463	static void __exit ioc_exit(void)
				2464	{
				2465	return blkcg_policy_unregister(&blkcg_policy_iocost);
				2466	}
				2467
				2468	module_init(ioc_init);
				2469	module_exit(ioc_exit);