blob: ecc23b827e5d2f35d2b1fdec056147951e579015 [file] [log] [blame]
Tejun Heo7caa4712019-08-28 15:05:58 -07001/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of clutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
42 * paramters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
Tejun Heofe20cdb52020-09-01 14:52:38 -040071 * upto 1 (WEIGHT_ONE).
Tejun Heo7caa4712019-08-28 15:05:58 -070072 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
102 * Device business is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
114 * are executed, soley depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
122 * the higher the bandwidth lossage. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
148 * implemented by adjusting vrate dynamically. However, squaring who can
149 * donate and should take back how much requires hweight propagations
150 * anyway making it easier to implement and understand as a separate
151 * mechanism.
Tejun Heo6954ff12019-08-28 15:05:59 -0700152 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
156 * controller uses a drgn based monitoring script -
157 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
158 * https://github.com/osandov/drgn. The ouput looks like the following.
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
Tejun Heo7c1ee702019-09-04 12:45:56 -0700161 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
Tejun Heo6954ff12019-08-28 15:05:59 -0700164 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
Tejun Heo7caa4712019-08-28 15:05:58 -0700173 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
Tejun Heo5e124f72020-09-01 14:52:33 -0400182#include <asm/local.h>
183#include <asm/local64.h>
Tejun Heo7caa4712019-08-28 15:05:58 -0700184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
208#else /* CONFIG_TRACE_POINTS */
209#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
210#endif /* CONFIG_TRACE_POINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
220 * A cgroup's vtime can run 50% behind the device vtime, which
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400224 MARGIN_MIN_PCT = 10,
225 MARGIN_MAX_PCT = 50,
Tejun Heo7caa4712019-08-28 15:05:58 -0700226
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400227 /* Have some play in timer operations */
228 TIMER_SLACK_PCT = 1,
Tejun Heo7caa4712019-08-28 15:05:58 -0700229
230 /*
231 * vtime can wrap well within a reasonable uptime when vrate is
232 * consistently raised. Don't trust recorded cgroup vtime if the
233 * period counter indicates that it's older than 5mins.
234 */
235 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
236
237 /*
238 * Remember the past three non-zero usages and use the max for
239 * surplus calculation. Three slots guarantee that we remember one
240 * full period usage from the last active stretch even after
241 * partial deactivation and re-activation periods. Don't start
242 * giving away weight before collecting two data points to prevent
243 * hweight adjustments based on one partial activation period.
244 */
245 NR_USAGE_SLOTS = 3,
246 MIN_VALID_USAGES = 2,
247
248 /* 1/64k is granular enough and can easily be handled w/ u32 */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400249 WEIGHT_ONE = 1 << 16,
Tejun Heo7caa4712019-08-28 15:05:58 -0700250
251 /*
252 * As vtime is used to calculate the cost of each IO, it needs to
253 * be fairly high precision. For example, it should be able to
254 * represent the cost of a single page worth of discard with
255 * suffificient accuracy. At the same time, it should be able to
256 * represent reasonably long enough durations to be useful and
257 * convenient during operation.
258 *
259 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
260 * granularity and days of wrap-around time even at extreme vrates.
261 */
262 VTIME_PER_SEC_SHIFT = 37,
263 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
264 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
Tejun Heocd006502020-04-13 12:27:56 -0400265 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
Tejun Heo7caa4712019-08-28 15:05:58 -0700266
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
276
277 /* unbusy hysterisis */
278 UNBUSY_THR_PCT = 75,
279
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
282
283 /*
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
286 */
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400288 SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */
Tejun Heo7caa4712019-08-28 15:05:58 -0700290
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294 /*
295 * Count IO size in 4k pages. The 12bit shift helps keeping
296 * size-proportional components of cost calculation in closer
297 * numbers of digits to per-IO cost components.
298 */
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303 /* if apart further than 16M, consider randio for linear model */
304 LCOEF_RANDIO_PAGES = 4096,
305};
306
307enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311};
312
313/* io.cost.qos controls including per-dev enable of the whole controller */
314enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318};
319
320/* io.cost.qos params */
321enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329};
330
331/* io.cost.model controls */
332enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336};
337
338/* builtin linear cost model coefficients */
339enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347};
348
349enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357};
358
359enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365};
366
367struct ioc_gq;
368
369struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375};
376
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400377struct ioc_margins {
378 s64 min;
379 s64 max;
380};
381
Tejun Heo7caa4712019-08-28 15:05:58 -0700382struct ioc_missed {
Tejun Heo5e124f72020-09-01 14:52:33 -0400383 local_t nr_met;
384 local_t nr_missed;
Tejun Heo7caa4712019-08-28 15:05:58 -0700385 u32 last_met;
386 u32 last_missed;
387};
388
389struct ioc_pcpu_stat {
390 struct ioc_missed missed[2];
391
Tejun Heo5e124f72020-09-01 14:52:33 -0400392 local64_t rq_wait_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700393 u64 last_rq_wait_ns;
394};
395
396/* per device */
397struct ioc {
398 struct rq_qos rqos;
399
400 bool enabled;
401
402 struct ioc_params params;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400403 struct ioc_margins margins;
Tejun Heo7caa4712019-08-28 15:05:58 -0700404 u32 period_us;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400405 u32 timer_slack_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700406 u64 vrate_min;
407 u64 vrate_max;
408
409 spinlock_t lock;
410 struct timer_list timer;
411 struct list_head active_iocgs; /* active cgroups */
412 struct ioc_pcpu_stat __percpu *pcpu_stat;
413
414 enum ioc_running running;
415 atomic64_t vtime_rate;
416
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +0200417 seqcount_spinlock_t period_seqcount;
Tejun Heoce955702020-09-01 14:52:40 -0400418 u64 period_at; /* wallclock starttime */
Tejun Heo7caa4712019-08-28 15:05:58 -0700419 u64 period_at_vtime; /* vtime starttime */
420
421 atomic64_t cur_period; /* inc'd each period */
422 int busy_level; /* saturation history */
423
Tejun Heo7caa4712019-08-28 15:05:58 -0700424 bool weights_updated;
425 atomic_t hweight_gen; /* for lazy hweights */
426
427 u64 autop_too_fast_at;
428 u64 autop_too_slow_at;
429 int autop_idx;
430 bool user_qos_params:1;
431 bool user_cost_model:1;
432};
433
Tejun Heo97eb1972020-09-01 14:52:43 -0400434struct iocg_pcpu_stat {
435 local64_t abs_vusage;
436};
437
438struct iocg_stat {
439 u64 usage_us;
440};
441
Tejun Heo7caa4712019-08-28 15:05:58 -0700442/* per device-cgroup pair */
443struct ioc_gq {
444 struct blkg_policy_data pd;
445 struct ioc *ioc;
446
447 /*
448 * A iocg can get its weight from two sources - an explicit
449 * per-device-cgroup configuration or the default weight of the
450 * cgroup. `cfg_weight` is the explicit per-device-cgroup
451 * configuration. `weight` is the effective considering both
452 * sources.
453 *
454 * When an idle cgroup becomes active its `active` goes from 0 to
455 * `weight`. `inuse` is the surplus adjusted active weight.
456 * `active` and `inuse` are used to calculate `hweight_active` and
457 * `hweight_inuse`.
458 *
459 * `last_inuse` remembers `inuse` while an iocg is idle to persist
460 * surplus adjustments.
461 */
462 u32 cfg_weight;
463 u32 weight;
464 u32 active;
465 u32 inuse;
466 u32 last_inuse;
467
468 sector_t cursor; /* to detect randio */
469
470 /*
471 * `vtime` is this iocg's vtime cursor which progresses as IOs are
472 * issued. If lagging behind device vtime, the delta represents
473 * the currently available IO budget. If runnning ahead, the
474 * overage.
475 *
476 * `vtime_done` is the same but progressed on completion rather
477 * than issue. The delta behind `vtime` represents the cost of
478 * currently in-flight IOs.
Tejun Heo7caa4712019-08-28 15:05:58 -0700479 */
480 atomic64_t vtime;
481 atomic64_t done_vtime;
Tejun Heo0b80f982020-05-04 19:27:54 -0400482 u64 abs_vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -0700483
484 /*
485 * The period this iocg was last active in. Used for deactivation
486 * and invalidating `vtime`.
487 */
488 atomic64_t active_period;
489 struct list_head active_list;
490
Tejun Heo00410f12020-09-01 14:52:34 -0400491 /* see __propagate_weights() and current_hweight() for details */
Tejun Heo7caa4712019-08-28 15:05:58 -0700492 u64 child_active_sum;
493 u64 child_inuse_sum;
Tejun Heoe08d02a2020-09-01 14:52:48 -0400494 u64 child_adjusted_sum;
Tejun Heo7caa4712019-08-28 15:05:58 -0700495 int hweight_gen;
496 u32 hweight_active;
497 u32 hweight_inuse;
Tejun Heoe08d02a2020-09-01 14:52:48 -0400498 u32 hweight_donating;
Tejun Heo93f7d2d2020-09-01 14:52:47 -0400499 u32 hweight_after_donation;
Tejun Heo7caa4712019-08-28 15:05:58 -0700500
Tejun Heo97eb1972020-09-01 14:52:43 -0400501 struct list_head walk_list;
Tejun Heo8692d2d2020-09-01 14:52:45 -0400502 struct list_head surplus_list;
Tejun Heo97eb1972020-09-01 14:52:43 -0400503
Tejun Heo7caa4712019-08-28 15:05:58 -0700504 struct wait_queue_head waitq;
505 struct hrtimer waitq_timer;
506 struct hrtimer delay_timer;
507
Tejun Heo1aa50d02020-09-01 14:52:44 -0400508 /* timestamp at the latest activation */
509 u64 activated_at;
510
Tejun Heo97eb1972020-09-01 14:52:43 -0400511 /* statistics */
512 struct iocg_pcpu_stat __percpu *pcpu_stat;
513 struct iocg_stat local_stat;
514 struct iocg_stat desc_stat;
515 struct iocg_stat last_stat;
516 u64 last_stat_abs_vusage;
517
Tejun Heofe20cdb52020-09-01 14:52:38 -0400518 /* usage is recorded as fractions of WEIGHT_ONE */
Tejun Heo1aa50d02020-09-01 14:52:44 -0400519 u32 usage_delta_us;
Tejun Heo7caa4712019-08-28 15:05:58 -0700520 int usage_idx;
521 u32 usages[NR_USAGE_SLOTS];
522
523 /* this iocg's depth in the hierarchy and ancestors including self */
524 int level;
525 struct ioc_gq *ancestors[];
526};
527
528/* per cgroup */
529struct ioc_cgrp {
530 struct blkcg_policy_data cpd;
531 unsigned int dfl_weight;
532};
533
534struct ioc_now {
535 u64 now_ns;
Tejun Heoce955702020-09-01 14:52:40 -0400536 u64 now;
Tejun Heo7caa4712019-08-28 15:05:58 -0700537 u64 vnow;
538 u64 vrate;
539};
540
541struct iocg_wait {
542 struct wait_queue_entry wait;
543 struct bio *bio;
544 u64 abs_cost;
545 bool committed;
546};
547
548struct iocg_wake_ctx {
549 struct ioc_gq *iocg;
550 u32 hw_inuse;
551 s64 vbudget;
552};
553
554static const struct ioc_params autop[] = {
555 [AUTOP_HDD] = {
556 .qos = {
Tejun Heo7afccca2019-09-25 16:03:35 -0700557 [QOS_RLAT] = 250000, /* 250ms */
558 [QOS_WLAT] = 250000,
Tejun Heo7caa4712019-08-28 15:05:58 -0700559 [QOS_MIN] = VRATE_MIN_PPM,
560 [QOS_MAX] = VRATE_MAX_PPM,
561 },
562 .i_lcoefs = {
563 [I_LCOEF_RBPS] = 174019176,
564 [I_LCOEF_RSEQIOPS] = 41708,
565 [I_LCOEF_RRANDIOPS] = 370,
566 [I_LCOEF_WBPS] = 178075866,
567 [I_LCOEF_WSEQIOPS] = 42705,
568 [I_LCOEF_WRANDIOPS] = 378,
569 },
570 },
571 [AUTOP_SSD_QD1] = {
572 .qos = {
573 [QOS_RLAT] = 25000, /* 25ms */
574 [QOS_WLAT] = 25000,
575 [QOS_MIN] = VRATE_MIN_PPM,
576 [QOS_MAX] = VRATE_MAX_PPM,
577 },
578 .i_lcoefs = {
579 [I_LCOEF_RBPS] = 245855193,
580 [I_LCOEF_RSEQIOPS] = 61575,
581 [I_LCOEF_RRANDIOPS] = 6946,
582 [I_LCOEF_WBPS] = 141365009,
583 [I_LCOEF_WSEQIOPS] = 33716,
584 [I_LCOEF_WRANDIOPS] = 26796,
585 },
586 },
587 [AUTOP_SSD_DFL] = {
588 .qos = {
589 [QOS_RLAT] = 25000, /* 25ms */
590 [QOS_WLAT] = 25000,
591 [QOS_MIN] = VRATE_MIN_PPM,
592 [QOS_MAX] = VRATE_MAX_PPM,
593 },
594 .i_lcoefs = {
595 [I_LCOEF_RBPS] = 488636629,
596 [I_LCOEF_RSEQIOPS] = 8932,
597 [I_LCOEF_RRANDIOPS] = 8518,
598 [I_LCOEF_WBPS] = 427891549,
599 [I_LCOEF_WSEQIOPS] = 28755,
600 [I_LCOEF_WRANDIOPS] = 21940,
601 },
602 .too_fast_vrate_pct = 500,
603 },
604 [AUTOP_SSD_FAST] = {
605 .qos = {
606 [QOS_RLAT] = 5000, /* 5ms */
607 [QOS_WLAT] = 5000,
608 [QOS_MIN] = VRATE_MIN_PPM,
609 [QOS_MAX] = VRATE_MAX_PPM,
610 },
611 .i_lcoefs = {
612 [I_LCOEF_RBPS] = 3102524156LLU,
613 [I_LCOEF_RSEQIOPS] = 724816,
614 [I_LCOEF_RRANDIOPS] = 778122,
615 [I_LCOEF_WBPS] = 1742780862LLU,
616 [I_LCOEF_WSEQIOPS] = 425702,
617 [I_LCOEF_WRANDIOPS] = 443193,
618 },
619 .too_slow_vrate_pct = 10,
620 },
621};
622
623/*
624 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
625 * vtime credit shortage and down on device saturation.
626 */
627static u32 vrate_adj_pct[] =
628 { 0, 0, 0, 0,
629 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
630 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
631 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
632
633static struct blkcg_policy blkcg_policy_iocost;
634
635/* accessors and helpers */
636static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
637{
638 return container_of(rqos, struct ioc, rqos);
639}
640
641static struct ioc *q_to_ioc(struct request_queue *q)
642{
643 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
644}
645
646static const char *q_name(struct request_queue *q)
647{
648 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
649 return kobject_name(q->kobj.parent);
650 else
651 return "<unknown>";
652}
653
654static const char __maybe_unused *ioc_name(struct ioc *ioc)
655{
656 return q_name(ioc->rqos.q);
657}
658
659static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
660{
661 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
662}
663
664static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
665{
666 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
667}
668
669static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
670{
671 return pd_to_blkg(&iocg->pd);
672}
673
674static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
675{
676 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
677 struct ioc_cgrp, cpd);
678}
679
680/*
681 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo36a52482019-09-04 12:45:52 -0700682 * weight, the more expensive each IO. Must round up.
Tejun Heo7caa4712019-08-28 15:05:58 -0700683 */
684static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
685{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400686 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -0700687}
688
Tejun Heo36a52482019-09-04 12:45:52 -0700689/*
690 * The inverse of abs_cost_to_cost(). Must round up.
691 */
692static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
693{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400694 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
Tejun Heo36a52482019-09-04 12:45:52 -0700695}
696
Tejun Heo97eb1972020-09-01 14:52:43 -0400697static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
698 u64 abs_cost, u64 cost)
Tejun Heo7caa4712019-08-28 15:05:58 -0700699{
Tejun Heo97eb1972020-09-01 14:52:43 -0400700 struct iocg_pcpu_stat *gcs;
701
Tejun Heo7caa4712019-08-28 15:05:58 -0700702 bio->bi_iocost_cost = cost;
703 atomic64_add(cost, &iocg->vtime);
Tejun Heo97eb1972020-09-01 14:52:43 -0400704
705 gcs = get_cpu_ptr(iocg->pcpu_stat);
706 local64_add(abs_cost, &gcs->abs_vusage);
707 put_cpu_ptr(gcs);
Tejun Heo7caa4712019-08-28 15:05:58 -0700708}
709
Tejun Heoda437b92020-09-01 14:52:42 -0400710static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
711{
712 if (lock_ioc) {
713 spin_lock_irqsave(&iocg->ioc->lock, *flags);
714 spin_lock(&iocg->waitq.lock);
715 } else {
716 spin_lock_irqsave(&iocg->waitq.lock, *flags);
717 }
718}
719
720static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
721{
722 if (unlock_ioc) {
723 spin_unlock(&iocg->waitq.lock);
724 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
725 } else {
726 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
727 }
728}
729
Tejun Heo7caa4712019-08-28 15:05:58 -0700730#define CREATE_TRACE_POINTS
731#include <trace/events/iocost.h>
732
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400733static void ioc_refresh_margins(struct ioc *ioc)
734{
735 struct ioc_margins *margins = &ioc->margins;
736 u32 period_us = ioc->period_us;
737 u64 vrate = atomic64_read(&ioc->vtime_rate);
738
739 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
740 margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
741}
742
Tejun Heo7caa4712019-08-28 15:05:58 -0700743/* latency Qos params changed, update period_us and all the dependent params */
744static void ioc_refresh_period_us(struct ioc *ioc)
745{
746 u32 ppm, lat, multi, period_us;
747
748 lockdep_assert_held(&ioc->lock);
749
750 /* pick the higher latency target */
751 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
752 ppm = ioc->params.qos[QOS_RPPM];
753 lat = ioc->params.qos[QOS_RLAT];
754 } else {
755 ppm = ioc->params.qos[QOS_WPPM];
756 lat = ioc->params.qos[QOS_WLAT];
757 }
758
759 /*
760 * We want the period to be long enough to contain a healthy number
761 * of IOs while short enough for granular control. Define it as a
762 * multiple of the latency target. Ideally, the multiplier should
763 * be scaled according to the percentile so that it would nominally
764 * contain a certain number of requests. Let's be simpler and
765 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
766 */
767 if (ppm)
768 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
769 else
770 multi = 2;
771 period_us = multi * lat;
772 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
773
774 /* calculate dependent params */
775 ioc->period_us = period_us;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400776 ioc->timer_slack_ns = div64_u64(
777 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
778 100);
779 ioc_refresh_margins(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -0700780}
781
782static int ioc_autop_idx(struct ioc *ioc)
783{
784 int idx = ioc->autop_idx;
785 const struct ioc_params *p = &autop[idx];
786 u32 vrate_pct;
787 u64 now_ns;
788
789 /* rotational? */
790 if (!blk_queue_nonrot(ioc->rqos.q))
791 return AUTOP_HDD;
792
793 /* handle SATA SSDs w/ broken NCQ */
794 if (blk_queue_depth(ioc->rqos.q) == 1)
795 return AUTOP_SSD_QD1;
796
797 /* use one of the normal ssd sets */
798 if (idx < AUTOP_SSD_DFL)
799 return AUTOP_SSD_DFL;
800
801 /* if user is overriding anything, maintain what was there */
802 if (ioc->user_qos_params || ioc->user_cost_model)
803 return idx;
804
805 /* step up/down based on the vrate */
806 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
807 VTIME_PER_USEC);
808 now_ns = ktime_get_ns();
809
810 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
811 if (!ioc->autop_too_fast_at)
812 ioc->autop_too_fast_at = now_ns;
813 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
814 return idx + 1;
815 } else {
816 ioc->autop_too_fast_at = 0;
817 }
818
819 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
820 if (!ioc->autop_too_slow_at)
821 ioc->autop_too_slow_at = now_ns;
822 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
823 return idx - 1;
824 } else {
825 ioc->autop_too_slow_at = 0;
826 }
827
828 return idx;
829}
830
831/*
832 * Take the followings as input
833 *
834 * @bps maximum sequential throughput
835 * @seqiops maximum sequential 4k iops
836 * @randiops maximum random 4k iops
837 *
838 * and calculate the linear model cost coefficients.
839 *
840 * *@page per-page cost 1s / (@bps / 4096)
841 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
842 * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
843 */
844static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
845 u64 *page, u64 *seqio, u64 *randio)
846{
847 u64 v;
848
849 *page = *seqio = *randio = 0;
850
851 if (bps)
852 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
853 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
854
855 if (seqiops) {
856 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
857 if (v > *page)
858 *seqio = v - *page;
859 }
860
861 if (randiops) {
862 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
863 if (v > *page)
864 *randio = v - *page;
865 }
866}
867
868static void ioc_refresh_lcoefs(struct ioc *ioc)
869{
870 u64 *u = ioc->params.i_lcoefs;
871 u64 *c = ioc->params.lcoefs;
872
873 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
874 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
875 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
876 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
877}
878
879static bool ioc_refresh_params(struct ioc *ioc, bool force)
880{
881 const struct ioc_params *p;
882 int idx;
883
884 lockdep_assert_held(&ioc->lock);
885
886 idx = ioc_autop_idx(ioc);
887 p = &autop[idx];
888
889 if (idx == ioc->autop_idx && !force)
890 return false;
891
892 if (idx != ioc->autop_idx)
893 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
894
895 ioc->autop_idx = idx;
896 ioc->autop_too_fast_at = 0;
897 ioc->autop_too_slow_at = 0;
898
899 if (!ioc->user_qos_params)
900 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
901 if (!ioc->user_cost_model)
902 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
903
904 ioc_refresh_period_us(ioc);
905 ioc_refresh_lcoefs(ioc);
906
907 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
908 VTIME_PER_USEC, MILLION);
909 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
910 VTIME_PER_USEC, MILLION);
911
912 return true;
913}
914
915/* take a snapshot of the current [v]time and vrate */
916static void ioc_now(struct ioc *ioc, struct ioc_now *now)
917{
918 unsigned seq;
919
920 now->now_ns = ktime_get();
921 now->now = ktime_to_us(now->now_ns);
922 now->vrate = atomic64_read(&ioc->vtime_rate);
923
924 /*
925 * The current vtime is
926 *
927 * vtime at period start + (wallclock time since the start) * vrate
928 *
929 * As a consistent snapshot of `period_at_vtime` and `period_at` is
930 * needed, they're seqcount protected.
931 */
932 do {
933 seq = read_seqcount_begin(&ioc->period_seqcount);
934 now->vnow = ioc->period_at_vtime +
935 (now->now - ioc->period_at) * now->vrate;
936 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
937}
938
939static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
940{
Tejun Heo7caa4712019-08-28 15:05:58 -0700941 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
942
943 write_seqcount_begin(&ioc->period_seqcount);
944 ioc->period_at = now->now;
945 ioc->period_at_vtime = now->vnow;
946 write_seqcount_end(&ioc->period_seqcount);
947
948 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
949 add_timer(&ioc->timer);
950}
951
952/*
953 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
954 * weight sums and propagate upwards accordingly.
955 */
Tejun Heo00410f12020-09-01 14:52:34 -0400956static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700957{
958 struct ioc *ioc = iocg->ioc;
959 int lvl;
960
961 lockdep_assert_held(&ioc->lock);
962
Tejun Heodb84a722020-09-01 14:52:35 -0400963 inuse = clamp_t(u32, inuse, 1, active);
964
965 if (active == iocg->active && inuse == iocg->inuse)
966 return;
Tejun Heo7caa4712019-08-28 15:05:58 -0700967
968 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
969 struct ioc_gq *parent = iocg->ancestors[lvl];
970 struct ioc_gq *child = iocg->ancestors[lvl + 1];
971 u32 parent_active = 0, parent_inuse = 0;
972
973 /* update the level sums */
974 parent->child_active_sum += (s32)(active - child->active);
975 parent->child_inuse_sum += (s32)(inuse - child->inuse);
976 /* apply the udpates */
977 child->active = active;
978 child->inuse = inuse;
979
980 /*
981 * The delta between inuse and active sums indicates that
982 * that much of weight is being given away. Parent's inuse
983 * and active should reflect the ratio.
984 */
985 if (parent->child_active_sum) {
986 parent_active = parent->weight;
987 parent_inuse = DIV64_U64_ROUND_UP(
988 parent_active * parent->child_inuse_sum,
989 parent->child_active_sum);
990 }
991
992 /* do we need to keep walking up? */
993 if (parent_active == parent->active &&
994 parent_inuse == parent->inuse)
995 break;
996
997 active = parent_active;
998 inuse = parent_inuse;
999 }
1000
1001 ioc->weights_updated = true;
1002}
1003
Tejun Heo00410f12020-09-01 14:52:34 -04001004static void commit_weights(struct ioc *ioc)
Tejun Heo7caa4712019-08-28 15:05:58 -07001005{
1006 lockdep_assert_held(&ioc->lock);
1007
1008 if (ioc->weights_updated) {
1009 /* paired with rmb in current_hweight(), see there */
1010 smp_wmb();
1011 atomic_inc(&ioc->hweight_gen);
1012 ioc->weights_updated = false;
1013 }
1014}
1015
Tejun Heo00410f12020-09-01 14:52:34 -04001016static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -07001017{
Tejun Heo00410f12020-09-01 14:52:34 -04001018 __propagate_weights(iocg, active, inuse);
1019 commit_weights(iocg->ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001020}
1021
1022static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1023{
1024 struct ioc *ioc = iocg->ioc;
1025 int lvl;
1026 u32 hwa, hwi;
1027 int ioc_gen;
1028
1029 /* hot path - if uptodate, use cached */
1030 ioc_gen = atomic_read(&ioc->hweight_gen);
1031 if (ioc_gen == iocg->hweight_gen)
1032 goto out;
1033
1034 /*
Tejun Heo00410f12020-09-01 14:52:34 -04001035 * Paired with wmb in commit_weights(). If we saw the updated
1036 * hweight_gen, all the weight updates from __propagate_weights() are
1037 * visible too.
Tejun Heo7caa4712019-08-28 15:05:58 -07001038 *
1039 * We can race with weight updates during calculation and get it
1040 * wrong. However, hweight_gen would have changed and a future
1041 * reader will recalculate and we're guaranteed to discard the
1042 * wrong result soon.
1043 */
1044 smp_rmb();
1045
Tejun Heofe20cdb52020-09-01 14:52:38 -04001046 hwa = hwi = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07001047 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1048 struct ioc_gq *parent = iocg->ancestors[lvl];
1049 struct ioc_gq *child = iocg->ancestors[lvl + 1];
Tejun Heobd0adb92020-09-01 14:52:39 -04001050 u64 active_sum = READ_ONCE(parent->child_active_sum);
1051 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001052 u32 active = READ_ONCE(child->active);
1053 u32 inuse = READ_ONCE(child->inuse);
1054
1055 /* we can race with deactivations and either may read as zero */
1056 if (!active_sum || !inuse_sum)
1057 continue;
1058
Tejun Heobd0adb92020-09-01 14:52:39 -04001059 active_sum = max_t(u64, active, active_sum);
1060 hwa = div64_u64((u64)hwa * active, active_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001061
Tejun Heobd0adb92020-09-01 14:52:39 -04001062 inuse_sum = max_t(u64, inuse, inuse_sum);
1063 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001064 }
1065
1066 iocg->hweight_active = max_t(u32, hwa, 1);
1067 iocg->hweight_inuse = max_t(u32, hwi, 1);
1068 iocg->hweight_gen = ioc_gen;
1069out:
1070 if (hw_activep)
1071 *hw_activep = iocg->hweight_active;
1072 if (hw_inusep)
1073 *hw_inusep = iocg->hweight_inuse;
1074}
1075
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001076/*
1077 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1078 * other weights stay unchanged.
1079 */
1080static u32 current_hweight_max(struct ioc_gq *iocg)
1081{
1082 u32 hwm = WEIGHT_ONE;
1083 u32 inuse = iocg->active;
1084 u64 child_inuse_sum;
1085 int lvl;
1086
1087 lockdep_assert_held(&iocg->ioc->lock);
1088
1089 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1090 struct ioc_gq *parent = iocg->ancestors[lvl];
1091 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1092
1093 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1094 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1095 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1096 parent->child_active_sum);
1097 }
1098
1099 return max_t(u32, hwm, 1);
1100}
1101
Tejun Heo7caa4712019-08-28 15:05:58 -07001102static void weight_updated(struct ioc_gq *iocg)
1103{
1104 struct ioc *ioc = iocg->ioc;
1105 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1106 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1107 u32 weight;
1108
1109 lockdep_assert_held(&ioc->lock);
1110
1111 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1112 if (weight != iocg->weight && iocg->active)
Tejun Heo00410f12020-09-01 14:52:34 -04001113 propagate_weights(iocg, weight,
Tejun Heobd0adb92020-09-01 14:52:39 -04001114 DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1115 iocg->weight));
Tejun Heo7caa4712019-08-28 15:05:58 -07001116 iocg->weight = weight;
1117}
1118
1119static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1120{
1121 struct ioc *ioc = iocg->ioc;
1122 u64 last_period, cur_period, max_period_delta;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001123 u64 vtime, vmin;
Tejun Heo7caa4712019-08-28 15:05:58 -07001124 int i;
1125
1126 /*
1127 * If seem to be already active, just update the stamp to tell the
1128 * timer that we're still active. We don't mind occassional races.
1129 */
1130 if (!list_empty(&iocg->active_list)) {
1131 ioc_now(ioc, now);
1132 cur_period = atomic64_read(&ioc->cur_period);
1133 if (atomic64_read(&iocg->active_period) != cur_period)
1134 atomic64_set(&iocg->active_period, cur_period);
1135 return true;
1136 }
1137
1138 /* racy check on internal node IOs, treat as root level IOs */
1139 if (iocg->child_active_sum)
1140 return false;
1141
1142 spin_lock_irq(&ioc->lock);
1143
1144 ioc_now(ioc, now);
1145
1146 /* update period */
1147 cur_period = atomic64_read(&ioc->cur_period);
1148 last_period = atomic64_read(&iocg->active_period);
1149 atomic64_set(&iocg->active_period, cur_period);
1150
1151 /* already activated or breaking leaf-only constraint? */
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001152 if (!list_empty(&iocg->active_list))
1153 goto succeed_unlock;
1154 for (i = iocg->level - 1; i > 0; i--)
1155 if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo7caa4712019-08-28 15:05:58 -07001156 goto fail_unlock;
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001157
Tejun Heo7caa4712019-08-28 15:05:58 -07001158 if (iocg->child_active_sum)
1159 goto fail_unlock;
1160
1161 /*
1162 * vtime may wrap when vrate is raised substantially due to
1163 * underestimated IO costs. Look at the period and ignore its
1164 * vtime if the iocg has been idle for too long. Also, cap the
1165 * budget it can start with to the margin.
1166 */
1167 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1168 vtime = atomic64_read(&iocg->vtime);
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001169 vmin = now->vnow - ioc->margins.max;
Tejun Heo7caa4712019-08-28 15:05:58 -07001170
1171 if (last_period + max_period_delta < cur_period ||
1172 time_before64(vtime, vmin)) {
1173 atomic64_add(vmin - vtime, &iocg->vtime);
1174 atomic64_add(vmin - vtime, &iocg->done_vtime);
1175 vtime = vmin;
1176 }
1177
1178 /*
1179 * Activate, propagate weight and start period timer if not
1180 * running. Reset hweight_gen to avoid accidental match from
1181 * wrapping.
1182 */
1183 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1184 list_add(&iocg->active_list, &ioc->active_iocgs);
Tejun Heo00410f12020-09-01 14:52:34 -04001185 propagate_weights(iocg, iocg->weight,
1186 iocg->last_inuse ?: iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001187
1188 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1189 last_period, cur_period, vtime);
1190
Tejun Heo1aa50d02020-09-01 14:52:44 -04001191 iocg->activated_at = now->now;
Tejun Heo7caa4712019-08-28 15:05:58 -07001192
1193 if (ioc->running == IOC_IDLE) {
1194 ioc->running = IOC_RUNNING;
1195 ioc_start_period(ioc, now);
1196 }
1197
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001198succeed_unlock:
Tejun Heo7caa4712019-08-28 15:05:58 -07001199 spin_unlock_irq(&ioc->lock);
1200 return true;
1201
1202fail_unlock:
1203 spin_unlock_irq(&ioc->lock);
1204 return false;
1205}
1206
Tejun Heo6ef20f72020-09-01 14:52:36 -04001207static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1208{
1209 struct ioc *ioc = iocg->ioc;
1210 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1211 u64 vtime = atomic64_read(&iocg->vtime);
Tejun Heo6ef20f72020-09-01 14:52:36 -04001212 u64 delta_ns, expires, oexpires;
1213 u32 hw_inuse;
1214
1215 lockdep_assert_held(&iocg->waitq.lock);
1216
1217 /* debt-adjust vtime */
1218 current_hweight(iocg, NULL, &hw_inuse);
1219 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1220
1221 /*
1222 * Clear or maintain depending on the overage. Non-zero vdebt is what
1223 * guarantees that @iocg is online and future iocg_kick_delay() will
1224 * clear use_delay. Don't leave it on when there's no vdebt.
1225 */
1226 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1227 blkcg_clear_delay(blkg);
1228 return false;
1229 }
1230 if (!atomic_read(&blkg->use_delay) &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001231 time_before_eq64(vtime, now->vnow + ioc->margins.max))
Tejun Heo6ef20f72020-09-01 14:52:36 -04001232 return false;
1233
1234 /* use delay */
1235 delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1236 now->vrate) * NSEC_PER_USEC;
1237 blkcg_set_delay(blkg, delta_ns);
1238 expires = now->now_ns + delta_ns;
1239
1240 /* if already active and close enough, don't bother */
1241 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1242 if (hrtimer_is_queued(&iocg->delay_timer) &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001243 abs(oexpires - expires) <= ioc->timer_slack_ns)
Tejun Heo6ef20f72020-09-01 14:52:36 -04001244 return true;
1245
1246 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001247 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
Tejun Heo6ef20f72020-09-01 14:52:36 -04001248 return true;
1249}
1250
1251static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1252{
1253 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1254 struct ioc_now now;
1255 unsigned long flags;
1256
1257 spin_lock_irqsave(&iocg->waitq.lock, flags);
1258 ioc_now(iocg->ioc, &now);
1259 iocg_kick_delay(iocg, &now);
1260 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1261
1262 return HRTIMER_NORESTART;
1263}
1264
Tejun Heo7caa4712019-08-28 15:05:58 -07001265static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1266 int flags, void *key)
1267{
1268 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1269 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1270 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1271
1272 ctx->vbudget -= cost;
1273
1274 if (ctx->vbudget < 0)
1275 return -1;
1276
Tejun Heo97eb1972020-09-01 14:52:43 -04001277 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
Tejun Heo7caa4712019-08-28 15:05:58 -07001278
1279 /*
1280 * autoremove_wake_function() removes the wait entry only when it
1281 * actually changed the task state. We want the wait always
1282 * removed. Remove explicitly and use default_wake_function().
1283 */
1284 list_del_init(&wq_entry->entry);
1285 wait->committed = true;
1286
1287 default_wake_function(wq_entry, mode, flags, key);
1288 return 0;
1289}
1290
Tejun Heoda437b92020-09-01 14:52:42 -04001291/*
1292 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1293 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1294 * addition to iocg->waitq.lock.
1295 */
1296static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1297 struct ioc_now *now)
Tejun Heo7caa4712019-08-28 15:05:58 -07001298{
1299 struct ioc *ioc = iocg->ioc;
1300 struct iocg_wake_ctx ctx = { .iocg = iocg };
Tejun Heoda437b92020-09-01 14:52:42 -04001301 u64 vshortage, expires, oexpires;
Tejun Heo36a52482019-09-04 12:45:52 -07001302 s64 vbudget;
1303 u32 hw_inuse;
Tejun Heo7caa4712019-08-28 15:05:58 -07001304
1305 lockdep_assert_held(&iocg->waitq.lock);
1306
Tejun Heo36a52482019-09-04 12:45:52 -07001307 current_hweight(iocg, NULL, &hw_inuse);
1308 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1309
1310 /* pay off debt */
Tejun Heoda437b92020-09-01 14:52:42 -04001311 if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1312 u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
Tejun Heo36a52482019-09-04 12:45:52 -07001313 u64 delta = min_t(u64, vbudget, vdebt);
1314 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
Tejun Heo0b80f982020-05-04 19:27:54 -04001315 iocg->abs_vdebt);
Tejun Heo36a52482019-09-04 12:45:52 -07001316
Tejun Heoda437b92020-09-01 14:52:42 -04001317 lockdep_assert_held(&ioc->lock);
1318
Tejun Heo36a52482019-09-04 12:45:52 -07001319 atomic64_add(delta, &iocg->vtime);
1320 atomic64_add(delta, &iocg->done_vtime);
Tejun Heo0b80f982020-05-04 19:27:54 -04001321 iocg->abs_vdebt -= abs_delta;
Tejun Heoda437b92020-09-01 14:52:42 -04001322 vbudget -= vdebt;
Tejun Heo7b84b492020-09-01 14:52:37 -04001323
1324 iocg_kick_delay(iocg, now);
Tejun Heo36a52482019-09-04 12:45:52 -07001325 }
1326
Tejun Heo7caa4712019-08-28 15:05:58 -07001327 /*
Tejun Heoda437b92020-09-01 14:52:42 -04001328 * Debt can still be outstanding if we haven't paid all yet or the
1329 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1330 * under debt. Make sure @vbudget reflects the outstanding amount and is
1331 * not positive.
1332 */
1333 if (iocg->abs_vdebt) {
1334 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1335 vbudget = min_t(s64, 0, vbudget - vdebt);
1336 }
1337
1338 /*
Tejun Heo7caa4712019-08-28 15:05:58 -07001339 * Wake up the ones which are due and see how much vtime we'll need
1340 * for the next one.
1341 */
Tejun Heo36a52482019-09-04 12:45:52 -07001342 ctx.hw_inuse = hw_inuse;
Tejun Heoda437b92020-09-01 14:52:42 -04001343 ctx.vbudget = vbudget;
Tejun Heo7caa4712019-08-28 15:05:58 -07001344 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1345 if (!waitqueue_active(&iocg->waitq))
1346 return;
1347 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1348 return;
1349
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001350 /* determine next wakeup, add a timer margin to guarantee chunking */
Tejun Heo7caa4712019-08-28 15:05:58 -07001351 vshortage = -ctx.vbudget;
1352 expires = now->now_ns +
1353 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001354 expires += ioc->timer_slack_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -07001355
1356 /* if already active and close enough, don't bother */
1357 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1358 if (hrtimer_is_queued(&iocg->waitq_timer) &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001359 abs(oexpires - expires) <= ioc->timer_slack_ns)
Tejun Heo7caa4712019-08-28 15:05:58 -07001360 return;
1361
1362 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001363 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
Tejun Heo7caa4712019-08-28 15:05:58 -07001364}
1365
1366static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1367{
1368 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
Tejun Heoda437b92020-09-01 14:52:42 -04001369 bool pay_debt = READ_ONCE(iocg->abs_vdebt);
Tejun Heo7caa4712019-08-28 15:05:58 -07001370 struct ioc_now now;
1371 unsigned long flags;
1372
1373 ioc_now(iocg->ioc, &now);
1374
Tejun Heoda437b92020-09-01 14:52:42 -04001375 iocg_lock(iocg, pay_debt, &flags);
1376 iocg_kick_waitq(iocg, pay_debt, &now);
1377 iocg_unlock(iocg, pay_debt, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07001378
1379 return HRTIMER_NORESTART;
1380}
1381
Tejun Heo7caa4712019-08-28 15:05:58 -07001382static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1383{
1384 u32 nr_met[2] = { };
1385 u32 nr_missed[2] = { };
1386 u64 rq_wait_ns = 0;
1387 int cpu, rw;
1388
1389 for_each_online_cpu(cpu) {
1390 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1391 u64 this_rq_wait_ns;
1392
1393 for (rw = READ; rw <= WRITE; rw++) {
Tejun Heo5e124f72020-09-01 14:52:33 -04001394 u32 this_met = local_read(&stat->missed[rw].nr_met);
1395 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001396
1397 nr_met[rw] += this_met - stat->missed[rw].last_met;
1398 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1399 stat->missed[rw].last_met = this_met;
1400 stat->missed[rw].last_missed = this_missed;
1401 }
1402
Tejun Heo5e124f72020-09-01 14:52:33 -04001403 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
Tejun Heo7caa4712019-08-28 15:05:58 -07001404 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1405 stat->last_rq_wait_ns = this_rq_wait_ns;
1406 }
1407
1408 for (rw = READ; rw <= WRITE; rw++) {
1409 if (nr_met[rw] + nr_missed[rw])
1410 missed_ppm_ar[rw] =
1411 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1412 nr_met[rw] + nr_missed[rw]);
1413 else
1414 missed_ppm_ar[rw] = 0;
1415 }
1416
1417 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1418 ioc->period_us * NSEC_PER_USEC);
1419}
1420
1421/* was iocg idle this period? */
1422static bool iocg_is_idle(struct ioc_gq *iocg)
1423{
1424 struct ioc *ioc = iocg->ioc;
1425
1426 /* did something get issued this period? */
1427 if (atomic64_read(&iocg->active_period) ==
1428 atomic64_read(&ioc->cur_period))
1429 return false;
1430
1431 /* is something in flight? */
Tejun Heodcd65892020-03-10 13:07:46 -04001432 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo7caa4712019-08-28 15:05:58 -07001433 return false;
1434
1435 return true;
1436}
1437
Tejun Heo97eb1972020-09-01 14:52:43 -04001438/*
1439 * Call this function on the target leaf @iocg's to build pre-order traversal
1440 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1441 * ->walk_list and the caller is responsible for dissolving the list after use.
1442 */
1443static void iocg_build_inner_walk(struct ioc_gq *iocg,
1444 struct list_head *inner_walk)
1445{
1446 int lvl;
1447
1448 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1449
1450 /* find the first ancestor which hasn't been visited yet */
1451 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1452 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1453 break;
1454 }
1455
1456 /* walk down and visit the inner nodes to get pre-order traversal */
1457 while (++lvl <= iocg->level - 1) {
1458 struct ioc_gq *inner = iocg->ancestors[lvl];
1459
1460 /* record traversal order */
1461 list_add_tail(&inner->walk_list, inner_walk);
1462 }
1463}
1464
1465/* collect per-cpu counters and propagate the deltas to the parent */
1466static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1467{
1468 struct iocg_stat new_stat;
1469 u64 abs_vusage = 0;
1470 u64 vusage_delta;
1471 int cpu;
1472
1473 lockdep_assert_held(&iocg->ioc->lock);
1474
1475 /* collect per-cpu counters */
1476 for_each_possible_cpu(cpu) {
1477 abs_vusage += local64_read(
1478 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1479 }
1480 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1481 iocg->last_stat_abs_vusage = abs_vusage;
1482
Tejun Heo1aa50d02020-09-01 14:52:44 -04001483 iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate);
1484 iocg->local_stat.usage_us += iocg->usage_delta_us;
Tejun Heo97eb1972020-09-01 14:52:43 -04001485
1486 new_stat.usage_us =
1487 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1488
1489 /* propagate the deltas to the parent */
1490 if (iocg->level > 0) {
1491 struct iocg_stat *parent_stat =
1492 &iocg->ancestors[iocg->level - 1]->desc_stat;
1493
1494 parent_stat->usage_us +=
1495 new_stat.usage_us - iocg->last_stat.usage_us;
1496 }
1497
1498 iocg->last_stat = new_stat;
1499}
1500
1501/* get stat counters ready for reading on all active iocgs */
1502static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1503{
1504 LIST_HEAD(inner_walk);
1505 struct ioc_gq *iocg, *tiocg;
1506
1507 /* flush leaves and build inner node walk list */
1508 list_for_each_entry(iocg, target_iocgs, active_list) {
1509 iocg_flush_stat_one(iocg, now);
1510 iocg_build_inner_walk(iocg, &inner_walk);
1511 }
1512
1513 /* keep flushing upwards by walking the inner list backwards */
1514 list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1515 iocg_flush_stat_one(iocg, now);
1516 list_del_init(&iocg->walk_list);
1517 }
1518}
1519
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001520/*
1521 * Determine what @iocg's hweight_inuse should be after donating unused
1522 * capacity. @hwm is the upper bound and used to signal no donation. This
1523 * function also throws away @iocg's excess budget.
1524 */
1525static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
1526 struct ioc_now *now)
Tejun Heo7caa4712019-08-28 15:05:58 -07001527{
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001528 struct ioc *ioc = iocg->ioc;
1529 u64 vtime = atomic64_read(&iocg->vtime);
1530 s64 excess;
1531
1532 /* see whether minimum margin requirement is met */
1533 if (waitqueue_active(&iocg->waitq) ||
1534 time_after64(vtime, now->vnow - ioc->margins.min))
1535 return hwm;
1536
1537 /* throw away excess above max */
1538 excess = now->vnow - vtime - ioc->margins.max;
1539 if (excess > 0) {
1540 atomic64_add(excess, &iocg->vtime);
1541 atomic64_add(excess, &iocg->done_vtime);
1542 vtime += excess;
1543 }
1544
Tejun Heo7caa4712019-08-28 15:05:58 -07001545 /* add margin */
1546 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1547 usage += SURPLUS_SCALE_ABS;
1548
1549 /* don't bother if the surplus is too small */
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001550 if (usage + SURPLUS_MIN_ADJ_DELTA > hwm)
1551 return hwm;
Tejun Heo7caa4712019-08-28 15:05:58 -07001552
1553 return usage;
1554}
1555
Tejun Heoe08d02a2020-09-01 14:52:48 -04001556/*
1557 * For work-conservation, an iocg which isn't using all of its share should
1558 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1559 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1560 *
1561 * #1 is mathematically simpler but has the drawback of requiring synchronous
1562 * global hweight_inuse updates when idle iocg's get activated or inuse weights
1563 * change due to donation snapbacks as it has the possibility of grossly
1564 * overshooting what's allowed by the model and vrate.
1565 *
1566 * #2 is inherently safe with local operations. The donating iocg can easily
1567 * snap back to higher weights when needed without worrying about impacts on
1568 * other nodes as the impacts will be inherently correct. This also makes idle
1569 * iocg activations safe. The only effect activations have is decreasing
1570 * hweight_inuse of others, the right solution to which is for those iocgs to
1571 * snap back to higher weights.
1572 *
1573 * So, we go with #2. The challenge is calculating how each donating iocg's
1574 * inuse should be adjusted to achieve the target donation amounts. This is done
1575 * using Andy's method described in the following pdf.
1576 *
1577 * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1578 *
1579 * Given the weights and target after-donation hweight_inuse values, Andy's
1580 * method determines how the proportional distribution should look like at each
1581 * sibling level to maintain the relative relationship between all non-donating
1582 * pairs. To roughly summarize, it divides the tree into donating and
1583 * non-donating parts, calculates global donation rate which is used to
1584 * determine the target hweight_inuse for each node, and then derives per-level
1585 * proportions.
1586 *
1587 * The following pdf shows that global distribution calculated this way can be
1588 * achieved by scaling inuse weights of donating leaves and propagating the
1589 * adjustments upwards proportionally.
1590 *
1591 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1592 *
1593 * Combining the above two, we can determine how each leaf iocg's inuse should
1594 * be adjusted to achieve the target donation.
1595 *
1596 * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1597 *
1598 * The inline comments use symbols from the last pdf.
1599 *
1600 * b is the sum of the absolute budgets in the subtree. 1 for the root node.
1601 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1602 * t is the sum of the absolute budgets of donating nodes in the subtree.
1603 * w is the weight of the node. w = w_f + w_t
1604 * w_f is the non-donating portion of w. w_f = w * f / b
1605 * w_b is the donating portion of w. w_t = w * t / b
1606 * s is the sum of all sibling weights. s = Sum(w) for siblings
1607 * s_f and s_t are the non-donating and donating portions of s.
1608 *
1609 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1610 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1611 * after adjustments. Subscript r denotes the root node's values.
1612 */
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001613static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1614{
Tejun Heoe08d02a2020-09-01 14:52:48 -04001615 LIST_HEAD(over_hwa);
1616 LIST_HEAD(inner_walk);
1617 struct ioc_gq *iocg, *tiocg, *root_iocg;
1618 u32 after_sum, over_sum, over_target, gamma;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001619
Tejun Heoe08d02a2020-09-01 14:52:48 -04001620 /*
1621 * It's pretty unlikely but possible for the total sum of
1622 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1623 * confuse the following calculations. If such condition is detected,
1624 * scale down everyone over its full share equally to keep the sum below
1625 * WEIGHT_ONE.
1626 */
1627 after_sum = 0;
1628 over_sum = 0;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001629 list_for_each_entry(iocg, surpluses, surplus_list) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001630 u32 hwa;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001631
Tejun Heoe08d02a2020-09-01 14:52:48 -04001632 current_hweight(iocg, &hwa, NULL);
1633 after_sum += iocg->hweight_after_donation;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001634
Tejun Heoe08d02a2020-09-01 14:52:48 -04001635 if (iocg->hweight_after_donation > hwa) {
1636 over_sum += iocg->hweight_after_donation;
1637 list_add(&iocg->walk_list, &over_hwa);
1638 }
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001639 }
Tejun Heoe08d02a2020-09-01 14:52:48 -04001640
1641 if (after_sum >= WEIGHT_ONE) {
1642 /*
1643 * The delta should be deducted from the over_sum, calculate
1644 * target over_sum value.
1645 */
1646 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1647 WARN_ON_ONCE(over_sum <= over_delta);
1648 over_target = over_sum - over_delta;
1649 } else {
1650 over_target = 0;
1651 }
1652
1653 list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1654 if (over_target)
1655 iocg->hweight_after_donation =
1656 div_u64((u64)iocg->hweight_after_donation *
1657 over_target, over_sum);
1658 list_del_init(&iocg->walk_list);
1659 }
1660
1661 /*
1662 * Build pre-order inner node walk list and prepare for donation
1663 * adjustment calculations.
1664 */
1665 list_for_each_entry(iocg, surpluses, surplus_list) {
1666 iocg_build_inner_walk(iocg, &inner_walk);
1667 }
1668
1669 root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1670 WARN_ON_ONCE(root_iocg->level > 0);
1671
1672 list_for_each_entry(iocg, &inner_walk, walk_list) {
1673 iocg->child_adjusted_sum = 0;
1674 iocg->hweight_donating = 0;
1675 iocg->hweight_after_donation = 0;
1676 }
1677
1678 /*
1679 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1680 * up the hierarchy.
1681 */
1682 list_for_each_entry(iocg, surpluses, surplus_list) {
1683 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1684
1685 parent->hweight_donating += iocg->hweight_donating;
1686 parent->hweight_after_donation += iocg->hweight_after_donation;
1687 }
1688
1689 list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1690 if (iocg->level > 0) {
1691 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1692
1693 parent->hweight_donating += iocg->hweight_donating;
1694 parent->hweight_after_donation += iocg->hweight_after_donation;
1695 }
1696 }
1697
1698 /*
1699 * Calculate inner hwa's (b) and make sure the donation values are
1700 * within the accepted ranges as we're doing low res calculations with
1701 * roundups.
1702 */
1703 list_for_each_entry(iocg, &inner_walk, walk_list) {
1704 if (iocg->level) {
1705 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1706
1707 iocg->hweight_active = DIV64_U64_ROUND_UP(
1708 (u64)parent->hweight_active * iocg->active,
1709 parent->child_active_sum);
1710
1711 }
1712
1713 iocg->hweight_donating = min(iocg->hweight_donating,
1714 iocg->hweight_active);
1715 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1716 iocg->hweight_donating - 1);
1717 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1718 iocg->hweight_donating <= 1 ||
1719 iocg->hweight_after_donation == 0)) {
1720 pr_warn("iocg: invalid donation weights in ");
1721 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1722 pr_cont(": active=%u donating=%u after=%u\n",
1723 iocg->hweight_active, iocg->hweight_donating,
1724 iocg->hweight_after_donation);
1725 }
1726 }
1727
1728 /*
1729 * Calculate the global donation rate (gamma) - the rate to adjust
1730 * non-donating budgets by. No need to use 64bit multiplication here as
1731 * the first operand is guaranteed to be smaller than WEIGHT_ONE
1732 * (1<<16).
1733 *
1734 * gamma = (1 - t_r') / (1 - t_r)
1735 */
1736 gamma = DIV_ROUND_UP(
1737 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1738 WEIGHT_ONE - root_iocg->hweight_donating);
1739
1740 /*
1741 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1742 * nodes.
1743 */
1744 list_for_each_entry(iocg, &inner_walk, walk_list) {
1745 struct ioc_gq *parent;
1746 u32 inuse, wpt, wptp;
1747 u64 st, sf;
1748
1749 if (iocg->level == 0) {
1750 /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1751 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1752 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1753 WEIGHT_ONE - iocg->hweight_after_donation);
1754 continue;
1755 }
1756
1757 parent = iocg->ancestors[iocg->level - 1];
1758
1759 /* b' = gamma * b_f + b_t' */
1760 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1761 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1762 WEIGHT_ONE) + iocg->hweight_after_donation;
1763
1764 /* w' = s' * b' / b'_p */
1765 inuse = DIV64_U64_ROUND_UP(
1766 (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1767 parent->hweight_inuse);
1768
1769 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1770 st = DIV64_U64_ROUND_UP(
1771 iocg->child_active_sum * iocg->hweight_donating,
1772 iocg->hweight_active);
1773 sf = iocg->child_active_sum - st;
1774 wpt = DIV64_U64_ROUND_UP(
1775 (u64)iocg->active * iocg->hweight_donating,
1776 iocg->hweight_active);
1777 wptp = DIV64_U64_ROUND_UP(
1778 (u64)inuse * iocg->hweight_after_donation,
1779 iocg->hweight_inuse);
1780
1781 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1782 }
1783
1784 /*
1785 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
1786 * we can finally determine leaf adjustments.
1787 */
1788 list_for_each_entry(iocg, surpluses, surplus_list) {
1789 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1790 u32 inuse;
1791
1792 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
1793 inuse = DIV64_U64_ROUND_UP(
1794 parent->child_adjusted_sum * iocg->hweight_after_donation,
1795 parent->hweight_inuse);
1796 __propagate_weights(iocg, iocg->active, inuse);
1797 }
1798
1799 /* walk list should be dissolved after use */
1800 list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
1801 list_del_init(&iocg->walk_list);
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001802}
1803
Tejun Heo7caa4712019-08-28 15:05:58 -07001804static void ioc_timer_fn(struct timer_list *timer)
1805{
1806 struct ioc *ioc = container_of(timer, struct ioc, timer);
1807 struct ioc_gq *iocg, *tiocg;
1808 struct ioc_now now;
Tejun Heo8692d2d2020-09-01 14:52:45 -04001809 LIST_HEAD(surpluses);
Tejun Heo065655c2020-09-01 14:52:46 -04001810 int nr_shortages = 0, nr_lagging = 0;
Tejun Heo7caa4712019-08-28 15:05:58 -07001811 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1812 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1813 u32 missed_ppm[2], rq_wait_pct;
1814 u64 period_vtime;
Tejun Heo25d41e42019-09-25 16:02:07 -07001815 int prev_busy_level, i;
Tejun Heo7caa4712019-08-28 15:05:58 -07001816
1817 /* how were the latencies during the period? */
1818 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1819
1820 /* take care of active iocgs */
1821 spin_lock_irq(&ioc->lock);
1822
1823 ioc_now(ioc, &now);
1824
1825 period_vtime = now.vnow - ioc->period_at_vtime;
1826 if (WARN_ON_ONCE(!period_vtime)) {
1827 spin_unlock_irq(&ioc->lock);
1828 return;
1829 }
1830
Tejun Heo97eb1972020-09-01 14:52:43 -04001831 iocg_flush_stat(&ioc->active_iocgs, &now);
1832
Tejun Heo7caa4712019-08-28 15:05:58 -07001833 /*
1834 * Waiters determine the sleep durations based on the vrate they
1835 * saw at the time of sleep. If vrate has increased, some waiters
1836 * could be sleeping for too long. Wake up tardy waiters which
1837 * should have woken up in the last period and expire idle iocgs.
1838 */
1839 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Chengming Zhoud9012a52020-07-30 17:03:21 +08001840 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001841 !iocg_is_idle(iocg))
Tejun Heo7caa4712019-08-28 15:05:58 -07001842 continue;
1843
1844 spin_lock(&iocg->waitq.lock);
1845
Tejun Heo0b80f982020-05-04 19:27:54 -04001846 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001847 /* might be oversleeping vtime / hweight changes, kick */
Tejun Heoda437b92020-09-01 14:52:42 -04001848 iocg_kick_waitq(iocg, true, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07001849 } else if (iocg_is_idle(iocg)) {
1850 /* no waiter and idle, deactivate */
1851 iocg->last_inuse = iocg->inuse;
Tejun Heo00410f12020-09-01 14:52:34 -04001852 __propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07001853 list_del_init(&iocg->active_list);
1854 }
1855
1856 spin_unlock(&iocg->waitq.lock);
1857 }
Tejun Heo00410f12020-09-01 14:52:34 -04001858 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001859
1860 /* calc usages and see whether some weights need to be moved around */
1861 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001862 u64 vdone, vtime, usage_us;
Tejun Heo7caa4712019-08-28 15:05:58 -07001863 u32 hw_active, hw_inuse, usage;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001864 int uidx, nr_valid;
Tejun Heo7caa4712019-08-28 15:05:58 -07001865
1866 /*
1867 * Collect unused and wind vtime closer to vnow to prevent
1868 * iocgs from accumulating a large amount of budget.
1869 */
1870 vdone = atomic64_read(&iocg->done_vtime);
1871 vtime = atomic64_read(&iocg->vtime);
1872 current_hweight(iocg, &hw_active, &hw_inuse);
1873
1874 /*
1875 * Latency QoS detection doesn't account for IOs which are
1876 * in-flight for longer than a period. Detect them by
1877 * comparing vdone against period start. If lagging behind
1878 * IOs from past periods, don't increase vrate.
1879 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001880 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1881 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001882 time_after64(vtime, vdone) &&
1883 time_after64(vtime, now.vnow -
1884 MAX_LAGGING_PERIODS * period_vtime) &&
1885 time_before64(vdone, now.vnow - period_vtime))
1886 nr_lagging++;
1887
Tejun Heo7caa4712019-08-28 15:05:58 -07001888 /*
Tejun Heo1aa50d02020-09-01 14:52:44 -04001889 * Determine absolute usage factoring in pending and in-flight
1890 * IOs to avoid stalls and high-latency completions appearing as
1891 * idle.
Tejun Heo7caa4712019-08-28 15:05:58 -07001892 */
Tejun Heo1aa50d02020-09-01 14:52:44 -04001893 usage_us = iocg->usage_delta_us;
1894 if (waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow))
1895 usage_us += DIV64_U64_ROUND_UP(
1896 cost_to_abs_cost(now.vnow - vtime, hw_inuse),
1897 now.vrate);
1898 if (vdone != vtime) {
1899 u64 inflight_us = DIV64_U64_ROUND_UP(
1900 cost_to_abs_cost(vtime - vdone, hw_inuse),
1901 now.vrate);
1902 usage_us = max(usage_us, inflight_us);
1903 }
Tejun Heo7caa4712019-08-28 15:05:58 -07001904
Tejun Heo1aa50d02020-09-01 14:52:44 -04001905 /* convert to hweight based usage ratio and record */
1906 uidx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1907
1908 if (time_after64(vtime, now.vnow - ioc->margins.min)) {
1909 iocg->usage_idx = uidx;
1910 iocg->usages[uidx] = WEIGHT_ONE;
1911 } else if (usage_us) {
1912 u64 started_at, dur;
1913
1914 if (time_after64(iocg->activated_at, ioc->period_at))
1915 started_at = iocg->activated_at;
1916 else
1917 started_at = ioc->period_at;
1918
1919 dur = max_t(u64, now.now - started_at, 1);
Tejun Heo1aa50d02020-09-01 14:52:44 -04001920
1921 iocg->usage_idx = uidx;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001922 iocg->usages[uidx] = clamp_t(u32,
1923 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, dur),
1924 1, WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07001925 }
1926
Tejun Heo7caa4712019-08-28 15:05:58 -07001927 /* base the decision on max historical usage */
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001928 for (i = 0, usage = 0, nr_valid = 0; i < NR_USAGE_SLOTS; i++) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001929 if (iocg->usages[i]) {
1930 usage = max(usage, iocg->usages[i]);
1931 nr_valid++;
1932 }
1933 }
1934 if (nr_valid < MIN_VALID_USAGES)
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001935 usage = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07001936
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001937 /* see whether there's surplus vtime */
1938 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
1939 if (hw_inuse < hw_active ||
1940 (!waitqueue_active(&iocg->waitq) &&
1941 time_before64(vtime, now.vnow - ioc->margins.max))) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001942 u32 hwa, hwm, new_hwi;
Tejun Heo7caa4712019-08-28 15:05:58 -07001943
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001944 /*
1945 * Already donating or accumulated enough to start.
1946 * Determine the donation amount.
1947 */
Tejun Heoe08d02a2020-09-01 14:52:48 -04001948 current_hweight(iocg, &hwa, NULL);
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001949 hwm = current_hweight_max(iocg);
1950 new_hwi = hweight_after_donation(iocg, hwm, usage,
1951 &now);
1952 if (new_hwi < hwm) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001953 iocg->hweight_donating = hwa;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001954 iocg->hweight_after_donation = new_hwi;
1955 list_add(&iocg->surplus_list, &surpluses);
1956 } else {
1957 __propagate_weights(iocg, iocg->active,
1958 iocg->active);
1959 nr_shortages++;
1960 }
1961 } else {
1962 /* genuinely short on vtime */
1963 nr_shortages++;
Tejun Heo7caa4712019-08-28 15:05:58 -07001964 }
1965 }
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001966
1967 if (!list_empty(&surpluses) && nr_shortages)
1968 transfer_surpluses(&surpluses, &now);
1969
Tejun Heo00410f12020-09-01 14:52:34 -04001970 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001971
Tejun Heo8692d2d2020-09-01 14:52:45 -04001972 /* surplus list should be dissolved after use */
1973 list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
1974 list_del_init(&iocg->surplus_list);
1975
Tejun Heo7caa4712019-08-28 15:05:58 -07001976 /*
1977 * If q is getting clogged or we're missing too much, we're issuing
1978 * too much IO and should lower vtime rate. If we're not missing
1979 * and experiencing shortages but not surpluses, we're too stingy
1980 * and should increase vtime rate.
1981 */
Tejun Heo25d41e42019-09-25 16:02:07 -07001982 prev_busy_level = ioc->busy_level;
Tejun Heo7caa4712019-08-28 15:05:58 -07001983 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1984 missed_ppm[READ] > ppm_rthr ||
1985 missed_ppm[WRITE] > ppm_wthr) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001986 /* clearly missing QoS targets, slow down vrate */
Tejun Heo7caa4712019-08-28 15:05:58 -07001987 ioc->busy_level = max(ioc->busy_level, 0);
1988 ioc->busy_level++;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001989 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001990 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1991 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001992 /* QoS targets are being met with >25% margin */
1993 if (nr_shortages) {
1994 /*
1995 * We're throttling while the device has spare
1996 * capacity. If vrate was being slowed down, stop.
1997 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001998 ioc->busy_level = min(ioc->busy_level, 0);
Tejun Heo81ca6272019-10-14 17:18:11 -07001999
2000 /*
2001 * If there are IOs spanning multiple periods, wait
Tejun Heo065655c2020-09-01 14:52:46 -04002002 * them out before pushing the device harder.
Tejun Heo81ca6272019-10-14 17:18:11 -07002003 */
Tejun Heo065655c2020-09-01 14:52:46 -04002004 if (!nr_lagging)
Tejun Heo7cd806a2019-09-25 16:03:09 -07002005 ioc->busy_level--;
Tejun Heo81ca6272019-10-14 17:18:11 -07002006 } else {
2007 /*
2008 * Nobody is being throttled and the users aren't
2009 * issuing enough IOs to saturate the device. We
2010 * simply don't know how close the device is to
2011 * saturation. Coast.
2012 */
2013 ioc->busy_level = 0;
Tejun Heo7cd806a2019-09-25 16:03:09 -07002014 }
Tejun Heo7caa4712019-08-28 15:05:58 -07002015 } else {
Tejun Heo81ca6272019-10-14 17:18:11 -07002016 /* inside the hysterisis margin, we're good */
Tejun Heo7caa4712019-08-28 15:05:58 -07002017 ioc->busy_level = 0;
2018 }
2019
2020 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2021
Tejun Heo7cd806a2019-09-25 16:03:09 -07002022 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07002023 u64 vrate = atomic64_read(&ioc->vtime_rate);
2024 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
2025
2026 /* rq_wait signal is always reliable, ignore user vrate_min */
2027 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
2028 vrate_min = VRATE_MIN;
2029
2030 /*
2031 * If vrate is out of bounds, apply clamp gradually as the
2032 * bounds can change abruptly. Otherwise, apply busy_level
2033 * based adjustment.
2034 */
2035 if (vrate < vrate_min) {
2036 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
2037 100);
2038 vrate = min(vrate, vrate_min);
2039 } else if (vrate > vrate_max) {
2040 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
2041 100);
2042 vrate = max(vrate, vrate_max);
2043 } else {
2044 int idx = min_t(int, abs(ioc->busy_level),
2045 ARRAY_SIZE(vrate_adj_pct) - 1);
2046 u32 adj_pct = vrate_adj_pct[idx];
2047
2048 if (ioc->busy_level > 0)
2049 adj_pct = 100 - adj_pct;
2050 else
2051 adj_pct = 100 + adj_pct;
2052
2053 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
2054 vrate_min, vrate_max);
2055 }
2056
Waiman Longd6c8e942020-04-21 09:07:55 -04002057 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
Tejun Heo065655c2020-09-01 14:52:46 -04002058 nr_lagging, nr_shortages);
Tejun Heo7caa4712019-08-28 15:05:58 -07002059
2060 atomic64_set(&ioc->vtime_rate, vrate);
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04002061 ioc_refresh_margins(ioc);
Tejun Heo25d41e42019-09-25 16:02:07 -07002062 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
2063 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
Waiman Longd6c8e942020-04-21 09:07:55 -04002064 missed_ppm, rq_wait_pct, nr_lagging,
Tejun Heo065655c2020-09-01 14:52:46 -04002065 nr_shortages);
Tejun Heo7caa4712019-08-28 15:05:58 -07002066 }
2067
2068 ioc_refresh_params(ioc, false);
2069
2070 /*
2071 * This period is done. Move onto the next one. If nothing's
2072 * going on with the device, stop the timer.
2073 */
2074 atomic64_inc(&ioc->cur_period);
2075
2076 if (ioc->running != IOC_STOP) {
2077 if (!list_empty(&ioc->active_iocgs)) {
2078 ioc_start_period(ioc, &now);
2079 } else {
2080 ioc->busy_level = 0;
2081 ioc->running = IOC_IDLE;
2082 }
2083 }
2084
2085 spin_unlock_irq(&ioc->lock);
2086}
2087
2088static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2089 bool is_merge, u64 *costp)
2090{
2091 struct ioc *ioc = iocg->ioc;
2092 u64 coef_seqio, coef_randio, coef_page;
2093 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2094 u64 seek_pages = 0;
2095 u64 cost = 0;
2096
2097 switch (bio_op(bio)) {
2098 case REQ_OP_READ:
2099 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
2100 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
2101 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
2102 break;
2103 case REQ_OP_WRITE:
2104 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
2105 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
2106 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
2107 break;
2108 default:
2109 goto out;
2110 }
2111
2112 if (iocg->cursor) {
2113 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2114 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2115 }
2116
2117 if (!is_merge) {
2118 if (seek_pages > LCOEF_RANDIO_PAGES) {
2119 cost += coef_randio;
2120 } else {
2121 cost += coef_seqio;
2122 }
2123 }
2124 cost += pages * coef_page;
2125out:
2126 *costp = cost;
2127}
2128
2129static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2130{
2131 u64 cost;
2132
2133 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2134 return cost;
2135}
2136
Tejun Heocd006502020-04-13 12:27:56 -04002137static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2138 u64 *costp)
2139{
2140 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2141
2142 switch (req_op(rq)) {
2143 case REQ_OP_READ:
2144 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2145 break;
2146 case REQ_OP_WRITE:
2147 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2148 break;
2149 default:
2150 *costp = 0;
2151 }
2152}
2153
2154static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2155{
2156 u64 cost;
2157
2158 calc_size_vtime_cost_builtin(rq, ioc, &cost);
2159 return cost;
2160}
2161
Tejun Heo7caa4712019-08-28 15:05:58 -07002162static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2163{
2164 struct blkcg_gq *blkg = bio->bi_blkg;
2165 struct ioc *ioc = rqos_to_ioc(rqos);
2166 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2167 struct ioc_now now;
2168 struct iocg_wait wait;
2169 u32 hw_active, hw_inuse;
2170 u64 abs_cost, cost, vtime;
Tejun Heoda437b92020-09-01 14:52:42 -04002171 bool use_debt, ioc_locked;
2172 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002173
2174 /* bypass IOs if disabled or for root cgroup */
2175 if (!ioc->enabled || !iocg->level)
2176 return;
2177
2178 /* always activate so that even 0 cost IOs get protected to some level */
2179 if (!iocg_activate(iocg, &now))
2180 return;
2181
2182 /* calculate the absolute vtime cost */
2183 abs_cost = calc_vtime_cost(bio, iocg, false);
2184 if (!abs_cost)
2185 return;
2186
2187 iocg->cursor = bio_end_sector(bio);
2188
2189 vtime = atomic64_read(&iocg->vtime);
2190 current_hweight(iocg, &hw_active, &hw_inuse);
2191
2192 if (hw_inuse < hw_active &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04002193 time_after_eq64(vtime + ioc->margins.min, now.vnow)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07002194 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
2195 iocg->inuse, iocg->weight, hw_inuse, hw_active);
2196 spin_lock_irq(&ioc->lock);
Tejun Heo00410f12020-09-01 14:52:34 -04002197 propagate_weights(iocg, iocg->weight, iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07002198 spin_unlock_irq(&ioc->lock);
2199 current_hweight(iocg, &hw_active, &hw_inuse);
2200 }
2201
2202 cost = abs_cost_to_cost(abs_cost, hw_inuse);
2203
2204 /*
2205 * If no one's waiting and within budget, issue right away. The
2206 * tests are racy but the races aren't systemic - we only miss once
2207 * in a while which is fine.
2208 */
Tejun Heo0b80f982020-05-04 19:27:54 -04002209 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo7caa4712019-08-28 15:05:58 -07002210 time_before_eq64(vtime + cost, now.vnow)) {
Tejun Heo97eb1972020-09-01 14:52:43 -04002211 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo7caa4712019-08-28 15:05:58 -07002212 return;
2213 }
2214
Tejun Heo36a52482019-09-04 12:45:52 -07002215 /*
Tejun Heoda437b92020-09-01 14:52:42 -04002216 * We're over budget. This can be handled in two ways. IOs which may
2217 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2218 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2219 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2220 * whether debt handling is needed and acquire locks accordingly.
Tejun Heo0b80f982020-05-04 19:27:54 -04002221 */
Tejun Heoda437b92020-09-01 14:52:42 -04002222 use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2223 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
Tejun Heo0b80f982020-05-04 19:27:54 -04002224
Tejun Heoda437b92020-09-01 14:52:42 -04002225 iocg_lock(iocg, ioc_locked, &flags);
2226
2227 /*
2228 * @iocg must stay activated for debt and waitq handling. Deactivation
2229 * is synchronized against both ioc->lock and waitq.lock and we won't
2230 * get deactivated as long as we're waiting or has debt, so we're good
2231 * if we're activated here. In the unlikely cases that we aren't, just
2232 * issue the IO.
2233 */
Tejun Heo0b80f982020-05-04 19:27:54 -04002234 if (unlikely(list_empty(&iocg->active_list))) {
Tejun Heoda437b92020-09-01 14:52:42 -04002235 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo97eb1972020-09-01 14:52:43 -04002236 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002237 return;
2238 }
2239
2240 /*
2241 * We're over budget. If @bio has to be issued regardless, remember
2242 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2243 * off the debt before waking more IOs.
2244 *
Tejun Heo36a52482019-09-04 12:45:52 -07002245 * This way, the debt is continuously paid off each period with the
Tejun Heo0b80f982020-05-04 19:27:54 -04002246 * actual budget available to the cgroup. If we just wound vtime, we
2247 * would incorrectly use the current hw_inuse for the entire amount
2248 * which, for example, can lead to the cgroup staying blocked for a
2249 * long time even with substantially raised hw_inuse.
2250 *
2251 * An iocg with vdebt should stay online so that the timer can keep
2252 * deducting its vdebt and [de]activate use_delay mechanism
2253 * accordingly. We don't want to race against the timer trying to
2254 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2255 * penalizing the cgroup and its descendants.
Tejun Heo36a52482019-09-04 12:45:52 -07002256 */
Tejun Heoda437b92020-09-01 14:52:42 -04002257 if (use_debt) {
Tejun Heo0b80f982020-05-04 19:27:54 -04002258 iocg->abs_vdebt += abs_cost;
Tejun Heo54c52e12020-04-13 12:27:55 -04002259 if (iocg_kick_delay(iocg, &now))
Tejun Heod7bd15a2019-12-16 13:34:00 -08002260 blkcg_schedule_throttle(rqos->q,
2261 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heoda437b92020-09-01 14:52:42 -04002262 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002263 return;
2264 }
2265
2266 /*
2267 * Append self to the waitq and schedule the wakeup timer if we're
2268 * the first waiter. The timer duration is calculated based on the
2269 * current vrate. vtime and hweight changes can make it too short
2270 * or too long. Each wait entry records the absolute cost it's
2271 * waiting for to allow re-evaluation using a custom wait entry.
2272 *
2273 * If too short, the timer simply reschedules itself. If too long,
2274 * the period timer will notice and trigger wakeups.
2275 *
2276 * All waiters are on iocg->waitq and the wait states are
2277 * synchronized using waitq.lock.
2278 */
Tejun Heo7caa4712019-08-28 15:05:58 -07002279 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2280 wait.wait.private = current;
2281 wait.bio = bio;
2282 wait.abs_cost = abs_cost;
2283 wait.committed = false; /* will be set true by waker */
2284
2285 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
Tejun Heoda437b92020-09-01 14:52:42 -04002286 iocg_kick_waitq(iocg, ioc_locked, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07002287
Tejun Heoda437b92020-09-01 14:52:42 -04002288 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002289
2290 while (true) {
2291 set_current_state(TASK_UNINTERRUPTIBLE);
2292 if (wait.committed)
2293 break;
2294 io_schedule();
2295 }
2296
2297 /* waker already committed us, proceed */
2298 finish_wait(&iocg->waitq, &wait.wait);
2299}
2300
2301static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2302 struct bio *bio)
2303{
2304 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heoe1518f62019-09-04 12:45:53 -07002305 struct ioc *ioc = iocg->ioc;
Tejun Heo7caa4712019-08-28 15:05:58 -07002306 sector_t bio_end = bio_end_sector(bio);
Tejun Heoe1518f62019-09-04 12:45:53 -07002307 struct ioc_now now;
Tejun Heo7caa4712019-08-28 15:05:58 -07002308 u32 hw_inuse;
2309 u64 abs_cost, cost;
Tejun Heo0b80f982020-05-04 19:27:54 -04002310 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002311
Tejun Heoe1518f62019-09-04 12:45:53 -07002312 /* bypass if disabled or for root cgroup */
2313 if (!ioc->enabled || !iocg->level)
Tejun Heo7caa4712019-08-28 15:05:58 -07002314 return;
2315
2316 abs_cost = calc_vtime_cost(bio, iocg, true);
2317 if (!abs_cost)
2318 return;
2319
Tejun Heoe1518f62019-09-04 12:45:53 -07002320 ioc_now(ioc, &now);
2321 current_hweight(iocg, NULL, &hw_inuse);
2322 cost = abs_cost_to_cost(abs_cost, hw_inuse);
2323
Tejun Heo7caa4712019-08-28 15:05:58 -07002324 /* update cursor if backmerging into the request at the cursor */
2325 if (blk_rq_pos(rq) < bio_end &&
2326 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2327 iocg->cursor = bio_end;
2328
Tejun Heoe1518f62019-09-04 12:45:53 -07002329 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04002330 * Charge if there's enough vtime budget and the existing request has
2331 * cost assigned.
Tejun Heoe1518f62019-09-04 12:45:53 -07002332 */
2333 if (rq->bio && rq->bio->bi_iocost_cost &&
Tejun Heo0b80f982020-05-04 19:27:54 -04002334 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
Tejun Heo97eb1972020-09-01 14:52:43 -04002335 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002336 return;
2337 }
2338
2339 /*
2340 * Otherwise, account it as debt if @iocg is online, which it should
2341 * be for the vast majority of cases. See debt handling in
2342 * ioc_rqos_throttle() for details.
2343 */
2344 spin_lock_irqsave(&iocg->waitq.lock, flags);
2345 if (likely(!list_empty(&iocg->active_list))) {
2346 iocg->abs_vdebt += abs_cost;
Jens Axboe873f1c82020-05-09 16:13:58 -06002347 iocg_kick_delay(iocg, &now);
Tejun Heo0b80f982020-05-04 19:27:54 -04002348 } else {
Tejun Heo97eb1972020-09-01 14:52:43 -04002349 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002350 }
2351 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002352}
2353
2354static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2355{
2356 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2357
2358 if (iocg && bio->bi_iocost_cost)
2359 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2360}
2361
2362static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2363{
2364 struct ioc *ioc = rqos_to_ioc(rqos);
Tejun Heo5e124f72020-09-01 14:52:33 -04002365 struct ioc_pcpu_stat *ccs;
Tejun Heocd006502020-04-13 12:27:56 -04002366 u64 on_q_ns, rq_wait_ns, size_nsec;
Tejun Heo7caa4712019-08-28 15:05:58 -07002367 int pidx, rw;
2368
2369 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2370 return;
2371
2372 switch (req_op(rq) & REQ_OP_MASK) {
2373 case REQ_OP_READ:
2374 pidx = QOS_RLAT;
2375 rw = READ;
2376 break;
2377 case REQ_OP_WRITE:
2378 pidx = QOS_WLAT;
2379 rw = WRITE;
2380 break;
2381 default:
2382 return;
2383 }
2384
2385 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2386 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
Tejun Heocd006502020-04-13 12:27:56 -04002387 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
Tejun Heo7caa4712019-08-28 15:05:58 -07002388
Tejun Heo5e124f72020-09-01 14:52:33 -04002389 ccs = get_cpu_ptr(ioc->pcpu_stat);
2390
Tejun Heocd006502020-04-13 12:27:56 -04002391 if (on_q_ns <= size_nsec ||
2392 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
Tejun Heo5e124f72020-09-01 14:52:33 -04002393 local_inc(&ccs->missed[rw].nr_met);
Tejun Heo7caa4712019-08-28 15:05:58 -07002394 else
Tejun Heo5e124f72020-09-01 14:52:33 -04002395 local_inc(&ccs->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07002396
Tejun Heo5e124f72020-09-01 14:52:33 -04002397 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2398
2399 put_cpu_ptr(ccs);
Tejun Heo7caa4712019-08-28 15:05:58 -07002400}
2401
2402static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2403{
2404 struct ioc *ioc = rqos_to_ioc(rqos);
2405
2406 spin_lock_irq(&ioc->lock);
2407 ioc_refresh_params(ioc, false);
2408 spin_unlock_irq(&ioc->lock);
2409}
2410
2411static void ioc_rqos_exit(struct rq_qos *rqos)
2412{
2413 struct ioc *ioc = rqos_to_ioc(rqos);
2414
2415 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2416
2417 spin_lock_irq(&ioc->lock);
2418 ioc->running = IOC_STOP;
2419 spin_unlock_irq(&ioc->lock);
2420
2421 del_timer_sync(&ioc->timer);
2422 free_percpu(ioc->pcpu_stat);
2423 kfree(ioc);
2424}
2425
2426static struct rq_qos_ops ioc_rqos_ops = {
2427 .throttle = ioc_rqos_throttle,
2428 .merge = ioc_rqos_merge,
2429 .done_bio = ioc_rqos_done_bio,
2430 .done = ioc_rqos_done,
2431 .queue_depth_changed = ioc_rqos_queue_depth_changed,
2432 .exit = ioc_rqos_exit,
2433};
2434
2435static int blk_iocost_init(struct request_queue *q)
2436{
2437 struct ioc *ioc;
2438 struct rq_qos *rqos;
Tejun Heo5e124f72020-09-01 14:52:33 -04002439 int i, cpu, ret;
Tejun Heo7caa4712019-08-28 15:05:58 -07002440
2441 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2442 if (!ioc)
2443 return -ENOMEM;
2444
2445 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2446 if (!ioc->pcpu_stat) {
2447 kfree(ioc);
2448 return -ENOMEM;
2449 }
2450
Tejun Heo5e124f72020-09-01 14:52:33 -04002451 for_each_possible_cpu(cpu) {
2452 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2453
2454 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2455 local_set(&ccs->missed[i].nr_met, 0);
2456 local_set(&ccs->missed[i].nr_missed, 0);
2457 }
2458 local64_set(&ccs->rq_wait_ns, 0);
2459 }
2460
Tejun Heo7caa4712019-08-28 15:05:58 -07002461 rqos = &ioc->rqos;
2462 rqos->id = RQ_QOS_COST;
2463 rqos->ops = &ioc_rqos_ops;
2464 rqos->q = q;
2465
2466 spin_lock_init(&ioc->lock);
2467 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2468 INIT_LIST_HEAD(&ioc->active_iocgs);
2469
2470 ioc->running = IOC_IDLE;
2471 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +02002472 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002473 ioc->period_at = ktime_to_us(ktime_get());
2474 atomic64_set(&ioc->cur_period, 0);
2475 atomic_set(&ioc->hweight_gen, 0);
2476
2477 spin_lock_irq(&ioc->lock);
2478 ioc->autop_idx = AUTOP_INVALID;
2479 ioc_refresh_params(ioc, true);
2480 spin_unlock_irq(&ioc->lock);
2481
2482 rq_qos_add(q, rqos);
2483 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2484 if (ret) {
2485 rq_qos_del(q, rqos);
Tejun Heo3532e722019-08-29 08:53:06 -07002486 free_percpu(ioc->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002487 kfree(ioc);
2488 return ret;
2489 }
2490 return 0;
2491}
2492
2493static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2494{
2495 struct ioc_cgrp *iocc;
2496
2497 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heoe916ad22019-08-30 06:10:58 -07002498 if (!iocc)
2499 return NULL;
Tejun Heo7caa4712019-08-28 15:05:58 -07002500
Tejun Heobd0adb92020-09-01 14:52:39 -04002501 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002502 return &iocc->cpd;
2503}
2504
2505static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2506{
2507 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2508}
2509
2510static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2511 struct blkcg *blkcg)
2512{
2513 int levels = blkcg->css.cgroup->level + 1;
2514 struct ioc_gq *iocg;
2515
Gustavo A. R. Silvaf61d6e22020-06-19 18:08:30 -05002516 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
Tejun Heo7caa4712019-08-28 15:05:58 -07002517 if (!iocg)
2518 return NULL;
2519
Tejun Heo97eb1972020-09-01 14:52:43 -04002520 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2521 if (!iocg->pcpu_stat) {
2522 kfree(iocg);
2523 return NULL;
2524 }
2525
Tejun Heo7caa4712019-08-28 15:05:58 -07002526 return &iocg->pd;
2527}
2528
2529static void ioc_pd_init(struct blkg_policy_data *pd)
2530{
2531 struct ioc_gq *iocg = pd_to_iocg(pd);
2532 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2533 struct ioc *ioc = q_to_ioc(blkg->q);
2534 struct ioc_now now;
2535 struct blkcg_gq *tblkg;
2536 unsigned long flags;
2537
2538 ioc_now(ioc, &now);
2539
2540 iocg->ioc = ioc;
2541 atomic64_set(&iocg->vtime, now.vnow);
2542 atomic64_set(&iocg->done_vtime, now.vnow);
2543 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2544 INIT_LIST_HEAD(&iocg->active_list);
Tejun Heo97eb1972020-09-01 14:52:43 -04002545 INIT_LIST_HEAD(&iocg->walk_list);
Tejun Heo8692d2d2020-09-01 14:52:45 -04002546 INIT_LIST_HEAD(&iocg->surplus_list);
Tejun Heofe20cdb52020-09-01 14:52:38 -04002547 iocg->hweight_active = WEIGHT_ONE;
2548 iocg->hweight_inuse = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002549
2550 init_waitqueue_head(&iocg->waitq);
2551 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2552 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2553 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2554 iocg->delay_timer.function = iocg_delay_timer_fn;
2555
2556 iocg->level = blkg->blkcg->css.cgroup->level;
2557
2558 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2559 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2560 iocg->ancestors[tiocg->level] = tiocg;
2561 }
2562
2563 spin_lock_irqsave(&ioc->lock, flags);
2564 weight_updated(iocg);
2565 spin_unlock_irqrestore(&ioc->lock, flags);
2566}
2567
2568static void ioc_pd_free(struct blkg_policy_data *pd)
2569{
2570 struct ioc_gq *iocg = pd_to_iocg(pd);
2571 struct ioc *ioc = iocg->ioc;
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002572 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002573
2574 if (ioc) {
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002575 spin_lock_irqsave(&ioc->lock, flags);
Tejun Heo97eb1972020-09-01 14:52:43 -04002576
Tejun Heo7caa4712019-08-28 15:05:58 -07002577 if (!list_empty(&iocg->active_list)) {
Tejun Heo00410f12020-09-01 14:52:34 -04002578 propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07002579 list_del_init(&iocg->active_list);
2580 }
Tejun Heo97eb1972020-09-01 14:52:43 -04002581
2582 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
Tejun Heo8692d2d2020-09-01 14:52:45 -04002583 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
Tejun Heo97eb1972020-09-01 14:52:43 -04002584
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002585 spin_unlock_irqrestore(&ioc->lock, flags);
Tejun Heoe036c4c2019-09-10 09:15:25 -07002586
2587 hrtimer_cancel(&iocg->waitq_timer);
2588 hrtimer_cancel(&iocg->delay_timer);
Tejun Heo7caa4712019-08-28 15:05:58 -07002589 }
Tejun Heo97eb1972020-09-01 14:52:43 -04002590 free_percpu(iocg->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002591 kfree(iocg);
2592}
2593
Tejun Heo97eb1972020-09-01 14:52:43 -04002594static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2595{
2596 struct ioc_gq *iocg = pd_to_iocg(pd);
2597 struct ioc *ioc = iocg->ioc;
2598 size_t pos = 0;
2599
2600 if (!ioc->enabled)
2601 return 0;
2602
2603 if (iocg->level == 0) {
2604 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2605 atomic64_read(&ioc->vtime_rate) * 10000,
2606 VTIME_PER_USEC);
2607 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2608 vp10k / 100, vp10k % 100);
2609 }
2610
2611 pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2612 iocg->last_stat.usage_us);
2613
2614 return pos;
2615}
2616
Tejun Heo7caa4712019-08-28 15:05:58 -07002617static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2618 int off)
2619{
2620 const char *dname = blkg_dev_name(pd->blkg);
2621 struct ioc_gq *iocg = pd_to_iocg(pd);
2622
2623 if (dname && iocg->cfg_weight)
Tejun Heobd0adb92020-09-01 14:52:39 -04002624 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002625 return 0;
2626}
2627
2628
2629static int ioc_weight_show(struct seq_file *sf, void *v)
2630{
2631 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2632 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2633
Tejun Heobd0adb92020-09-01 14:52:39 -04002634 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002635 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2636 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2637 return 0;
2638}
2639
2640static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2641 size_t nbytes, loff_t off)
2642{
2643 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2644 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2645 struct blkg_conf_ctx ctx;
2646 struct ioc_gq *iocg;
2647 u32 v;
2648 int ret;
2649
2650 if (!strchr(buf, ':')) {
2651 struct blkcg_gq *blkg;
2652
2653 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2654 return -EINVAL;
2655
2656 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2657 return -EINVAL;
2658
2659 spin_lock(&blkcg->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002660 iocc->dfl_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002661 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2662 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2663
2664 if (iocg) {
2665 spin_lock_irq(&iocg->ioc->lock);
2666 weight_updated(iocg);
2667 spin_unlock_irq(&iocg->ioc->lock);
2668 }
2669 }
2670 spin_unlock(&blkcg->lock);
2671
2672 return nbytes;
2673 }
2674
2675 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2676 if (ret)
2677 return ret;
2678
2679 iocg = blkg_to_iocg(ctx.blkg);
2680
2681 if (!strncmp(ctx.body, "default", 7)) {
2682 v = 0;
2683 } else {
2684 if (!sscanf(ctx.body, "%u", &v))
2685 goto einval;
2686 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2687 goto einval;
2688 }
2689
Dan Carpenter41591a52019-10-31 13:53:41 +03002690 spin_lock(&iocg->ioc->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002691 iocg->cfg_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002692 weight_updated(iocg);
Dan Carpenter41591a52019-10-31 13:53:41 +03002693 spin_unlock(&iocg->ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002694
2695 blkg_conf_finish(&ctx);
2696 return nbytes;
2697
2698einval:
2699 blkg_conf_finish(&ctx);
2700 return -EINVAL;
2701}
2702
2703static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2704 int off)
2705{
2706 const char *dname = blkg_dev_name(pd->blkg);
2707 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2708
2709 if (!dname)
2710 return 0;
2711
2712 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2713 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2714 ioc->params.qos[QOS_RPPM] / 10000,
2715 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2716 ioc->params.qos[QOS_RLAT],
2717 ioc->params.qos[QOS_WPPM] / 10000,
2718 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2719 ioc->params.qos[QOS_WLAT],
2720 ioc->params.qos[QOS_MIN] / 10000,
2721 ioc->params.qos[QOS_MIN] % 10000 / 100,
2722 ioc->params.qos[QOS_MAX] / 10000,
2723 ioc->params.qos[QOS_MAX] % 10000 / 100);
2724 return 0;
2725}
2726
2727static int ioc_qos_show(struct seq_file *sf, void *v)
2728{
2729 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2730
2731 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2732 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2733 return 0;
2734}
2735
2736static const match_table_t qos_ctrl_tokens = {
2737 { QOS_ENABLE, "enable=%u" },
2738 { QOS_CTRL, "ctrl=%s" },
2739 { NR_QOS_CTRL_PARAMS, NULL },
2740};
2741
2742static const match_table_t qos_tokens = {
2743 { QOS_RPPM, "rpct=%s" },
2744 { QOS_RLAT, "rlat=%u" },
2745 { QOS_WPPM, "wpct=%s" },
2746 { QOS_WLAT, "wlat=%u" },
2747 { QOS_MIN, "min=%s" },
2748 { QOS_MAX, "max=%s" },
2749 { NR_QOS_PARAMS, NULL },
2750};
2751
2752static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2753 size_t nbytes, loff_t off)
2754{
2755 struct gendisk *disk;
2756 struct ioc *ioc;
2757 u32 qos[NR_QOS_PARAMS];
2758 bool enable, user;
2759 char *p;
2760 int ret;
2761
2762 disk = blkcg_conf_get_disk(&input);
2763 if (IS_ERR(disk))
2764 return PTR_ERR(disk);
2765
2766 ioc = q_to_ioc(disk->queue);
2767 if (!ioc) {
2768 ret = blk_iocost_init(disk->queue);
2769 if (ret)
2770 goto err;
2771 ioc = q_to_ioc(disk->queue);
2772 }
2773
2774 spin_lock_irq(&ioc->lock);
2775 memcpy(qos, ioc->params.qos, sizeof(qos));
2776 enable = ioc->enabled;
2777 user = ioc->user_qos_params;
2778 spin_unlock_irq(&ioc->lock);
2779
2780 while ((p = strsep(&input, " \t\n"))) {
2781 substring_t args[MAX_OPT_ARGS];
2782 char buf[32];
2783 int tok;
2784 s64 v;
2785
2786 if (!*p)
2787 continue;
2788
2789 switch (match_token(p, qos_ctrl_tokens, args)) {
2790 case QOS_ENABLE:
2791 match_u64(&args[0], &v);
2792 enable = v;
2793 continue;
2794 case QOS_CTRL:
2795 match_strlcpy(buf, &args[0], sizeof(buf));
2796 if (!strcmp(buf, "auto"))
2797 user = false;
2798 else if (!strcmp(buf, "user"))
2799 user = true;
2800 else
2801 goto einval;
2802 continue;
2803 }
2804
2805 tok = match_token(p, qos_tokens, args);
2806 switch (tok) {
2807 case QOS_RPPM:
2808 case QOS_WPPM:
2809 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2810 sizeof(buf))
2811 goto einval;
2812 if (cgroup_parse_float(buf, 2, &v))
2813 goto einval;
2814 if (v < 0 || v > 10000)
2815 goto einval;
2816 qos[tok] = v * 100;
2817 break;
2818 case QOS_RLAT:
2819 case QOS_WLAT:
2820 if (match_u64(&args[0], &v))
2821 goto einval;
2822 qos[tok] = v;
2823 break;
2824 case QOS_MIN:
2825 case QOS_MAX:
2826 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2827 sizeof(buf))
2828 goto einval;
2829 if (cgroup_parse_float(buf, 2, &v))
2830 goto einval;
2831 if (v < 0)
2832 goto einval;
2833 qos[tok] = clamp_t(s64, v * 100,
2834 VRATE_MIN_PPM, VRATE_MAX_PPM);
2835 break;
2836 default:
2837 goto einval;
2838 }
2839 user = true;
2840 }
2841
2842 if (qos[QOS_MIN] > qos[QOS_MAX])
2843 goto einval;
2844
2845 spin_lock_irq(&ioc->lock);
2846
2847 if (enable) {
Tejun Heocd006502020-04-13 12:27:56 -04002848 blk_stat_enable_accounting(ioc->rqos.q);
Tejun Heo7caa4712019-08-28 15:05:58 -07002849 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2850 ioc->enabled = true;
2851 } else {
2852 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2853 ioc->enabled = false;
2854 }
2855
2856 if (user) {
2857 memcpy(ioc->params.qos, qos, sizeof(qos));
2858 ioc->user_qos_params = true;
2859 } else {
2860 ioc->user_qos_params = false;
2861 }
2862
2863 ioc_refresh_params(ioc, true);
2864 spin_unlock_irq(&ioc->lock);
2865
2866 put_disk_and_module(disk);
2867 return nbytes;
2868einval:
2869 ret = -EINVAL;
2870err:
2871 put_disk_and_module(disk);
2872 return ret;
2873}
2874
2875static u64 ioc_cost_model_prfill(struct seq_file *sf,
2876 struct blkg_policy_data *pd, int off)
2877{
2878 const char *dname = blkg_dev_name(pd->blkg);
2879 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2880 u64 *u = ioc->params.i_lcoefs;
2881
2882 if (!dname)
2883 return 0;
2884
2885 seq_printf(sf, "%s ctrl=%s model=linear "
2886 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2887 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2888 dname, ioc->user_cost_model ? "user" : "auto",
2889 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2890 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2891 return 0;
2892}
2893
2894static int ioc_cost_model_show(struct seq_file *sf, void *v)
2895{
2896 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2897
2898 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2899 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2900 return 0;
2901}
2902
2903static const match_table_t cost_ctrl_tokens = {
2904 { COST_CTRL, "ctrl=%s" },
2905 { COST_MODEL, "model=%s" },
2906 { NR_COST_CTRL_PARAMS, NULL },
2907};
2908
2909static const match_table_t i_lcoef_tokens = {
2910 { I_LCOEF_RBPS, "rbps=%u" },
2911 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2912 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2913 { I_LCOEF_WBPS, "wbps=%u" },
2914 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2915 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2916 { NR_I_LCOEFS, NULL },
2917};
2918
2919static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2920 size_t nbytes, loff_t off)
2921{
2922 struct gendisk *disk;
2923 struct ioc *ioc;
2924 u64 u[NR_I_LCOEFS];
2925 bool user;
2926 char *p;
2927 int ret;
2928
2929 disk = blkcg_conf_get_disk(&input);
2930 if (IS_ERR(disk))
2931 return PTR_ERR(disk);
2932
2933 ioc = q_to_ioc(disk->queue);
2934 if (!ioc) {
2935 ret = blk_iocost_init(disk->queue);
2936 if (ret)
2937 goto err;
2938 ioc = q_to_ioc(disk->queue);
2939 }
2940
2941 spin_lock_irq(&ioc->lock);
2942 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2943 user = ioc->user_cost_model;
2944 spin_unlock_irq(&ioc->lock);
2945
2946 while ((p = strsep(&input, " \t\n"))) {
2947 substring_t args[MAX_OPT_ARGS];
2948 char buf[32];
2949 int tok;
2950 u64 v;
2951
2952 if (!*p)
2953 continue;
2954
2955 switch (match_token(p, cost_ctrl_tokens, args)) {
2956 case COST_CTRL:
2957 match_strlcpy(buf, &args[0], sizeof(buf));
2958 if (!strcmp(buf, "auto"))
2959 user = false;
2960 else if (!strcmp(buf, "user"))
2961 user = true;
2962 else
2963 goto einval;
2964 continue;
2965 case COST_MODEL:
2966 match_strlcpy(buf, &args[0], sizeof(buf));
2967 if (strcmp(buf, "linear"))
2968 goto einval;
2969 continue;
2970 }
2971
2972 tok = match_token(p, i_lcoef_tokens, args);
2973 if (tok == NR_I_LCOEFS)
2974 goto einval;
2975 if (match_u64(&args[0], &v))
2976 goto einval;
2977 u[tok] = v;
2978 user = true;
2979 }
2980
2981 spin_lock_irq(&ioc->lock);
2982 if (user) {
2983 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2984 ioc->user_cost_model = true;
2985 } else {
2986 ioc->user_cost_model = false;
2987 }
2988 ioc_refresh_params(ioc, true);
2989 spin_unlock_irq(&ioc->lock);
2990
2991 put_disk_and_module(disk);
2992 return nbytes;
2993
2994einval:
2995 ret = -EINVAL;
2996err:
2997 put_disk_and_module(disk);
2998 return ret;
2999}
3000
3001static struct cftype ioc_files[] = {
3002 {
3003 .name = "weight",
3004 .flags = CFTYPE_NOT_ON_ROOT,
3005 .seq_show = ioc_weight_show,
3006 .write = ioc_weight_write,
3007 },
3008 {
3009 .name = "cost.qos",
3010 .flags = CFTYPE_ONLY_ON_ROOT,
3011 .seq_show = ioc_qos_show,
3012 .write = ioc_qos_write,
3013 },
3014 {
3015 .name = "cost.model",
3016 .flags = CFTYPE_ONLY_ON_ROOT,
3017 .seq_show = ioc_cost_model_show,
3018 .write = ioc_cost_model_write,
3019 },
3020 {}
3021};
3022
3023static struct blkcg_policy blkcg_policy_iocost = {
3024 .dfl_cftypes = ioc_files,
3025 .cpd_alloc_fn = ioc_cpd_alloc,
3026 .cpd_free_fn = ioc_cpd_free,
3027 .pd_alloc_fn = ioc_pd_alloc,
3028 .pd_init_fn = ioc_pd_init,
3029 .pd_free_fn = ioc_pd_free,
Tejun Heo97eb1972020-09-01 14:52:43 -04003030 .pd_stat_fn = ioc_pd_stat,
Tejun Heo7caa4712019-08-28 15:05:58 -07003031};
3032
3033static int __init ioc_init(void)
3034{
3035 return blkcg_policy_register(&blkcg_policy_iocost);
3036}
3037
3038static void __exit ioc_exit(void)
3039{
3040 return blkcg_policy_unregister(&blkcg_policy_iocost);
3041}
3042
3043module_init(ioc_init);
3044module_exit(ioc_exit);