blob: 694f1487208aca0f23ffb2cc95c51b3546032cff [file] [log] [blame]
Tejun Heo7caa4712019-08-28 15:05:58 -07001/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
18 * time, with a lot of clutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
42 * paramters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
68 * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
Tejun Heofe20cdb52020-09-01 14:52:38 -040071 * upto 1 (WEIGHT_ONE).
Tejun Heo7caa4712019-08-28 15:05:58 -070072 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
102 * Device business is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
114 * are executed, soley depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
122 * the higher the bandwidth lossage. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
148 * implemented by adjusting vrate dynamically. However, squaring who can
149 * donate and should take back how much requires hweight propagations
150 * anyway making it easier to implement and understand as a separate
151 * mechanism.
Tejun Heo6954ff12019-08-28 15:05:59 -0700152 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
156 * controller uses a drgn based monitoring script -
157 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
158 * https://github.com/osandov/drgn. The ouput looks like the following.
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
Tejun Heo7c1ee702019-09-04 12:45:56 -0700161 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
Tejun Heo6954ff12019-08-28 15:05:59 -0700164 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
Tejun Heo7caa4712019-08-28 15:05:58 -0700173 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
Tejun Heo5e124f72020-09-01 14:52:33 -0400182#include <asm/local.h>
183#include <asm/local64.h>
Tejun Heo7caa4712019-08-28 15:05:58 -0700184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
208#else /* CONFIG_TRACE_POINTS */
209#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
210#endif /* CONFIG_TRACE_POINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
Tejun Heof1de2432020-09-01 14:52:49 -0400220 * iocg->vtime is targeted at 50% behind the device vtime, which
Tejun Heo7caa4712019-08-28 15:05:58 -0700221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400224 MARGIN_MIN_PCT = 10,
Tejun Heof1de2432020-09-01 14:52:49 -0400225 MARGIN_LOW_PCT = 20,
226 MARGIN_TARGET_PCT = 50,
227 MARGIN_MAX_PCT = 100,
Tejun Heo7caa4712019-08-28 15:05:58 -0700228
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400229 /* Have some play in timer operations */
230 TIMER_SLACK_PCT = 1,
Tejun Heo7caa4712019-08-28 15:05:58 -0700231
232 /*
233 * vtime can wrap well within a reasonable uptime when vrate is
234 * consistently raised. Don't trust recorded cgroup vtime if the
235 * period counter indicates that it's older than 5mins.
236 */
237 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
238
Tejun Heo7caa4712019-08-28 15:05:58 -0700239 /* 1/64k is granular enough and can easily be handled w/ u32 */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400240 WEIGHT_ONE = 1 << 16,
Tejun Heo7caa4712019-08-28 15:05:58 -0700241
242 /*
243 * As vtime is used to calculate the cost of each IO, it needs to
244 * be fairly high precision. For example, it should be able to
245 * represent the cost of a single page worth of discard with
246 * suffificient accuracy. At the same time, it should be able to
247 * represent reasonably long enough durations to be useful and
248 * convenient during operation.
249 *
250 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
251 * granularity and days of wrap-around time even at extreme vrates.
252 */
253 VTIME_PER_SEC_SHIFT = 37,
254 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
255 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
Tejun Heocd006502020-04-13 12:27:56 -0400256 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
Tejun Heo7caa4712019-08-28 15:05:58 -0700257
258 /* bound vrate adjustments within two orders of magnitude */
259 VRATE_MIN_PPM = 10000, /* 1% */
260 VRATE_MAX_PPM = 100000000, /* 10000% */
261
262 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
263 VRATE_CLAMP_ADJ_PCT = 4,
264
265 /* if IOs end up waiting for requests, issue less */
266 RQ_WAIT_BUSY_PCT = 5,
267
268 /* unbusy hysterisis */
269 UNBUSY_THR_PCT = 75,
270
271 /* don't let cmds which take a very long time pin lagging for too long */
272 MAX_LAGGING_PERIODS = 10,
273
Tejun Heo7caa4712019-08-28 15:05:58 -0700274 /* switch iff the conditions are met for longer than this */
275 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
276
277 /*
278 * Count IO size in 4k pages. The 12bit shift helps keeping
279 * size-proportional components of cost calculation in closer
280 * numbers of digits to per-IO cost components.
281 */
282 IOC_PAGE_SHIFT = 12,
283 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
284 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
285
286 /* if apart further than 16M, consider randio for linear model */
287 LCOEF_RANDIO_PAGES = 4096,
288};
289
290enum ioc_running {
291 IOC_IDLE,
292 IOC_RUNNING,
293 IOC_STOP,
294};
295
296/* io.cost.qos controls including per-dev enable of the whole controller */
297enum {
298 QOS_ENABLE,
299 QOS_CTRL,
300 NR_QOS_CTRL_PARAMS,
301};
302
303/* io.cost.qos params */
304enum {
305 QOS_RPPM,
306 QOS_RLAT,
307 QOS_WPPM,
308 QOS_WLAT,
309 QOS_MIN,
310 QOS_MAX,
311 NR_QOS_PARAMS,
312};
313
314/* io.cost.model controls */
315enum {
316 COST_CTRL,
317 COST_MODEL,
318 NR_COST_CTRL_PARAMS,
319};
320
321/* builtin linear cost model coefficients */
322enum {
323 I_LCOEF_RBPS,
324 I_LCOEF_RSEQIOPS,
325 I_LCOEF_RRANDIOPS,
326 I_LCOEF_WBPS,
327 I_LCOEF_WSEQIOPS,
328 I_LCOEF_WRANDIOPS,
329 NR_I_LCOEFS,
330};
331
332enum {
333 LCOEF_RPAGE,
334 LCOEF_RSEQIO,
335 LCOEF_RRANDIO,
336 LCOEF_WPAGE,
337 LCOEF_WSEQIO,
338 LCOEF_WRANDIO,
339 NR_LCOEFS,
340};
341
342enum {
343 AUTOP_INVALID,
344 AUTOP_HDD,
345 AUTOP_SSD_QD1,
346 AUTOP_SSD_DFL,
347 AUTOP_SSD_FAST,
348};
349
350struct ioc_gq;
351
352struct ioc_params {
353 u32 qos[NR_QOS_PARAMS];
354 u64 i_lcoefs[NR_I_LCOEFS];
355 u64 lcoefs[NR_LCOEFS];
356 u32 too_fast_vrate_pct;
357 u32 too_slow_vrate_pct;
358};
359
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400360struct ioc_margins {
361 s64 min;
Tejun Heof1de2432020-09-01 14:52:49 -0400362 s64 low;
363 s64 target;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400364 s64 max;
365};
366
Tejun Heo7caa4712019-08-28 15:05:58 -0700367struct ioc_missed {
Tejun Heo5e124f72020-09-01 14:52:33 -0400368 local_t nr_met;
369 local_t nr_missed;
Tejun Heo7caa4712019-08-28 15:05:58 -0700370 u32 last_met;
371 u32 last_missed;
372};
373
374struct ioc_pcpu_stat {
375 struct ioc_missed missed[2];
376
Tejun Heo5e124f72020-09-01 14:52:33 -0400377 local64_t rq_wait_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700378 u64 last_rq_wait_ns;
379};
380
381/* per device */
382struct ioc {
383 struct rq_qos rqos;
384
385 bool enabled;
386
387 struct ioc_params params;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400388 struct ioc_margins margins;
Tejun Heo7caa4712019-08-28 15:05:58 -0700389 u32 period_us;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400390 u32 timer_slack_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700391 u64 vrate_min;
392 u64 vrate_max;
393
394 spinlock_t lock;
395 struct timer_list timer;
396 struct list_head active_iocgs; /* active cgroups */
397 struct ioc_pcpu_stat __percpu *pcpu_stat;
398
399 enum ioc_running running;
400 atomic64_t vtime_rate;
401
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +0200402 seqcount_spinlock_t period_seqcount;
Tejun Heoce955702020-09-01 14:52:40 -0400403 u64 period_at; /* wallclock starttime */
Tejun Heo7caa4712019-08-28 15:05:58 -0700404 u64 period_at_vtime; /* vtime starttime */
405
406 atomic64_t cur_period; /* inc'd each period */
407 int busy_level; /* saturation history */
408
Tejun Heo7caa4712019-08-28 15:05:58 -0700409 bool weights_updated;
410 atomic_t hweight_gen; /* for lazy hweights */
411
412 u64 autop_too_fast_at;
413 u64 autop_too_slow_at;
414 int autop_idx;
415 bool user_qos_params:1;
416 bool user_cost_model:1;
417};
418
Tejun Heo97eb1972020-09-01 14:52:43 -0400419struct iocg_pcpu_stat {
420 local64_t abs_vusage;
421};
422
423struct iocg_stat {
424 u64 usage_us;
425};
426
Tejun Heo7caa4712019-08-28 15:05:58 -0700427/* per device-cgroup pair */
428struct ioc_gq {
429 struct blkg_policy_data pd;
430 struct ioc *ioc;
431
432 /*
433 * A iocg can get its weight from two sources - an explicit
434 * per-device-cgroup configuration or the default weight of the
435 * cgroup. `cfg_weight` is the explicit per-device-cgroup
436 * configuration. `weight` is the effective considering both
437 * sources.
438 *
439 * When an idle cgroup becomes active its `active` goes from 0 to
440 * `weight`. `inuse` is the surplus adjusted active weight.
441 * `active` and `inuse` are used to calculate `hweight_active` and
442 * `hweight_inuse`.
443 *
444 * `last_inuse` remembers `inuse` while an iocg is idle to persist
445 * surplus adjustments.
446 */
447 u32 cfg_weight;
448 u32 weight;
449 u32 active;
450 u32 inuse;
451 u32 last_inuse;
452
453 sector_t cursor; /* to detect randio */
454
455 /*
456 * `vtime` is this iocg's vtime cursor which progresses as IOs are
457 * issued. If lagging behind device vtime, the delta represents
458 * the currently available IO budget. If runnning ahead, the
459 * overage.
460 *
461 * `vtime_done` is the same but progressed on completion rather
462 * than issue. The delta behind `vtime` represents the cost of
463 * currently in-flight IOs.
Tejun Heo7caa4712019-08-28 15:05:58 -0700464 */
465 atomic64_t vtime;
466 atomic64_t done_vtime;
Tejun Heo0b80f982020-05-04 19:27:54 -0400467 u64 abs_vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -0700468
469 /*
470 * The period this iocg was last active in. Used for deactivation
471 * and invalidating `vtime`.
472 */
473 atomic64_t active_period;
474 struct list_head active_list;
475
Tejun Heo00410f12020-09-01 14:52:34 -0400476 /* see __propagate_weights() and current_hweight() for details */
Tejun Heo7caa4712019-08-28 15:05:58 -0700477 u64 child_active_sum;
478 u64 child_inuse_sum;
Tejun Heoe08d02a2020-09-01 14:52:48 -0400479 u64 child_adjusted_sum;
Tejun Heo7caa4712019-08-28 15:05:58 -0700480 int hweight_gen;
481 u32 hweight_active;
482 u32 hweight_inuse;
Tejun Heoe08d02a2020-09-01 14:52:48 -0400483 u32 hweight_donating;
Tejun Heo93f7d2d2020-09-01 14:52:47 -0400484 u32 hweight_after_donation;
Tejun Heo7caa4712019-08-28 15:05:58 -0700485
Tejun Heo97eb1972020-09-01 14:52:43 -0400486 struct list_head walk_list;
Tejun Heo8692d2d2020-09-01 14:52:45 -0400487 struct list_head surplus_list;
Tejun Heo97eb1972020-09-01 14:52:43 -0400488
Tejun Heo7caa4712019-08-28 15:05:58 -0700489 struct wait_queue_head waitq;
490 struct hrtimer waitq_timer;
491 struct hrtimer delay_timer;
492
Tejun Heo1aa50d02020-09-01 14:52:44 -0400493 /* timestamp at the latest activation */
494 u64 activated_at;
495
Tejun Heo97eb1972020-09-01 14:52:43 -0400496 /* statistics */
497 struct iocg_pcpu_stat __percpu *pcpu_stat;
498 struct iocg_stat local_stat;
499 struct iocg_stat desc_stat;
500 struct iocg_stat last_stat;
501 u64 last_stat_abs_vusage;
Tejun Heof1de2432020-09-01 14:52:49 -0400502 u64 usage_delta_us;
Tejun Heo7caa4712019-08-28 15:05:58 -0700503
504 /* this iocg's depth in the hierarchy and ancestors including self */
505 int level;
506 struct ioc_gq *ancestors[];
507};
508
509/* per cgroup */
510struct ioc_cgrp {
511 struct blkcg_policy_data cpd;
512 unsigned int dfl_weight;
513};
514
515struct ioc_now {
516 u64 now_ns;
Tejun Heoce955702020-09-01 14:52:40 -0400517 u64 now;
Tejun Heo7caa4712019-08-28 15:05:58 -0700518 u64 vnow;
519 u64 vrate;
520};
521
522struct iocg_wait {
523 struct wait_queue_entry wait;
524 struct bio *bio;
525 u64 abs_cost;
526 bool committed;
527};
528
529struct iocg_wake_ctx {
530 struct ioc_gq *iocg;
531 u32 hw_inuse;
532 s64 vbudget;
533};
534
535static const struct ioc_params autop[] = {
536 [AUTOP_HDD] = {
537 .qos = {
Tejun Heo7afccca2019-09-25 16:03:35 -0700538 [QOS_RLAT] = 250000, /* 250ms */
539 [QOS_WLAT] = 250000,
Tejun Heo7caa4712019-08-28 15:05:58 -0700540 [QOS_MIN] = VRATE_MIN_PPM,
541 [QOS_MAX] = VRATE_MAX_PPM,
542 },
543 .i_lcoefs = {
544 [I_LCOEF_RBPS] = 174019176,
545 [I_LCOEF_RSEQIOPS] = 41708,
546 [I_LCOEF_RRANDIOPS] = 370,
547 [I_LCOEF_WBPS] = 178075866,
548 [I_LCOEF_WSEQIOPS] = 42705,
549 [I_LCOEF_WRANDIOPS] = 378,
550 },
551 },
552 [AUTOP_SSD_QD1] = {
553 .qos = {
554 [QOS_RLAT] = 25000, /* 25ms */
555 [QOS_WLAT] = 25000,
556 [QOS_MIN] = VRATE_MIN_PPM,
557 [QOS_MAX] = VRATE_MAX_PPM,
558 },
559 .i_lcoefs = {
560 [I_LCOEF_RBPS] = 245855193,
561 [I_LCOEF_RSEQIOPS] = 61575,
562 [I_LCOEF_RRANDIOPS] = 6946,
563 [I_LCOEF_WBPS] = 141365009,
564 [I_LCOEF_WSEQIOPS] = 33716,
565 [I_LCOEF_WRANDIOPS] = 26796,
566 },
567 },
568 [AUTOP_SSD_DFL] = {
569 .qos = {
570 [QOS_RLAT] = 25000, /* 25ms */
571 [QOS_WLAT] = 25000,
572 [QOS_MIN] = VRATE_MIN_PPM,
573 [QOS_MAX] = VRATE_MAX_PPM,
574 },
575 .i_lcoefs = {
576 [I_LCOEF_RBPS] = 488636629,
577 [I_LCOEF_RSEQIOPS] = 8932,
578 [I_LCOEF_RRANDIOPS] = 8518,
579 [I_LCOEF_WBPS] = 427891549,
580 [I_LCOEF_WSEQIOPS] = 28755,
581 [I_LCOEF_WRANDIOPS] = 21940,
582 },
583 .too_fast_vrate_pct = 500,
584 },
585 [AUTOP_SSD_FAST] = {
586 .qos = {
587 [QOS_RLAT] = 5000, /* 5ms */
588 [QOS_WLAT] = 5000,
589 [QOS_MIN] = VRATE_MIN_PPM,
590 [QOS_MAX] = VRATE_MAX_PPM,
591 },
592 .i_lcoefs = {
593 [I_LCOEF_RBPS] = 3102524156LLU,
594 [I_LCOEF_RSEQIOPS] = 724816,
595 [I_LCOEF_RRANDIOPS] = 778122,
596 [I_LCOEF_WBPS] = 1742780862LLU,
597 [I_LCOEF_WSEQIOPS] = 425702,
598 [I_LCOEF_WRANDIOPS] = 443193,
599 },
600 .too_slow_vrate_pct = 10,
601 },
602};
603
604/*
605 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
606 * vtime credit shortage and down on device saturation.
607 */
608static u32 vrate_adj_pct[] =
609 { 0, 0, 0, 0,
610 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
611 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
612 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
613
614static struct blkcg_policy blkcg_policy_iocost;
615
616/* accessors and helpers */
617static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
618{
619 return container_of(rqos, struct ioc, rqos);
620}
621
622static struct ioc *q_to_ioc(struct request_queue *q)
623{
624 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
625}
626
627static const char *q_name(struct request_queue *q)
628{
629 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
630 return kobject_name(q->kobj.parent);
631 else
632 return "<unknown>";
633}
634
635static const char __maybe_unused *ioc_name(struct ioc *ioc)
636{
637 return q_name(ioc->rqos.q);
638}
639
640static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
641{
642 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
643}
644
645static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
646{
647 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
648}
649
650static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
651{
652 return pd_to_blkg(&iocg->pd);
653}
654
655static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
656{
657 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
658 struct ioc_cgrp, cpd);
659}
660
661/*
662 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo36a52482019-09-04 12:45:52 -0700663 * weight, the more expensive each IO. Must round up.
Tejun Heo7caa4712019-08-28 15:05:58 -0700664 */
665static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
666{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400667 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -0700668}
669
Tejun Heo36a52482019-09-04 12:45:52 -0700670/*
671 * The inverse of abs_cost_to_cost(). Must round up.
672 */
673static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
674{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400675 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
Tejun Heo36a52482019-09-04 12:45:52 -0700676}
677
Tejun Heo97eb1972020-09-01 14:52:43 -0400678static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
679 u64 abs_cost, u64 cost)
Tejun Heo7caa4712019-08-28 15:05:58 -0700680{
Tejun Heo97eb1972020-09-01 14:52:43 -0400681 struct iocg_pcpu_stat *gcs;
682
Tejun Heo7caa4712019-08-28 15:05:58 -0700683 bio->bi_iocost_cost = cost;
684 atomic64_add(cost, &iocg->vtime);
Tejun Heo97eb1972020-09-01 14:52:43 -0400685
686 gcs = get_cpu_ptr(iocg->pcpu_stat);
687 local64_add(abs_cost, &gcs->abs_vusage);
688 put_cpu_ptr(gcs);
Tejun Heo7caa4712019-08-28 15:05:58 -0700689}
690
Tejun Heoda437b92020-09-01 14:52:42 -0400691static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
692{
693 if (lock_ioc) {
694 spin_lock_irqsave(&iocg->ioc->lock, *flags);
695 spin_lock(&iocg->waitq.lock);
696 } else {
697 spin_lock_irqsave(&iocg->waitq.lock, *flags);
698 }
699}
700
701static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
702{
703 if (unlock_ioc) {
704 spin_unlock(&iocg->waitq.lock);
705 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
706 } else {
707 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
708 }
709}
710
Tejun Heo7caa4712019-08-28 15:05:58 -0700711#define CREATE_TRACE_POINTS
712#include <trace/events/iocost.h>
713
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400714static void ioc_refresh_margins(struct ioc *ioc)
715{
716 struct ioc_margins *margins = &ioc->margins;
717 u32 period_us = ioc->period_us;
718 u64 vrate = atomic64_read(&ioc->vtime_rate);
719
720 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
Tejun Heof1de2432020-09-01 14:52:49 -0400721 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
722 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400723 margins->max = (period_us * MARGIN_MAX_PCT / 100) * vrate;
724}
725
Tejun Heo7caa4712019-08-28 15:05:58 -0700726/* latency Qos params changed, update period_us and all the dependent params */
727static void ioc_refresh_period_us(struct ioc *ioc)
728{
729 u32 ppm, lat, multi, period_us;
730
731 lockdep_assert_held(&ioc->lock);
732
733 /* pick the higher latency target */
734 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
735 ppm = ioc->params.qos[QOS_RPPM];
736 lat = ioc->params.qos[QOS_RLAT];
737 } else {
738 ppm = ioc->params.qos[QOS_WPPM];
739 lat = ioc->params.qos[QOS_WLAT];
740 }
741
742 /*
743 * We want the period to be long enough to contain a healthy number
744 * of IOs while short enough for granular control. Define it as a
745 * multiple of the latency target. Ideally, the multiplier should
746 * be scaled according to the percentile so that it would nominally
747 * contain a certain number of requests. Let's be simpler and
748 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
749 */
750 if (ppm)
751 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
752 else
753 multi = 2;
754 period_us = multi * lat;
755 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
756
757 /* calculate dependent params */
758 ioc->period_us = period_us;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -0400759 ioc->timer_slack_ns = div64_u64(
760 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
761 100);
762 ioc_refresh_margins(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -0700763}
764
765static int ioc_autop_idx(struct ioc *ioc)
766{
767 int idx = ioc->autop_idx;
768 const struct ioc_params *p = &autop[idx];
769 u32 vrate_pct;
770 u64 now_ns;
771
772 /* rotational? */
773 if (!blk_queue_nonrot(ioc->rqos.q))
774 return AUTOP_HDD;
775
776 /* handle SATA SSDs w/ broken NCQ */
777 if (blk_queue_depth(ioc->rqos.q) == 1)
778 return AUTOP_SSD_QD1;
779
780 /* use one of the normal ssd sets */
781 if (idx < AUTOP_SSD_DFL)
782 return AUTOP_SSD_DFL;
783
784 /* if user is overriding anything, maintain what was there */
785 if (ioc->user_qos_params || ioc->user_cost_model)
786 return idx;
787
788 /* step up/down based on the vrate */
789 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
790 VTIME_PER_USEC);
791 now_ns = ktime_get_ns();
792
793 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
794 if (!ioc->autop_too_fast_at)
795 ioc->autop_too_fast_at = now_ns;
796 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
797 return idx + 1;
798 } else {
799 ioc->autop_too_fast_at = 0;
800 }
801
802 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
803 if (!ioc->autop_too_slow_at)
804 ioc->autop_too_slow_at = now_ns;
805 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
806 return idx - 1;
807 } else {
808 ioc->autop_too_slow_at = 0;
809 }
810
811 return idx;
812}
813
814/*
815 * Take the followings as input
816 *
817 * @bps maximum sequential throughput
818 * @seqiops maximum sequential 4k iops
819 * @randiops maximum random 4k iops
820 *
821 * and calculate the linear model cost coefficients.
822 *
823 * *@page per-page cost 1s / (@bps / 4096)
824 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
825 * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
826 */
827static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
828 u64 *page, u64 *seqio, u64 *randio)
829{
830 u64 v;
831
832 *page = *seqio = *randio = 0;
833
834 if (bps)
835 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
836 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
837
838 if (seqiops) {
839 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
840 if (v > *page)
841 *seqio = v - *page;
842 }
843
844 if (randiops) {
845 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
846 if (v > *page)
847 *randio = v - *page;
848 }
849}
850
851static void ioc_refresh_lcoefs(struct ioc *ioc)
852{
853 u64 *u = ioc->params.i_lcoefs;
854 u64 *c = ioc->params.lcoefs;
855
856 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
857 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
858 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
859 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
860}
861
862static bool ioc_refresh_params(struct ioc *ioc, bool force)
863{
864 const struct ioc_params *p;
865 int idx;
866
867 lockdep_assert_held(&ioc->lock);
868
869 idx = ioc_autop_idx(ioc);
870 p = &autop[idx];
871
872 if (idx == ioc->autop_idx && !force)
873 return false;
874
875 if (idx != ioc->autop_idx)
876 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
877
878 ioc->autop_idx = idx;
879 ioc->autop_too_fast_at = 0;
880 ioc->autop_too_slow_at = 0;
881
882 if (!ioc->user_qos_params)
883 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
884 if (!ioc->user_cost_model)
885 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
886
887 ioc_refresh_period_us(ioc);
888 ioc_refresh_lcoefs(ioc);
889
890 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
891 VTIME_PER_USEC, MILLION);
892 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
893 VTIME_PER_USEC, MILLION);
894
895 return true;
896}
897
898/* take a snapshot of the current [v]time and vrate */
899static void ioc_now(struct ioc *ioc, struct ioc_now *now)
900{
901 unsigned seq;
902
903 now->now_ns = ktime_get();
904 now->now = ktime_to_us(now->now_ns);
905 now->vrate = atomic64_read(&ioc->vtime_rate);
906
907 /*
908 * The current vtime is
909 *
910 * vtime at period start + (wallclock time since the start) * vrate
911 *
912 * As a consistent snapshot of `period_at_vtime` and `period_at` is
913 * needed, they're seqcount protected.
914 */
915 do {
916 seq = read_seqcount_begin(&ioc->period_seqcount);
917 now->vnow = ioc->period_at_vtime +
918 (now->now - ioc->period_at) * now->vrate;
919 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
920}
921
922static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
923{
Tejun Heo7caa4712019-08-28 15:05:58 -0700924 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
925
926 write_seqcount_begin(&ioc->period_seqcount);
927 ioc->period_at = now->now;
928 ioc->period_at_vtime = now->vnow;
929 write_seqcount_end(&ioc->period_seqcount);
930
931 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
932 add_timer(&ioc->timer);
933}
934
935/*
936 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
937 * weight sums and propagate upwards accordingly.
938 */
Tejun Heo00410f12020-09-01 14:52:34 -0400939static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700940{
941 struct ioc *ioc = iocg->ioc;
942 int lvl;
943
944 lockdep_assert_held(&ioc->lock);
945
Tejun Heodb84a722020-09-01 14:52:35 -0400946 inuse = clamp_t(u32, inuse, 1, active);
947
948 if (active == iocg->active && inuse == iocg->inuse)
949 return;
Tejun Heo7caa4712019-08-28 15:05:58 -0700950
951 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
952 struct ioc_gq *parent = iocg->ancestors[lvl];
953 struct ioc_gq *child = iocg->ancestors[lvl + 1];
954 u32 parent_active = 0, parent_inuse = 0;
955
956 /* update the level sums */
957 parent->child_active_sum += (s32)(active - child->active);
958 parent->child_inuse_sum += (s32)(inuse - child->inuse);
959 /* apply the udpates */
960 child->active = active;
961 child->inuse = inuse;
962
963 /*
964 * The delta between inuse and active sums indicates that
965 * that much of weight is being given away. Parent's inuse
966 * and active should reflect the ratio.
967 */
968 if (parent->child_active_sum) {
969 parent_active = parent->weight;
970 parent_inuse = DIV64_U64_ROUND_UP(
971 parent_active * parent->child_inuse_sum,
972 parent->child_active_sum);
973 }
974
975 /* do we need to keep walking up? */
976 if (parent_active == parent->active &&
977 parent_inuse == parent->inuse)
978 break;
979
980 active = parent_active;
981 inuse = parent_inuse;
982 }
983
984 ioc->weights_updated = true;
985}
986
Tejun Heo00410f12020-09-01 14:52:34 -0400987static void commit_weights(struct ioc *ioc)
Tejun Heo7caa4712019-08-28 15:05:58 -0700988{
989 lockdep_assert_held(&ioc->lock);
990
991 if (ioc->weights_updated) {
992 /* paired with rmb in current_hweight(), see there */
993 smp_wmb();
994 atomic_inc(&ioc->hweight_gen);
995 ioc->weights_updated = false;
996 }
997}
998
Tejun Heo00410f12020-09-01 14:52:34 -0400999static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -07001000{
Tejun Heo00410f12020-09-01 14:52:34 -04001001 __propagate_weights(iocg, active, inuse);
1002 commit_weights(iocg->ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001003}
1004
1005static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1006{
1007 struct ioc *ioc = iocg->ioc;
1008 int lvl;
1009 u32 hwa, hwi;
1010 int ioc_gen;
1011
1012 /* hot path - if uptodate, use cached */
1013 ioc_gen = atomic_read(&ioc->hweight_gen);
1014 if (ioc_gen == iocg->hweight_gen)
1015 goto out;
1016
1017 /*
Tejun Heo00410f12020-09-01 14:52:34 -04001018 * Paired with wmb in commit_weights(). If we saw the updated
1019 * hweight_gen, all the weight updates from __propagate_weights() are
1020 * visible too.
Tejun Heo7caa4712019-08-28 15:05:58 -07001021 *
1022 * We can race with weight updates during calculation and get it
1023 * wrong. However, hweight_gen would have changed and a future
1024 * reader will recalculate and we're guaranteed to discard the
1025 * wrong result soon.
1026 */
1027 smp_rmb();
1028
Tejun Heofe20cdb52020-09-01 14:52:38 -04001029 hwa = hwi = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07001030 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1031 struct ioc_gq *parent = iocg->ancestors[lvl];
1032 struct ioc_gq *child = iocg->ancestors[lvl + 1];
Tejun Heobd0adb92020-09-01 14:52:39 -04001033 u64 active_sum = READ_ONCE(parent->child_active_sum);
1034 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001035 u32 active = READ_ONCE(child->active);
1036 u32 inuse = READ_ONCE(child->inuse);
1037
1038 /* we can race with deactivations and either may read as zero */
1039 if (!active_sum || !inuse_sum)
1040 continue;
1041
Tejun Heobd0adb92020-09-01 14:52:39 -04001042 active_sum = max_t(u64, active, active_sum);
1043 hwa = div64_u64((u64)hwa * active, active_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001044
Tejun Heobd0adb92020-09-01 14:52:39 -04001045 inuse_sum = max_t(u64, inuse, inuse_sum);
1046 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001047 }
1048
1049 iocg->hweight_active = max_t(u32, hwa, 1);
1050 iocg->hweight_inuse = max_t(u32, hwi, 1);
1051 iocg->hweight_gen = ioc_gen;
1052out:
1053 if (hw_activep)
1054 *hw_activep = iocg->hweight_active;
1055 if (hw_inusep)
1056 *hw_inusep = iocg->hweight_inuse;
1057}
1058
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001059/*
1060 * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1061 * other weights stay unchanged.
1062 */
1063static u32 current_hweight_max(struct ioc_gq *iocg)
1064{
1065 u32 hwm = WEIGHT_ONE;
1066 u32 inuse = iocg->active;
1067 u64 child_inuse_sum;
1068 int lvl;
1069
1070 lockdep_assert_held(&iocg->ioc->lock);
1071
1072 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1073 struct ioc_gq *parent = iocg->ancestors[lvl];
1074 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1075
1076 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1077 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1078 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1079 parent->child_active_sum);
1080 }
1081
1082 return max_t(u32, hwm, 1);
1083}
1084
Tejun Heo7caa4712019-08-28 15:05:58 -07001085static void weight_updated(struct ioc_gq *iocg)
1086{
1087 struct ioc *ioc = iocg->ioc;
1088 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1089 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1090 u32 weight;
1091
1092 lockdep_assert_held(&ioc->lock);
1093
1094 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1095 if (weight != iocg->weight && iocg->active)
Tejun Heo00410f12020-09-01 14:52:34 -04001096 propagate_weights(iocg, weight,
Tejun Heobd0adb92020-09-01 14:52:39 -04001097 DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1098 iocg->weight));
Tejun Heo7caa4712019-08-28 15:05:58 -07001099 iocg->weight = weight;
1100}
1101
1102static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1103{
1104 struct ioc *ioc = iocg->ioc;
1105 u64 last_period, cur_period, max_period_delta;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001106 u64 vtime, vmin;
Tejun Heo7caa4712019-08-28 15:05:58 -07001107 int i;
1108
1109 /*
1110 * If seem to be already active, just update the stamp to tell the
1111 * timer that we're still active. We don't mind occassional races.
1112 */
1113 if (!list_empty(&iocg->active_list)) {
1114 ioc_now(ioc, now);
1115 cur_period = atomic64_read(&ioc->cur_period);
1116 if (atomic64_read(&iocg->active_period) != cur_period)
1117 atomic64_set(&iocg->active_period, cur_period);
1118 return true;
1119 }
1120
1121 /* racy check on internal node IOs, treat as root level IOs */
1122 if (iocg->child_active_sum)
1123 return false;
1124
1125 spin_lock_irq(&ioc->lock);
1126
1127 ioc_now(ioc, now);
1128
1129 /* update period */
1130 cur_period = atomic64_read(&ioc->cur_period);
1131 last_period = atomic64_read(&iocg->active_period);
1132 atomic64_set(&iocg->active_period, cur_period);
1133
1134 /* already activated or breaking leaf-only constraint? */
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001135 if (!list_empty(&iocg->active_list))
1136 goto succeed_unlock;
1137 for (i = iocg->level - 1; i > 0; i--)
1138 if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo7caa4712019-08-28 15:05:58 -07001139 goto fail_unlock;
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001140
Tejun Heo7caa4712019-08-28 15:05:58 -07001141 if (iocg->child_active_sum)
1142 goto fail_unlock;
1143
1144 /*
1145 * vtime may wrap when vrate is raised substantially due to
1146 * underestimated IO costs. Look at the period and ignore its
1147 * vtime if the iocg has been idle for too long. Also, cap the
1148 * budget it can start with to the margin.
1149 */
1150 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1151 vtime = atomic64_read(&iocg->vtime);
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001152 vmin = now->vnow - ioc->margins.max;
Tejun Heo7caa4712019-08-28 15:05:58 -07001153
1154 if (last_period + max_period_delta < cur_period ||
1155 time_before64(vtime, vmin)) {
1156 atomic64_add(vmin - vtime, &iocg->vtime);
1157 atomic64_add(vmin - vtime, &iocg->done_vtime);
1158 vtime = vmin;
1159 }
1160
1161 /*
1162 * Activate, propagate weight and start period timer if not
1163 * running. Reset hweight_gen to avoid accidental match from
1164 * wrapping.
1165 */
1166 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1167 list_add(&iocg->active_list, &ioc->active_iocgs);
Tejun Heo00410f12020-09-01 14:52:34 -04001168 propagate_weights(iocg, iocg->weight,
1169 iocg->last_inuse ?: iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001170
1171 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1172 last_period, cur_period, vtime);
1173
Tejun Heo1aa50d02020-09-01 14:52:44 -04001174 iocg->activated_at = now->now;
Tejun Heo7caa4712019-08-28 15:05:58 -07001175
1176 if (ioc->running == IOC_IDLE) {
1177 ioc->running = IOC_RUNNING;
1178 ioc_start_period(ioc, now);
1179 }
1180
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001181succeed_unlock:
Tejun Heo7caa4712019-08-28 15:05:58 -07001182 spin_unlock_irq(&ioc->lock);
1183 return true;
1184
1185fail_unlock:
1186 spin_unlock_irq(&ioc->lock);
1187 return false;
1188}
1189
Tejun Heo6ef20f72020-09-01 14:52:36 -04001190static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1191{
1192 struct ioc *ioc = iocg->ioc;
1193 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1194 u64 vtime = atomic64_read(&iocg->vtime);
Tejun Heo6ef20f72020-09-01 14:52:36 -04001195 u64 delta_ns, expires, oexpires;
1196 u32 hw_inuse;
1197
1198 lockdep_assert_held(&iocg->waitq.lock);
1199
1200 /* debt-adjust vtime */
1201 current_hweight(iocg, NULL, &hw_inuse);
1202 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1203
1204 /*
1205 * Clear or maintain depending on the overage. Non-zero vdebt is what
1206 * guarantees that @iocg is online and future iocg_kick_delay() will
1207 * clear use_delay. Don't leave it on when there's no vdebt.
1208 */
1209 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1210 blkcg_clear_delay(blkg);
1211 return false;
1212 }
1213 if (!atomic_read(&blkg->use_delay) &&
Tejun Heof1de2432020-09-01 14:52:49 -04001214 time_before_eq64(vtime, now->vnow + ioc->margins.target))
Tejun Heo6ef20f72020-09-01 14:52:36 -04001215 return false;
1216
1217 /* use delay */
1218 delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1219 now->vrate) * NSEC_PER_USEC;
1220 blkcg_set_delay(blkg, delta_ns);
1221 expires = now->now_ns + delta_ns;
1222
1223 /* if already active and close enough, don't bother */
1224 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1225 if (hrtimer_is_queued(&iocg->delay_timer) &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001226 abs(oexpires - expires) <= ioc->timer_slack_ns)
Tejun Heo6ef20f72020-09-01 14:52:36 -04001227 return true;
1228
1229 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001230 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
Tejun Heo6ef20f72020-09-01 14:52:36 -04001231 return true;
1232}
1233
1234static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1235{
1236 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1237 struct ioc_now now;
1238 unsigned long flags;
1239
1240 spin_lock_irqsave(&iocg->waitq.lock, flags);
1241 ioc_now(iocg->ioc, &now);
1242 iocg_kick_delay(iocg, &now);
1243 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1244
1245 return HRTIMER_NORESTART;
1246}
1247
Tejun Heo7caa4712019-08-28 15:05:58 -07001248static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1249 int flags, void *key)
1250{
1251 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1252 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1253 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1254
1255 ctx->vbudget -= cost;
1256
1257 if (ctx->vbudget < 0)
1258 return -1;
1259
Tejun Heo97eb1972020-09-01 14:52:43 -04001260 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
Tejun Heo7caa4712019-08-28 15:05:58 -07001261
1262 /*
1263 * autoremove_wake_function() removes the wait entry only when it
1264 * actually changed the task state. We want the wait always
1265 * removed. Remove explicitly and use default_wake_function().
1266 */
1267 list_del_init(&wq_entry->entry);
1268 wait->committed = true;
1269
1270 default_wake_function(wq_entry, mode, flags, key);
1271 return 0;
1272}
1273
Tejun Heoda437b92020-09-01 14:52:42 -04001274/*
1275 * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1276 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1277 * addition to iocg->waitq.lock.
1278 */
1279static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1280 struct ioc_now *now)
Tejun Heo7caa4712019-08-28 15:05:58 -07001281{
1282 struct ioc *ioc = iocg->ioc;
1283 struct iocg_wake_ctx ctx = { .iocg = iocg };
Tejun Heoda437b92020-09-01 14:52:42 -04001284 u64 vshortage, expires, oexpires;
Tejun Heo36a52482019-09-04 12:45:52 -07001285 s64 vbudget;
1286 u32 hw_inuse;
Tejun Heo7caa4712019-08-28 15:05:58 -07001287
1288 lockdep_assert_held(&iocg->waitq.lock);
1289
Tejun Heo36a52482019-09-04 12:45:52 -07001290 current_hweight(iocg, NULL, &hw_inuse);
1291 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1292
1293 /* pay off debt */
Tejun Heoda437b92020-09-01 14:52:42 -04001294 if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1295 u64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
Tejun Heo36a52482019-09-04 12:45:52 -07001296 u64 delta = min_t(u64, vbudget, vdebt);
1297 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
Tejun Heo0b80f982020-05-04 19:27:54 -04001298 iocg->abs_vdebt);
Tejun Heo36a52482019-09-04 12:45:52 -07001299
Tejun Heoda437b92020-09-01 14:52:42 -04001300 lockdep_assert_held(&ioc->lock);
1301
Tejun Heo36a52482019-09-04 12:45:52 -07001302 atomic64_add(delta, &iocg->vtime);
1303 atomic64_add(delta, &iocg->done_vtime);
Tejun Heo0b80f982020-05-04 19:27:54 -04001304 iocg->abs_vdebt -= abs_delta;
Tejun Heoda437b92020-09-01 14:52:42 -04001305 vbudget -= vdebt;
Tejun Heo7b84b492020-09-01 14:52:37 -04001306
1307 iocg_kick_delay(iocg, now);
Tejun Heo36a52482019-09-04 12:45:52 -07001308 }
1309
Tejun Heo7caa4712019-08-28 15:05:58 -07001310 /*
Tejun Heoda437b92020-09-01 14:52:42 -04001311 * Debt can still be outstanding if we haven't paid all yet or the
1312 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1313 * under debt. Make sure @vbudget reflects the outstanding amount and is
1314 * not positive.
1315 */
1316 if (iocg->abs_vdebt) {
1317 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1318 vbudget = min_t(s64, 0, vbudget - vdebt);
1319 }
1320
1321 /*
Tejun Heo7caa4712019-08-28 15:05:58 -07001322 * Wake up the ones which are due and see how much vtime we'll need
1323 * for the next one.
1324 */
Tejun Heo36a52482019-09-04 12:45:52 -07001325 ctx.hw_inuse = hw_inuse;
Tejun Heoda437b92020-09-01 14:52:42 -04001326 ctx.vbudget = vbudget;
Tejun Heo7caa4712019-08-28 15:05:58 -07001327 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1328 if (!waitqueue_active(&iocg->waitq))
1329 return;
1330 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1331 return;
1332
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001333 /* determine next wakeup, add a timer margin to guarantee chunking */
Tejun Heo7caa4712019-08-28 15:05:58 -07001334 vshortage = -ctx.vbudget;
1335 expires = now->now_ns +
1336 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001337 expires += ioc->timer_slack_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -07001338
1339 /* if already active and close enough, don't bother */
1340 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1341 if (hrtimer_is_queued(&iocg->waitq_timer) &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001342 abs(oexpires - expires) <= ioc->timer_slack_ns)
Tejun Heo7caa4712019-08-28 15:05:58 -07001343 return;
1344
1345 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04001346 ioc->timer_slack_ns, HRTIMER_MODE_ABS);
Tejun Heo7caa4712019-08-28 15:05:58 -07001347}
1348
1349static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1350{
1351 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
Tejun Heoda437b92020-09-01 14:52:42 -04001352 bool pay_debt = READ_ONCE(iocg->abs_vdebt);
Tejun Heo7caa4712019-08-28 15:05:58 -07001353 struct ioc_now now;
1354 unsigned long flags;
1355
1356 ioc_now(iocg->ioc, &now);
1357
Tejun Heoda437b92020-09-01 14:52:42 -04001358 iocg_lock(iocg, pay_debt, &flags);
1359 iocg_kick_waitq(iocg, pay_debt, &now);
1360 iocg_unlock(iocg, pay_debt, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07001361
1362 return HRTIMER_NORESTART;
1363}
1364
Tejun Heo7caa4712019-08-28 15:05:58 -07001365static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1366{
1367 u32 nr_met[2] = { };
1368 u32 nr_missed[2] = { };
1369 u64 rq_wait_ns = 0;
1370 int cpu, rw;
1371
1372 for_each_online_cpu(cpu) {
1373 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1374 u64 this_rq_wait_ns;
1375
1376 for (rw = READ; rw <= WRITE; rw++) {
Tejun Heo5e124f72020-09-01 14:52:33 -04001377 u32 this_met = local_read(&stat->missed[rw].nr_met);
1378 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001379
1380 nr_met[rw] += this_met - stat->missed[rw].last_met;
1381 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1382 stat->missed[rw].last_met = this_met;
1383 stat->missed[rw].last_missed = this_missed;
1384 }
1385
Tejun Heo5e124f72020-09-01 14:52:33 -04001386 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
Tejun Heo7caa4712019-08-28 15:05:58 -07001387 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1388 stat->last_rq_wait_ns = this_rq_wait_ns;
1389 }
1390
1391 for (rw = READ; rw <= WRITE; rw++) {
1392 if (nr_met[rw] + nr_missed[rw])
1393 missed_ppm_ar[rw] =
1394 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1395 nr_met[rw] + nr_missed[rw]);
1396 else
1397 missed_ppm_ar[rw] = 0;
1398 }
1399
1400 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1401 ioc->period_us * NSEC_PER_USEC);
1402}
1403
1404/* was iocg idle this period? */
1405static bool iocg_is_idle(struct ioc_gq *iocg)
1406{
1407 struct ioc *ioc = iocg->ioc;
1408
1409 /* did something get issued this period? */
1410 if (atomic64_read(&iocg->active_period) ==
1411 atomic64_read(&ioc->cur_period))
1412 return false;
1413
1414 /* is something in flight? */
Tejun Heodcd65892020-03-10 13:07:46 -04001415 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo7caa4712019-08-28 15:05:58 -07001416 return false;
1417
1418 return true;
1419}
1420
Tejun Heo97eb1972020-09-01 14:52:43 -04001421/*
1422 * Call this function on the target leaf @iocg's to build pre-order traversal
1423 * list of all the ancestors in @inner_walk. The inner nodes are linked through
1424 * ->walk_list and the caller is responsible for dissolving the list after use.
1425 */
1426static void iocg_build_inner_walk(struct ioc_gq *iocg,
1427 struct list_head *inner_walk)
1428{
1429 int lvl;
1430
1431 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1432
1433 /* find the first ancestor which hasn't been visited yet */
1434 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1435 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1436 break;
1437 }
1438
1439 /* walk down and visit the inner nodes to get pre-order traversal */
1440 while (++lvl <= iocg->level - 1) {
1441 struct ioc_gq *inner = iocg->ancestors[lvl];
1442
1443 /* record traversal order */
1444 list_add_tail(&inner->walk_list, inner_walk);
1445 }
1446}
1447
1448/* collect per-cpu counters and propagate the deltas to the parent */
1449static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1450{
1451 struct iocg_stat new_stat;
1452 u64 abs_vusage = 0;
1453 u64 vusage_delta;
1454 int cpu;
1455
1456 lockdep_assert_held(&iocg->ioc->lock);
1457
1458 /* collect per-cpu counters */
1459 for_each_possible_cpu(cpu) {
1460 abs_vusage += local64_read(
1461 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1462 }
1463 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1464 iocg->last_stat_abs_vusage = abs_vusage;
1465
Tejun Heo1aa50d02020-09-01 14:52:44 -04001466 iocg->usage_delta_us = div64_u64(vusage_delta, now->vrate);
1467 iocg->local_stat.usage_us += iocg->usage_delta_us;
Tejun Heo97eb1972020-09-01 14:52:43 -04001468
1469 new_stat.usage_us =
1470 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1471
1472 /* propagate the deltas to the parent */
1473 if (iocg->level > 0) {
1474 struct iocg_stat *parent_stat =
1475 &iocg->ancestors[iocg->level - 1]->desc_stat;
1476
1477 parent_stat->usage_us +=
1478 new_stat.usage_us - iocg->last_stat.usage_us;
1479 }
1480
1481 iocg->last_stat = new_stat;
1482}
1483
1484/* get stat counters ready for reading on all active iocgs */
1485static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1486{
1487 LIST_HEAD(inner_walk);
1488 struct ioc_gq *iocg, *tiocg;
1489
1490 /* flush leaves and build inner node walk list */
1491 list_for_each_entry(iocg, target_iocgs, active_list) {
1492 iocg_flush_stat_one(iocg, now);
1493 iocg_build_inner_walk(iocg, &inner_walk);
1494 }
1495
1496 /* keep flushing upwards by walking the inner list backwards */
1497 list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1498 iocg_flush_stat_one(iocg, now);
1499 list_del_init(&iocg->walk_list);
1500 }
1501}
1502
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001503/*
1504 * Determine what @iocg's hweight_inuse should be after donating unused
1505 * capacity. @hwm is the upper bound and used to signal no donation. This
1506 * function also throws away @iocg's excess budget.
1507 */
1508static u32 hweight_after_donation(struct ioc_gq *iocg, u32 hwm, u32 usage,
1509 struct ioc_now *now)
Tejun Heo7caa4712019-08-28 15:05:58 -07001510{
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001511 struct ioc *ioc = iocg->ioc;
1512 u64 vtime = atomic64_read(&iocg->vtime);
Tejun Heof1de2432020-09-01 14:52:49 -04001513 s64 excess, delta, target, new_hwi;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001514
1515 /* see whether minimum margin requirement is met */
1516 if (waitqueue_active(&iocg->waitq) ||
1517 time_after64(vtime, now->vnow - ioc->margins.min))
1518 return hwm;
1519
1520 /* throw away excess above max */
1521 excess = now->vnow - vtime - ioc->margins.max;
1522 if (excess > 0) {
1523 atomic64_add(excess, &iocg->vtime);
1524 atomic64_add(excess, &iocg->done_vtime);
1525 vtime += excess;
1526 }
1527
Tejun Heof1de2432020-09-01 14:52:49 -04001528 /*
1529 * Let's say the distance between iocg's and device's vtimes as a
1530 * fraction of period duration is delta. Assuming that the iocg will
1531 * consume the usage determined above, we want to determine new_hwi so
1532 * that delta equals MARGIN_TARGET at the end of the next period.
1533 *
1534 * We need to execute usage worth of IOs while spending the sum of the
1535 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1536 * (delta):
1537 *
1538 * usage = (1 - MARGIN_TARGET + delta) * new_hwi
1539 *
1540 * Therefore, the new_hwi is:
1541 *
1542 * new_hwi = usage / (1 - MARGIN_TARGET + delta)
1543 */
1544 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1545 now->vnow - ioc->period_at_vtime);
1546 target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1547 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
Tejun Heo7caa4712019-08-28 15:05:58 -07001548
Tejun Heof1de2432020-09-01 14:52:49 -04001549 return clamp_t(s64, new_hwi, 1, hwm);
Tejun Heo7caa4712019-08-28 15:05:58 -07001550}
1551
Tejun Heoe08d02a2020-09-01 14:52:48 -04001552/*
1553 * For work-conservation, an iocg which isn't using all of its share should
1554 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1555 * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1556 *
1557 * #1 is mathematically simpler but has the drawback of requiring synchronous
1558	 * global hweight_inuse updates when idle iocgs get activated or inuse weights
1559 * change due to donation snapbacks as it has the possibility of grossly
1560 * overshooting what's allowed by the model and vrate.
1561 *
1562 * #2 is inherently safe with local operations. The donating iocg can easily
1563 * snap back to higher weights when needed without worrying about impacts on
1564 * other nodes as the impacts will be inherently correct. This also makes idle
1565 * iocg activations safe. The only effect activations have is decreasing
1566 * hweight_inuse of others, the right solution to which is for those iocgs to
1567 * snap back to higher weights.
1568 *
1569 * So, we go with #2. The challenge is calculating how each donating iocg's
1570 * inuse should be adjusted to achieve the target donation amounts. This is done
1571 * using Andy's method described in the following pdf.
1572 *
1573 * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1574 *
1575 * Given the weights and target after-donation hweight_inuse values, Andy's
1576	 * method determines what the proportional distribution should look like at each
1577 * sibling level to maintain the relative relationship between all non-donating
1578 * pairs. To roughly summarize, it divides the tree into donating and
1579 * non-donating parts, calculates global donation rate which is used to
1580 * determine the target hweight_inuse for each node, and then derives per-level
1581 * proportions.
1582 *
1583 * The following pdf shows that global distribution calculated this way can be
1584 * achieved by scaling inuse weights of donating leaves and propagating the
1585 * adjustments upwards proportionally.
1586 *
1587 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1588 *
1589 * Combining the above two, we can determine how each leaf iocg's inuse should
1590 * be adjusted to achieve the target donation.
1591 *
1592 * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1593 *
1594 * The inline comments use symbols from the last pdf.
1595 *
1596 * b is the sum of the absolute budgets in the subtree. 1 for the root node.
1597 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1598 * t is the sum of the absolute budgets of donating nodes in the subtree.
1599 * w is the weight of the node. w = w_f + w_t
1600 * w_f is the non-donating portion of w. w_f = w * f / b
1601	 * w_t is the donating portion of w. w_t = w * t / b
1602 * s is the sum of all sibling weights. s = Sum(w) for siblings
1603 * s_f and s_t are the non-donating and donating portions of s.
1604 *
1605 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1606 * w_pt is the donating portion of the parent's weight and w'_pt the same value
1607 * after adjustments. Subscript r denotes the root node's values.
1608 */
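/*
 * Illustrative walk-through of the symbols above (numbers are made up and
 * integer rounding is ignored): take a root with two children A and B, both
 * with active weight 100, so each has hweight_active = 50%. Only B donates
 * and wants hweight_after_donation = 25%. At the root t_r = 50% and
 * t'_r = 25%, so gamma = (1 - 0.25) / (1 - 0.5) = 1.5 and the non-donating
 * A's hweight_inuse is inflated from 50% to 75%. The root's
 * child_adjusted_sum becomes 200 * 0.5 / 0.75 ~= 134 and B's inuse is
 * lowered to 134 * 0.25 ~= 34, leaving A with 100/134 ~= 75% and B with
 * 34/134 ~= 25%, matching the targets.
 */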
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001609static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1610{
Tejun Heoe08d02a2020-09-01 14:52:48 -04001611 LIST_HEAD(over_hwa);
1612 LIST_HEAD(inner_walk);
1613 struct ioc_gq *iocg, *tiocg, *root_iocg;
1614 u32 after_sum, over_sum, over_target, gamma;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001615
Tejun Heoe08d02a2020-09-01 14:52:48 -04001616 /*
1617 * It's pretty unlikely but possible for the total sum of
1618 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1619 * confuse the following calculations. If such condition is detected,
1620 * scale down everyone over its full share equally to keep the sum below
1621 * WEIGHT_ONE.
1622 */
1623 after_sum = 0;
1624 over_sum = 0;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001625 list_for_each_entry(iocg, surpluses, surplus_list) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001626 u32 hwa;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001627
Tejun Heoe08d02a2020-09-01 14:52:48 -04001628 current_hweight(iocg, &hwa, NULL);
1629 after_sum += iocg->hweight_after_donation;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001630
Tejun Heoe08d02a2020-09-01 14:52:48 -04001631 if (iocg->hweight_after_donation > hwa) {
1632 over_sum += iocg->hweight_after_donation;
1633 list_add(&iocg->walk_list, &over_hwa);
1634 }
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001635 }
Tejun Heoe08d02a2020-09-01 14:52:48 -04001636
1637 if (after_sum >= WEIGHT_ONE) {
1638 /*
1639		 * The delta should be deducted from over_sum; calculate the
1640 * target over_sum value.
1641 */
1642 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1643 WARN_ON_ONCE(over_sum <= over_delta);
1644 over_target = over_sum - over_delta;
1645 } else {
1646 over_target = 0;
1647 }
1648
1649 list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1650 if (over_target)
1651 iocg->hweight_after_donation =
1652 div_u64((u64)iocg->hweight_after_donation *
1653 over_target, over_sum);
1654 list_del_init(&iocg->walk_list);
1655 }
1656
1657 /*
1658 * Build pre-order inner node walk list and prepare for donation
1659 * adjustment calculations.
1660 */
1661 list_for_each_entry(iocg, surpluses, surplus_list) {
1662 iocg_build_inner_walk(iocg, &inner_walk);
1663 }
1664
1665 root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1666 WARN_ON_ONCE(root_iocg->level > 0);
1667
1668 list_for_each_entry(iocg, &inner_walk, walk_list) {
1669 iocg->child_adjusted_sum = 0;
1670 iocg->hweight_donating = 0;
1671 iocg->hweight_after_donation = 0;
1672 }
1673
1674 /*
1675 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1676 * up the hierarchy.
1677 */
1678 list_for_each_entry(iocg, surpluses, surplus_list) {
1679 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1680
1681 parent->hweight_donating += iocg->hweight_donating;
1682 parent->hweight_after_donation += iocg->hweight_after_donation;
1683 }
1684
1685 list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1686 if (iocg->level > 0) {
1687 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1688
1689 parent->hweight_donating += iocg->hweight_donating;
1690 parent->hweight_after_donation += iocg->hweight_after_donation;
1691 }
1692 }
1693
1694 /*
1695 * Calculate inner hwa's (b) and make sure the donation values are
1696 * within the accepted ranges as we're doing low res calculations with
1697 * roundups.
1698 */
1699 list_for_each_entry(iocg, &inner_walk, walk_list) {
1700 if (iocg->level) {
1701 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1702
1703 iocg->hweight_active = DIV64_U64_ROUND_UP(
1704 (u64)parent->hweight_active * iocg->active,
1705 parent->child_active_sum);
1706
1707 }
1708
1709 iocg->hweight_donating = min(iocg->hweight_donating,
1710 iocg->hweight_active);
1711 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1712 iocg->hweight_donating - 1);
1713 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1714 iocg->hweight_donating <= 1 ||
1715 iocg->hweight_after_donation == 0)) {
1716 pr_warn("iocg: invalid donation weights in ");
1717 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1718 pr_cont(": active=%u donating=%u after=%u\n",
1719 iocg->hweight_active, iocg->hweight_donating,
1720 iocg->hweight_after_donation);
1721 }
1722 }
1723
1724 /*
1725 * Calculate the global donation rate (gamma) - the rate to adjust
1726 * non-donating budgets by. No need to use 64bit multiplication here as
1727 * the first operand is guaranteed to be smaller than WEIGHT_ONE
1728 * (1<<16).
1729 *
1730 * gamma = (1 - t_r') / (1 - t_r)
1731 */
1732 gamma = DIV_ROUND_UP(
1733 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1734 WEIGHT_ONE - root_iocg->hweight_donating);
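	/*
	 * e.g. (illustrative) if donating nodes held 50% of the root's budget
	 * and will keep 20% after donation, gamma = (1 - 0.2) / (1 - 0.5) =
	 * 1.6, i.e. every non-donating budget is inflated by 60% to soak up
	 * the freed capacity.
	 */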
1735
1736 /*
1737 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1738 * nodes.
1739 */
1740 list_for_each_entry(iocg, &inner_walk, walk_list) {
1741 struct ioc_gq *parent;
1742 u32 inuse, wpt, wptp;
1743 u64 st, sf;
1744
1745 if (iocg->level == 0) {
1746 /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1747 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1748 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1749 WEIGHT_ONE - iocg->hweight_after_donation);
1750 continue;
1751 }
1752
1753 parent = iocg->ancestors[iocg->level - 1];
1754
1755 /* b' = gamma * b_f + b_t' */
1756 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1757 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1758 WEIGHT_ONE) + iocg->hweight_after_donation;
1759
1760 /* w' = s' * b' / b'_p */
1761 inuse = DIV64_U64_ROUND_UP(
1762 (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1763 parent->hweight_inuse);
1764
1765 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1766 st = DIV64_U64_ROUND_UP(
1767 iocg->child_active_sum * iocg->hweight_donating,
1768 iocg->hweight_active);
1769 sf = iocg->child_active_sum - st;
1770 wpt = DIV64_U64_ROUND_UP(
1771 (u64)iocg->active * iocg->hweight_donating,
1772 iocg->hweight_active);
1773 wptp = DIV64_U64_ROUND_UP(
1774 (u64)inuse * iocg->hweight_after_donation,
1775 iocg->hweight_inuse);
1776
1777 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1778 }
1779
1780 /*
1781 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
1782 * we can finally determine leaf adjustments.
1783 */
1784 list_for_each_entry(iocg, surpluses, surplus_list) {
1785 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1786 u32 inuse;
1787
1788 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
1789 inuse = DIV64_U64_ROUND_UP(
1790 parent->child_adjusted_sum * iocg->hweight_after_donation,
1791 parent->hweight_inuse);
1792 __propagate_weights(iocg, iocg->active, inuse);
1793 }
1794
1795 /* walk list should be dissolved after use */
1796 list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
1797 list_del_init(&iocg->walk_list);
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001798}
1799
Tejun Heo7caa4712019-08-28 15:05:58 -07001800static void ioc_timer_fn(struct timer_list *timer)
1801{
1802 struct ioc *ioc = container_of(timer, struct ioc, timer);
1803 struct ioc_gq *iocg, *tiocg;
1804 struct ioc_now now;
Tejun Heo8692d2d2020-09-01 14:52:45 -04001805 LIST_HEAD(surpluses);
Tejun Heo065655c2020-09-01 14:52:46 -04001806 int nr_shortages = 0, nr_lagging = 0;
Tejun Heo7caa4712019-08-28 15:05:58 -07001807 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1808 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1809 u32 missed_ppm[2], rq_wait_pct;
1810 u64 period_vtime;
Tejun Heof1de2432020-09-01 14:52:49 -04001811 int prev_busy_level;
Tejun Heo7caa4712019-08-28 15:05:58 -07001812
1813 /* how were the latencies during the period? */
1814 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1815
1816 /* take care of active iocgs */
1817 spin_lock_irq(&ioc->lock);
1818
1819 ioc_now(ioc, &now);
1820
1821 period_vtime = now.vnow - ioc->period_at_vtime;
1822 if (WARN_ON_ONCE(!period_vtime)) {
1823 spin_unlock_irq(&ioc->lock);
1824 return;
1825 }
1826
Tejun Heo97eb1972020-09-01 14:52:43 -04001827 iocg_flush_stat(&ioc->active_iocgs, &now);
1828
Tejun Heo7caa4712019-08-28 15:05:58 -07001829 /*
1830 * Waiters determine the sleep durations based on the vrate they
1831 * saw at the time of sleep. If vrate has increased, some waiters
1832 * could be sleeping for too long. Wake up tardy waiters which
1833 * should have woken up in the last period and expire idle iocgs.
1834 */
1835 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Chengming Zhoud9012a52020-07-30 17:03:21 +08001836 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001837 !iocg_is_idle(iocg))
Tejun Heo7caa4712019-08-28 15:05:58 -07001838 continue;
1839
1840 spin_lock(&iocg->waitq.lock);
1841
Tejun Heo0b80f982020-05-04 19:27:54 -04001842 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001843 /* might be oversleeping vtime / hweight changes, kick */
Tejun Heoda437b92020-09-01 14:52:42 -04001844 iocg_kick_waitq(iocg, true, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07001845 } else if (iocg_is_idle(iocg)) {
1846 /* no waiter and idle, deactivate */
1847 iocg->last_inuse = iocg->inuse;
Tejun Heo00410f12020-09-01 14:52:34 -04001848 __propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07001849 list_del_init(&iocg->active_list);
1850 }
1851
1852 spin_unlock(&iocg->waitq.lock);
1853 }
Tejun Heo00410f12020-09-01 14:52:34 -04001854 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001855
Tejun Heof1de2432020-09-01 14:52:49 -04001856 /* calc usage and see whether some weights need to be moved around */
Tejun Heo7caa4712019-08-28 15:05:58 -07001857 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
Tejun Heof1de2432020-09-01 14:52:49 -04001858 u64 vdone, vtime, usage_us, usage_dur;
1859 u32 usage, hw_active, hw_inuse;
Tejun Heo7caa4712019-08-28 15:05:58 -07001860
1861 /*
1862 * Collect unused and wind vtime closer to vnow to prevent
1863 * iocgs from accumulating a large amount of budget.
1864 */
1865 vdone = atomic64_read(&iocg->done_vtime);
1866 vtime = atomic64_read(&iocg->vtime);
1867 current_hweight(iocg, &hw_active, &hw_inuse);
1868
1869 /*
1870 * Latency QoS detection doesn't account for IOs which are
1871 * in-flight for longer than a period. Detect them by
1872 * comparing vdone against period start. If lagging behind
1873 * IOs from past periods, don't increase vrate.
1874 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001875 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1876 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001877 time_after64(vtime, vdone) &&
1878 time_after64(vtime, now.vnow -
1879 MAX_LAGGING_PERIODS * period_vtime) &&
1880 time_before64(vdone, now.vnow - period_vtime))
1881 nr_lagging++;
1882
Tejun Heo7caa4712019-08-28 15:05:58 -07001883 /*
Tejun Heof1de2432020-09-01 14:52:49 -04001884 * Determine absolute usage factoring in in-flight IOs to avoid
1885 * high-latency completions appearing as idle.
Tejun Heo7caa4712019-08-28 15:05:58 -07001886 */
Tejun Heo1aa50d02020-09-01 14:52:44 -04001887 usage_us = iocg->usage_delta_us;
Tejun Heof1de2432020-09-01 14:52:49 -04001888
Tejun Heo1aa50d02020-09-01 14:52:44 -04001889 if (vdone != vtime) {
1890 u64 inflight_us = DIV64_U64_ROUND_UP(
1891 cost_to_abs_cost(vtime - vdone, hw_inuse),
1892 now.vrate);
1893 usage_us = max(usage_us, inflight_us);
1894 }
Tejun Heo7caa4712019-08-28 15:05:58 -07001895
Tejun Heof1de2432020-09-01 14:52:49 -04001896 /* convert to hweight based usage ratio */
1897 if (time_after64(iocg->activated_at, ioc->period_at))
1898 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
1899 else
1900 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
Tejun Heo1aa50d02020-09-01 14:52:44 -04001901
Tejun Heof1de2432020-09-01 14:52:49 -04001902 usage = clamp_t(u32,
1903 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
1904 usage_dur),
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001905 1, WEIGHT_ONE);
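		/*
		 * e.g. (illustrative) 30000us of usage over a 100000us window
		 * yields usage = 0.3 * WEIGHT_ONE, i.e. the iocg occupied
		 * roughly 30% of the device during this period.
		 */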
Tejun Heo7caa4712019-08-28 15:05:58 -07001906
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001907 /* see whether there's surplus vtime */
1908 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
1909 if (hw_inuse < hw_active ||
1910 (!waitqueue_active(&iocg->waitq) &&
Tejun Heof1de2432020-09-01 14:52:49 -04001911 time_before64(vtime, now.vnow - ioc->margins.low))) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001912 u32 hwa, hwm, new_hwi;
Tejun Heo7caa4712019-08-28 15:05:58 -07001913
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001914 /*
1915 * Already donating or accumulated enough to start.
1916 * Determine the donation amount.
1917 */
Tejun Heoe08d02a2020-09-01 14:52:48 -04001918 current_hweight(iocg, &hwa, NULL);
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001919 hwm = current_hweight_max(iocg);
1920 new_hwi = hweight_after_donation(iocg, hwm, usage,
1921 &now);
1922 if (new_hwi < hwm) {
Tejun Heoe08d02a2020-09-01 14:52:48 -04001923 iocg->hweight_donating = hwa;
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001924 iocg->hweight_after_donation = new_hwi;
1925 list_add(&iocg->surplus_list, &surpluses);
1926 } else {
1927 __propagate_weights(iocg, iocg->active,
1928 iocg->active);
1929 nr_shortages++;
1930 }
1931 } else {
1932 /* genuinely short on vtime */
1933 nr_shortages++;
Tejun Heo7caa4712019-08-28 15:05:58 -07001934 }
1935 }
Tejun Heo93f7d2d2020-09-01 14:52:47 -04001936
1937 if (!list_empty(&surpluses) && nr_shortages)
1938 transfer_surpluses(&surpluses, &now);
1939
Tejun Heo00410f12020-09-01 14:52:34 -04001940 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001941
Tejun Heo8692d2d2020-09-01 14:52:45 -04001942 /* surplus list should be dissolved after use */
1943 list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
1944 list_del_init(&iocg->surplus_list);
1945
Tejun Heo7caa4712019-08-28 15:05:58 -07001946 /*
1947 * If q is getting clogged or we're missing too much, we're issuing
1948 * too much IO and should lower vtime rate. If we're not missing
1949 * and experiencing shortages but not surpluses, we're too stingy
1950 * and should increase vtime rate.
1951 */
Tejun Heo25d41e42019-09-25 16:02:07 -07001952 prev_busy_level = ioc->busy_level;
Tejun Heo7caa4712019-08-28 15:05:58 -07001953 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1954 missed_ppm[READ] > ppm_rthr ||
1955 missed_ppm[WRITE] > ppm_wthr) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001956 /* clearly missing QoS targets, slow down vrate */
Tejun Heo7caa4712019-08-28 15:05:58 -07001957 ioc->busy_level = max(ioc->busy_level, 0);
1958 ioc->busy_level++;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001959 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001960 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1961 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001962 /* QoS targets are being met with >25% margin */
1963 if (nr_shortages) {
1964 /*
1965 * We're throttling while the device has spare
1966 * capacity. If vrate was being slowed down, stop.
1967 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001968 ioc->busy_level = min(ioc->busy_level, 0);
Tejun Heo81ca6272019-10-14 17:18:11 -07001969
1970 /*
1971 * If there are IOs spanning multiple periods, wait
Tejun Heo065655c2020-09-01 14:52:46 -04001972 * them out before pushing the device harder.
Tejun Heo81ca6272019-10-14 17:18:11 -07001973 */
Tejun Heo065655c2020-09-01 14:52:46 -04001974 if (!nr_lagging)
Tejun Heo7cd806a2019-09-25 16:03:09 -07001975 ioc->busy_level--;
Tejun Heo81ca6272019-10-14 17:18:11 -07001976 } else {
1977 /*
1978 * Nobody is being throttled and the users aren't
1979 * issuing enough IOs to saturate the device. We
1980 * simply don't know how close the device is to
1981 * saturation. Coast.
1982 */
1983 ioc->busy_level = 0;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001984 }
Tejun Heo7caa4712019-08-28 15:05:58 -07001985 } else {
Tejun Heo81ca6272019-10-14 17:18:11 -07001986		/* inside the hysteresis margin, we're good */
Tejun Heo7caa4712019-08-28 15:05:58 -07001987 ioc->busy_level = 0;
1988 }
1989
1990 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1991
Tejun Heo7cd806a2019-09-25 16:03:09 -07001992 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001993 u64 vrate = atomic64_read(&ioc->vtime_rate);
1994 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1995
1996 /* rq_wait signal is always reliable, ignore user vrate_min */
1997 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1998 vrate_min = VRATE_MIN;
1999
2000 /*
2001 * If vrate is out of bounds, apply clamp gradually as the
2002 * bounds can change abruptly. Otherwise, apply busy_level
2003 * based adjustment.
2004 */
2005 if (vrate < vrate_min) {
2006 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
2007 100);
2008 vrate = min(vrate, vrate_min);
2009 } else if (vrate > vrate_max) {
2010 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
2011 100);
2012 vrate = max(vrate, vrate_max);
2013 } else {
2014 int idx = min_t(int, abs(ioc->busy_level),
2015 ARRAY_SIZE(vrate_adj_pct) - 1);
2016 u32 adj_pct = vrate_adj_pct[idx];
2017
2018 if (ioc->busy_level > 0)
2019 adj_pct = 100 - adj_pct;
2020 else
2021 adj_pct = 100 + adj_pct;
2022
2023 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
2024 vrate_min, vrate_max);
2025 }
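		/*
		 * Illustrative example: with busy_level == -3 and, say,
		 * vrate_adj_pct[3] == 2, adj_pct becomes 102 and vrate is
		 * raised by 2%, still clamped to [vrate_min, vrate_max].
		 */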
2026
Waiman Longd6c8e942020-04-21 09:07:55 -04002027 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
Tejun Heo065655c2020-09-01 14:52:46 -04002028 nr_lagging, nr_shortages);
Tejun Heo7caa4712019-08-28 15:05:58 -07002029
2030 atomic64_set(&ioc->vtime_rate, vrate);
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04002031 ioc_refresh_margins(ioc);
Tejun Heo25d41e42019-09-25 16:02:07 -07002032 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
2033 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
Waiman Longd6c8e942020-04-21 09:07:55 -04002034 missed_ppm, rq_wait_pct, nr_lagging,
Tejun Heo065655c2020-09-01 14:52:46 -04002035 nr_shortages);
Tejun Heo7caa4712019-08-28 15:05:58 -07002036 }
2037
2038 ioc_refresh_params(ioc, false);
2039
2040 /*
2041 * This period is done. Move onto the next one. If nothing's
2042 * going on with the device, stop the timer.
2043 */
2044 atomic64_inc(&ioc->cur_period);
2045
2046 if (ioc->running != IOC_STOP) {
2047 if (!list_empty(&ioc->active_iocgs)) {
2048 ioc_start_period(ioc, &now);
2049 } else {
2050 ioc->busy_level = 0;
2051 ioc->running = IOC_IDLE;
2052 }
2053 }
2054
2055 spin_unlock_irq(&ioc->lock);
2056}
2057
2058static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2059 bool is_merge, u64 *costp)
2060{
2061 struct ioc *ioc = iocg->ioc;
2062 u64 coef_seqio, coef_randio, coef_page;
2063 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2064 u64 seek_pages = 0;
2065 u64 cost = 0;
2066
2067 switch (bio_op(bio)) {
2068 case REQ_OP_READ:
2069 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
2070 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
2071 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
2072 break;
2073 case REQ_OP_WRITE:
2074 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
2075 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
2076 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
2077 break;
2078 default:
2079 goto out;
2080 }
2081
2082 if (iocg->cursor) {
2083 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2084 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2085 }
2086
2087 if (!is_merge) {
2088 if (seek_pages > LCOEF_RANDIO_PAGES) {
2089 cost += coef_randio;
2090 } else {
2091 cost += coef_seqio;
2092 }
2093 }
2094 cost += pages * coef_page;
2095out:
2096 *costp = cost;
2097}
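/*
 * Illustrative example with made-up coefficients: if coef_randio is 50,
 * coef_seqio 5 and coef_page 1 (in abstract vtime units), a 64KB read
 * (16 pages) landing farther than LCOEF_RANDIO_PAGES from the cursor costs
 * 50 + 16 * 1 = 66, the same read issued sequentially costs 5 + 16 = 21,
 * and a back-merged bio (is_merge) only pays the 16 units of per-page cost.
 */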
2098
2099static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2100{
2101 u64 cost;
2102
2103 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2104 return cost;
2105}
2106
Tejun Heocd006502020-04-13 12:27:56 -04002107static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2108 u64 *costp)
2109{
2110 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2111
2112 switch (req_op(rq)) {
2113 case REQ_OP_READ:
2114 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2115 break;
2116 case REQ_OP_WRITE:
2117 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2118 break;
2119 default:
2120 *costp = 0;
2121 }
2122}
2123
2124static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2125{
2126 u64 cost;
2127
2128 calc_size_vtime_cost_builtin(rq, ioc, &cost);
2129 return cost;
2130}
2131
Tejun Heo7caa4712019-08-28 15:05:58 -07002132static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2133{
2134 struct blkcg_gq *blkg = bio->bi_blkg;
2135 struct ioc *ioc = rqos_to_ioc(rqos);
2136 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2137 struct ioc_now now;
2138 struct iocg_wait wait;
2139 u32 hw_active, hw_inuse;
2140 u64 abs_cost, cost, vtime;
Tejun Heoda437b92020-09-01 14:52:42 -04002141 bool use_debt, ioc_locked;
2142 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002143
2144 /* bypass IOs if disabled or for root cgroup */
2145 if (!ioc->enabled || !iocg->level)
2146 return;
2147
Tejun Heo7caa4712019-08-28 15:05:58 -07002148 /* calculate the absolute vtime cost */
2149 abs_cost = calc_vtime_cost(bio, iocg, false);
2150 if (!abs_cost)
2151 return;
2152
Tejun Heof1de2432020-09-01 14:52:49 -04002153 if (!iocg_activate(iocg, &now))
2154 return;
2155
Tejun Heo7caa4712019-08-28 15:05:58 -07002156 iocg->cursor = bio_end_sector(bio);
2157
2158 vtime = atomic64_read(&iocg->vtime);
2159 current_hweight(iocg, &hw_active, &hw_inuse);
2160
2161 if (hw_inuse < hw_active &&
Tejun Heo7ca5b2e2020-09-01 14:52:41 -04002162 time_after_eq64(vtime + ioc->margins.min, now.vnow)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07002163 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
2164 iocg->inuse, iocg->weight, hw_inuse, hw_active);
2165 spin_lock_irq(&ioc->lock);
Tejun Heo00410f12020-09-01 14:52:34 -04002166 propagate_weights(iocg, iocg->weight, iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07002167 spin_unlock_irq(&ioc->lock);
2168 current_hweight(iocg, &hw_active, &hw_inuse);
2169 }
2170
2171 cost = abs_cost_to_cost(abs_cost, hw_inuse);
2172
2173 /*
2174 * If no one's waiting and within budget, issue right away. The
2175 * tests are racy but the races aren't systemic - we only miss once
2176 * in a while which is fine.
2177 */
Tejun Heo0b80f982020-05-04 19:27:54 -04002178 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo7caa4712019-08-28 15:05:58 -07002179 time_before_eq64(vtime + cost, now.vnow)) {
Tejun Heo97eb1972020-09-01 14:52:43 -04002180 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo7caa4712019-08-28 15:05:58 -07002181 return;
2182 }
2183
Tejun Heo36a52482019-09-04 12:45:52 -07002184 /*
Tejun Heoda437b92020-09-01 14:52:42 -04002185 * We're over budget. This can be handled in two ways. IOs which may
2186 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2187 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2188 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2189 * whether debt handling is needed and acquire locks accordingly.
Tejun Heo0b80f982020-05-04 19:27:54 -04002190 */
Tejun Heoda437b92020-09-01 14:52:42 -04002191 use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2192 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
Tejun Heo0b80f982020-05-04 19:27:54 -04002193
Tejun Heoda437b92020-09-01 14:52:42 -04002194 iocg_lock(iocg, ioc_locked, &flags);
2195
2196 /*
2197 * @iocg must stay activated for debt and waitq handling. Deactivation
2198 * is synchronized against both ioc->lock and waitq.lock and we won't
2199	 * get deactivated as long as we're waiting or have debt, so we're good
2200 * if we're activated here. In the unlikely cases that we aren't, just
2201 * issue the IO.
2202 */
Tejun Heo0b80f982020-05-04 19:27:54 -04002203 if (unlikely(list_empty(&iocg->active_list))) {
Tejun Heoda437b92020-09-01 14:52:42 -04002204 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo97eb1972020-09-01 14:52:43 -04002205 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002206 return;
2207 }
2208
2209 /*
2210 * We're over budget. If @bio has to be issued regardless, remember
2211 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2212 * off the debt before waking more IOs.
2213 *
Tejun Heo36a52482019-09-04 12:45:52 -07002214 * This way, the debt is continuously paid off each period with the
Tejun Heo0b80f982020-05-04 19:27:54 -04002215 * actual budget available to the cgroup. If we just wound vtime, we
2216 * would incorrectly use the current hw_inuse for the entire amount
2217 * which, for example, can lead to the cgroup staying blocked for a
2218 * long time even with substantially raised hw_inuse.
2219 *
2220 * An iocg with vdebt should stay online so that the timer can keep
2221 * deducting its vdebt and [de]activate use_delay mechanism
2222 * accordingly. We don't want to race against the timer trying to
2223 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2224 * penalizing the cgroup and its descendants.
Tejun Heo36a52482019-09-04 12:45:52 -07002225 */
Tejun Heoda437b92020-09-01 14:52:42 -04002226 if (use_debt) {
Tejun Heo0b80f982020-05-04 19:27:54 -04002227 iocg->abs_vdebt += abs_cost;
Tejun Heo54c52e12020-04-13 12:27:55 -04002228 if (iocg_kick_delay(iocg, &now))
Tejun Heod7bd15a2019-12-16 13:34:00 -08002229 blkcg_schedule_throttle(rqos->q,
2230 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heoda437b92020-09-01 14:52:42 -04002231 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002232 return;
2233 }
2234
2235 /*
2236 * Append self to the waitq and schedule the wakeup timer if we're
2237 * the first waiter. The timer duration is calculated based on the
2238 * current vrate. vtime and hweight changes can make it too short
2239 * or too long. Each wait entry records the absolute cost it's
2240 * waiting for to allow re-evaluation using a custom wait entry.
2241 *
2242 * If too short, the timer simply reschedules itself. If too long,
2243 * the period timer will notice and trigger wakeups.
2244 *
2245 * All waiters are on iocg->waitq and the wait states are
2246 * synchronized using waitq.lock.
2247 */
Tejun Heo7caa4712019-08-28 15:05:58 -07002248 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2249 wait.wait.private = current;
2250 wait.bio = bio;
2251 wait.abs_cost = abs_cost;
2252 wait.committed = false; /* will be set true by waker */
2253
2254 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
Tejun Heoda437b92020-09-01 14:52:42 -04002255 iocg_kick_waitq(iocg, ioc_locked, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07002256
Tejun Heoda437b92020-09-01 14:52:42 -04002257 iocg_unlock(iocg, ioc_locked, &flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002258
2259 while (true) {
2260 set_current_state(TASK_UNINTERRUPTIBLE);
2261 if (wait.committed)
2262 break;
2263 io_schedule();
2264 }
2265
2266 /* waker already committed us, proceed */
2267 finish_wait(&iocg->waitq, &wait.wait);
2268}
2269
2270static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2271 struct bio *bio)
2272{
2273 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heoe1518f62019-09-04 12:45:53 -07002274 struct ioc *ioc = iocg->ioc;
Tejun Heo7caa4712019-08-28 15:05:58 -07002275 sector_t bio_end = bio_end_sector(bio);
Tejun Heoe1518f62019-09-04 12:45:53 -07002276 struct ioc_now now;
Tejun Heo7caa4712019-08-28 15:05:58 -07002277 u32 hw_inuse;
2278 u64 abs_cost, cost;
Tejun Heo0b80f982020-05-04 19:27:54 -04002279 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002280
Tejun Heoe1518f62019-09-04 12:45:53 -07002281 /* bypass if disabled or for root cgroup */
2282 if (!ioc->enabled || !iocg->level)
Tejun Heo7caa4712019-08-28 15:05:58 -07002283 return;
2284
2285 abs_cost = calc_vtime_cost(bio, iocg, true);
2286 if (!abs_cost)
2287 return;
2288
Tejun Heoe1518f62019-09-04 12:45:53 -07002289 ioc_now(ioc, &now);
2290 current_hweight(iocg, NULL, &hw_inuse);
2291 cost = abs_cost_to_cost(abs_cost, hw_inuse);
2292
Tejun Heo7caa4712019-08-28 15:05:58 -07002293 /* update cursor if backmerging into the request at the cursor */
2294 if (blk_rq_pos(rq) < bio_end &&
2295 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2296 iocg->cursor = bio_end;
2297
Tejun Heoe1518f62019-09-04 12:45:53 -07002298 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04002299 * Charge if there's enough vtime budget and the existing request has
2300 * cost assigned.
Tejun Heoe1518f62019-09-04 12:45:53 -07002301 */
2302 if (rq->bio && rq->bio->bi_iocost_cost &&
Tejun Heo0b80f982020-05-04 19:27:54 -04002303 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
Tejun Heo97eb1972020-09-01 14:52:43 -04002304 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002305 return;
2306 }
2307
2308 /*
2309 * Otherwise, account it as debt if @iocg is online, which it should
2310 * be for the vast majority of cases. See debt handling in
2311 * ioc_rqos_throttle() for details.
2312 */
2313 spin_lock_irqsave(&iocg->waitq.lock, flags);
2314 if (likely(!list_empty(&iocg->active_list))) {
2315 iocg->abs_vdebt += abs_cost;
Jens Axboe873f1c82020-05-09 16:13:58 -06002316 iocg_kick_delay(iocg, &now);
Tejun Heo0b80f982020-05-04 19:27:54 -04002317 } else {
Tejun Heo97eb1972020-09-01 14:52:43 -04002318 iocg_commit_bio(iocg, bio, abs_cost, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04002319 }
2320 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002321}
2322
2323static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2324{
2325 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2326
2327 if (iocg && bio->bi_iocost_cost)
2328 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2329}
2330
2331static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2332{
2333 struct ioc *ioc = rqos_to_ioc(rqos);
Tejun Heo5e124f72020-09-01 14:52:33 -04002334 struct ioc_pcpu_stat *ccs;
Tejun Heocd006502020-04-13 12:27:56 -04002335 u64 on_q_ns, rq_wait_ns, size_nsec;
Tejun Heo7caa4712019-08-28 15:05:58 -07002336 int pidx, rw;
2337
2338 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2339 return;
2340
2341 switch (req_op(rq) & REQ_OP_MASK) {
2342 case REQ_OP_READ:
2343 pidx = QOS_RLAT;
2344 rw = READ;
2345 break;
2346 case REQ_OP_WRITE:
2347 pidx = QOS_WLAT;
2348 rw = WRITE;
2349 break;
2350 default:
2351 return;
2352 }
2353
2354 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2355 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
Tejun Heocd006502020-04-13 12:27:56 -04002356 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
Tejun Heo7caa4712019-08-28 15:05:58 -07002357
Tejun Heo5e124f72020-09-01 14:52:33 -04002358 ccs = get_cpu_ptr(ioc->pcpu_stat);
2359
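	/*
	 * Illustrative example: a read that spent 900us between allocation and
	 * completion with a 150us size cost has 750us of queueing overhead;
	 * against a (made up) 1000us rlat target it counts as met, against a
	 * 500us target it would count as missed.
	 */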
Tejun Heocd006502020-04-13 12:27:56 -04002360 if (on_q_ns <= size_nsec ||
2361 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
Tejun Heo5e124f72020-09-01 14:52:33 -04002362 local_inc(&ccs->missed[rw].nr_met);
Tejun Heo7caa4712019-08-28 15:05:58 -07002363 else
Tejun Heo5e124f72020-09-01 14:52:33 -04002364 local_inc(&ccs->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07002365
Tejun Heo5e124f72020-09-01 14:52:33 -04002366 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2367
2368 put_cpu_ptr(ccs);
Tejun Heo7caa4712019-08-28 15:05:58 -07002369}
2370
2371static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2372{
2373 struct ioc *ioc = rqos_to_ioc(rqos);
2374
2375 spin_lock_irq(&ioc->lock);
2376 ioc_refresh_params(ioc, false);
2377 spin_unlock_irq(&ioc->lock);
2378}
2379
2380static void ioc_rqos_exit(struct rq_qos *rqos)
2381{
2382 struct ioc *ioc = rqos_to_ioc(rqos);
2383
2384 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2385
2386 spin_lock_irq(&ioc->lock);
2387 ioc->running = IOC_STOP;
2388 spin_unlock_irq(&ioc->lock);
2389
2390 del_timer_sync(&ioc->timer);
2391 free_percpu(ioc->pcpu_stat);
2392 kfree(ioc);
2393}
2394
2395static struct rq_qos_ops ioc_rqos_ops = {
2396 .throttle = ioc_rqos_throttle,
2397 .merge = ioc_rqos_merge,
2398 .done_bio = ioc_rqos_done_bio,
2399 .done = ioc_rqos_done,
2400 .queue_depth_changed = ioc_rqos_queue_depth_changed,
2401 .exit = ioc_rqos_exit,
2402};
2403
2404static int blk_iocost_init(struct request_queue *q)
2405{
2406 struct ioc *ioc;
2407 struct rq_qos *rqos;
Tejun Heo5e124f72020-09-01 14:52:33 -04002408 int i, cpu, ret;
Tejun Heo7caa4712019-08-28 15:05:58 -07002409
2410 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2411 if (!ioc)
2412 return -ENOMEM;
2413
2414 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2415 if (!ioc->pcpu_stat) {
2416 kfree(ioc);
2417 return -ENOMEM;
2418 }
2419
Tejun Heo5e124f72020-09-01 14:52:33 -04002420 for_each_possible_cpu(cpu) {
2421 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2422
2423 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2424 local_set(&ccs->missed[i].nr_met, 0);
2425 local_set(&ccs->missed[i].nr_missed, 0);
2426 }
2427 local64_set(&ccs->rq_wait_ns, 0);
2428 }
2429
Tejun Heo7caa4712019-08-28 15:05:58 -07002430 rqos = &ioc->rqos;
2431 rqos->id = RQ_QOS_COST;
2432 rqos->ops = &ioc_rqos_ops;
2433 rqos->q = q;
2434
2435 spin_lock_init(&ioc->lock);
2436 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2437 INIT_LIST_HEAD(&ioc->active_iocgs);
2438
2439 ioc->running = IOC_IDLE;
2440 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +02002441 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002442 ioc->period_at = ktime_to_us(ktime_get());
2443 atomic64_set(&ioc->cur_period, 0);
2444 atomic_set(&ioc->hweight_gen, 0);
2445
2446 spin_lock_irq(&ioc->lock);
2447 ioc->autop_idx = AUTOP_INVALID;
2448 ioc_refresh_params(ioc, true);
2449 spin_unlock_irq(&ioc->lock);
2450
2451 rq_qos_add(q, rqos);
2452 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2453 if (ret) {
2454 rq_qos_del(q, rqos);
Tejun Heo3532e722019-08-29 08:53:06 -07002455 free_percpu(ioc->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002456 kfree(ioc);
2457 return ret;
2458 }
2459 return 0;
2460}
2461
2462static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2463{
2464 struct ioc_cgrp *iocc;
2465
2466 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heoe916ad22019-08-30 06:10:58 -07002467 if (!iocc)
2468 return NULL;
Tejun Heo7caa4712019-08-28 15:05:58 -07002469
Tejun Heobd0adb92020-09-01 14:52:39 -04002470 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002471 return &iocc->cpd;
2472}
2473
2474static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2475{
2476 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2477}
2478
2479static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2480 struct blkcg *blkcg)
2481{
2482 int levels = blkcg->css.cgroup->level + 1;
2483 struct ioc_gq *iocg;
2484
Gustavo A. R. Silvaf61d6e22020-06-19 18:08:30 -05002485 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
Tejun Heo7caa4712019-08-28 15:05:58 -07002486 if (!iocg)
2487 return NULL;
2488
Tejun Heo97eb1972020-09-01 14:52:43 -04002489 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2490 if (!iocg->pcpu_stat) {
2491 kfree(iocg);
2492 return NULL;
2493 }
2494
Tejun Heo7caa4712019-08-28 15:05:58 -07002495 return &iocg->pd;
2496}
2497
2498static void ioc_pd_init(struct blkg_policy_data *pd)
2499{
2500 struct ioc_gq *iocg = pd_to_iocg(pd);
2501 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2502 struct ioc *ioc = q_to_ioc(blkg->q);
2503 struct ioc_now now;
2504 struct blkcg_gq *tblkg;
2505 unsigned long flags;
2506
2507 ioc_now(ioc, &now);
2508
2509 iocg->ioc = ioc;
2510 atomic64_set(&iocg->vtime, now.vnow);
2511 atomic64_set(&iocg->done_vtime, now.vnow);
2512 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2513 INIT_LIST_HEAD(&iocg->active_list);
Tejun Heo97eb1972020-09-01 14:52:43 -04002514 INIT_LIST_HEAD(&iocg->walk_list);
Tejun Heo8692d2d2020-09-01 14:52:45 -04002515 INIT_LIST_HEAD(&iocg->surplus_list);
Tejun Heofe20cdb52020-09-01 14:52:38 -04002516 iocg->hweight_active = WEIGHT_ONE;
2517 iocg->hweight_inuse = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002518
2519 init_waitqueue_head(&iocg->waitq);
2520 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2521 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2522 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2523 iocg->delay_timer.function = iocg_delay_timer_fn;
2524
2525 iocg->level = blkg->blkcg->css.cgroup->level;
2526
2527 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2528 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2529 iocg->ancestors[tiocg->level] = tiocg;
2530 }
2531
2532 spin_lock_irqsave(&ioc->lock, flags);
2533 weight_updated(iocg);
2534 spin_unlock_irqrestore(&ioc->lock, flags);
2535}
2536
2537static void ioc_pd_free(struct blkg_policy_data *pd)
2538{
2539 struct ioc_gq *iocg = pd_to_iocg(pd);
2540 struct ioc *ioc = iocg->ioc;
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002541 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002542
2543 if (ioc) {
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002544 spin_lock_irqsave(&ioc->lock, flags);
Tejun Heo97eb1972020-09-01 14:52:43 -04002545
Tejun Heo7caa4712019-08-28 15:05:58 -07002546 if (!list_empty(&iocg->active_list)) {
Tejun Heo00410f12020-09-01 14:52:34 -04002547 propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07002548 list_del_init(&iocg->active_list);
2549 }
Tejun Heo97eb1972020-09-01 14:52:43 -04002550
2551 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
Tejun Heo8692d2d2020-09-01 14:52:45 -04002552 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
Tejun Heo97eb1972020-09-01 14:52:43 -04002553
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002554 spin_unlock_irqrestore(&ioc->lock, flags);
Tejun Heoe036c4c2019-09-10 09:15:25 -07002555
2556 hrtimer_cancel(&iocg->waitq_timer);
2557 hrtimer_cancel(&iocg->delay_timer);
Tejun Heo7caa4712019-08-28 15:05:58 -07002558 }
Tejun Heo97eb1972020-09-01 14:52:43 -04002559 free_percpu(iocg->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002560 kfree(iocg);
2561}
2562
Tejun Heo97eb1972020-09-01 14:52:43 -04002563static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2564{
2565 struct ioc_gq *iocg = pd_to_iocg(pd);
2566 struct ioc *ioc = iocg->ioc;
2567 size_t pos = 0;
2568
2569 if (!ioc->enabled)
2570 return 0;
2571
2572 if (iocg->level == 0) {
2573 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2574 atomic64_read(&ioc->vtime_rate) * 10000,
2575 VTIME_PER_USEC);
2576 pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2577 vp10k / 100, vp10k % 100);
2578 }
2579
2580 pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2581 iocg->last_stat.usage_us);
2582
2583 return pos;
2584}
2585
Tejun Heo7caa4712019-08-28 15:05:58 -07002586static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2587 int off)
2588{
2589 const char *dname = blkg_dev_name(pd->blkg);
2590 struct ioc_gq *iocg = pd_to_iocg(pd);
2591
2592 if (dname && iocg->cfg_weight)
Tejun Heobd0adb92020-09-01 14:52:39 -04002593 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002594 return 0;
2595}
2596
2597
2598static int ioc_weight_show(struct seq_file *sf, void *v)
2599{
2600 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2601 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2602
Tejun Heobd0adb92020-09-01 14:52:39 -04002603 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002604 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2605 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2606 return 0;
2607}
2608
2609static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2610 size_t nbytes, loff_t off)
2611{
2612 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2613 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2614 struct blkg_conf_ctx ctx;
2615 struct ioc_gq *iocg;
2616 u32 v;
2617 int ret;
2618
2619 if (!strchr(buf, ':')) {
2620 struct blkcg_gq *blkg;
2621
2622 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2623 return -EINVAL;
2624
2625 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2626 return -EINVAL;
2627
2628 spin_lock(&blkcg->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002629 iocc->dfl_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002630 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2631 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2632
2633 if (iocg) {
2634 spin_lock_irq(&iocg->ioc->lock);
2635 weight_updated(iocg);
2636 spin_unlock_irq(&iocg->ioc->lock);
2637 }
2638 }
2639 spin_unlock(&blkcg->lock);
2640
2641 return nbytes;
2642 }
2643
2644 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2645 if (ret)
2646 return ret;
2647
2648 iocg = blkg_to_iocg(ctx.blkg);
2649
2650 if (!strncmp(ctx.body, "default", 7)) {
2651 v = 0;
2652 } else {
2653 if (!sscanf(ctx.body, "%u", &v))
2654 goto einval;
2655 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2656 goto einval;
2657 }
2658
Dan Carpenter41591a52019-10-31 13:53:41 +03002659 spin_lock(&iocg->ioc->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002660 iocg->cfg_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002661 weight_updated(iocg);
Dan Carpenter41591a52019-10-31 13:53:41 +03002662 spin_unlock(&iocg->ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002663
2664 blkg_conf_finish(&ctx);
2665 return nbytes;
2666
2667einval:
2668 blkg_conf_finish(&ctx);
2669 return -EINVAL;
2670}
2671
2672static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2673 int off)
2674{
2675 const char *dname = blkg_dev_name(pd->blkg);
2676 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2677
2678 if (!dname)
2679 return 0;
2680
2681 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2682 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2683 ioc->params.qos[QOS_RPPM] / 10000,
2684 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2685 ioc->params.qos[QOS_RLAT],
2686 ioc->params.qos[QOS_WPPM] / 10000,
2687 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2688 ioc->params.qos[QOS_WLAT],
2689 ioc->params.qos[QOS_MIN] / 10000,
2690 ioc->params.qos[QOS_MIN] % 10000 / 100,
2691 ioc->params.qos[QOS_MAX] / 10000,
2692 ioc->params.qos[QOS_MAX] % 10000 / 100);
2693 return 0;
2694}
2695
2696static int ioc_qos_show(struct seq_file *sf, void *v)
2697{
2698 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2699
2700 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2701 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2702 return 0;
2703}
2704
2705static const match_table_t qos_ctrl_tokens = {
2706 { QOS_ENABLE, "enable=%u" },
2707 { QOS_CTRL, "ctrl=%s" },
2708 { NR_QOS_CTRL_PARAMS, NULL },
2709};
2710
2711static const match_table_t qos_tokens = {
2712 { QOS_RPPM, "rpct=%s" },
2713 { QOS_RLAT, "rlat=%u" },
2714 { QOS_WPPM, "wpct=%s" },
2715 { QOS_WLAT, "wlat=%u" },
2716 { QOS_MIN, "min=%s" },
2717 { QOS_MAX, "max=%s" },
2718 { NR_QOS_PARAMS, NULL },
2719};
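/*
 * Illustrative example of the accepted syntax, written to the root cgroup's
 * io.cost.qos (device number and values are made up):
 *
 *   echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=75000 wpct=95.00 wlat=150000 min=50.00 max=150.00" > io.cost.qos
 *
 * rpct/wpct are completion latency percentile targets in percent, rlat/wlat
 * the matching latency thresholds in microseconds, and min/max bound vrate
 * as a percentage of the estimated device speed.
 */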
2720
2721static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2722 size_t nbytes, loff_t off)
2723{
2724 struct gendisk *disk;
2725 struct ioc *ioc;
2726 u32 qos[NR_QOS_PARAMS];
2727 bool enable, user;
2728 char *p;
2729 int ret;
2730
2731 disk = blkcg_conf_get_disk(&input);
2732 if (IS_ERR(disk))
2733 return PTR_ERR(disk);
2734
2735 ioc = q_to_ioc(disk->queue);
2736 if (!ioc) {
2737 ret = blk_iocost_init(disk->queue);
2738 if (ret)
2739 goto err;
2740 ioc = q_to_ioc(disk->queue);
2741 }
2742
2743 spin_lock_irq(&ioc->lock);
2744 memcpy(qos, ioc->params.qos, sizeof(qos));
2745 enable = ioc->enabled;
2746 user = ioc->user_qos_params;
2747 spin_unlock_irq(&ioc->lock);
2748
2749 while ((p = strsep(&input, " \t\n"))) {
2750 substring_t args[MAX_OPT_ARGS];
2751 char buf[32];
2752 int tok;
2753 s64 v;
2754
2755 if (!*p)
2756 continue;
2757
2758 switch (match_token(p, qos_ctrl_tokens, args)) {
2759 case QOS_ENABLE:
2760 match_u64(&args[0], &v);
2761 enable = v;
2762 continue;
2763 case QOS_CTRL:
2764 match_strlcpy(buf, &args[0], sizeof(buf));
2765 if (!strcmp(buf, "auto"))
2766 user = false;
2767 else if (!strcmp(buf, "user"))
2768 user = true;
2769 else
2770 goto einval;
2771 continue;
2772 }
2773
2774 tok = match_token(p, qos_tokens, args);
2775 switch (tok) {
2776 case QOS_RPPM:
2777 case QOS_WPPM:
2778 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2779 sizeof(buf))
2780 goto einval;
2781 if (cgroup_parse_float(buf, 2, &v))
2782 goto einval;
2783 if (v < 0 || v > 10000)
2784 goto einval;
2785 qos[tok] = v * 100;
2786 break;
2787 case QOS_RLAT:
2788 case QOS_WLAT:
2789 if (match_u64(&args[0], &v))
2790 goto einval;
2791 qos[tok] = v;
2792 break;
2793 case QOS_MIN:
2794 case QOS_MAX:
2795 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2796 sizeof(buf))
2797 goto einval;
2798 if (cgroup_parse_float(buf, 2, &v))
2799 goto einval;
2800 if (v < 0)
2801 goto einval;
2802 qos[tok] = clamp_t(s64, v * 100,
2803 VRATE_MIN_PPM, VRATE_MAX_PPM);
2804 break;
2805 default:
2806 goto einval;
2807 }
2808 user = true;
2809 }
2810
2811 if (qos[QOS_MIN] > qos[QOS_MAX])
2812 goto einval;
2813
2814 spin_lock_irq(&ioc->lock);
2815
2816 if (enable) {
Tejun Heocd006502020-04-13 12:27:56 -04002817 blk_stat_enable_accounting(ioc->rqos.q);
Tejun Heo7caa4712019-08-28 15:05:58 -07002818 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2819 ioc->enabled = true;
2820 } else {
2821 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2822 ioc->enabled = false;
2823 }
2824
2825 if (user) {
2826 memcpy(ioc->params.qos, qos, sizeof(qos));
2827 ioc->user_qos_params = true;
2828 } else {
2829 ioc->user_qos_params = false;
2830 }
2831
2832 ioc_refresh_params(ioc, true);
2833 spin_unlock_irq(&ioc->lock);
2834
2835 put_disk_and_module(disk);
2836 return nbytes;
2837einval:
2838 ret = -EINVAL;
2839err:
2840 put_disk_and_module(disk);
2841 return ret;
2842}
2843
2844static u64 ioc_cost_model_prfill(struct seq_file *sf,
2845 struct blkg_policy_data *pd, int off)
2846{
2847 const char *dname = blkg_dev_name(pd->blkg);
2848 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2849 u64 *u = ioc->params.i_lcoefs;
2850
2851 if (!dname)
2852 return 0;
2853
2854 seq_printf(sf, "%s ctrl=%s model=linear "
2855 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2856 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2857 dname, ioc->user_cost_model ? "user" : "auto",
2858 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2859 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2860 return 0;
2861}
2862
2863static int ioc_cost_model_show(struct seq_file *sf, void *v)
2864{
2865 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2866
2867 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2868 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2869 return 0;
2870}
2871
2872static const match_table_t cost_ctrl_tokens = {
2873 { COST_CTRL, "ctrl=%s" },
2874 { COST_MODEL, "model=%s" },
2875 { NR_COST_CTRL_PARAMS, NULL },
2876};
2877
2878static const match_table_t i_lcoef_tokens = {
2879 { I_LCOEF_RBPS, "rbps=%u" },
2880 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2881 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2882 { I_LCOEF_WBPS, "wbps=%u" },
2883 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2884 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2885 { NR_I_LCOEFS, NULL },
2886};
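/*
 * Illustrative example of the accepted syntax, written to the root cgroup's
 * io.cost.model (device number and coefficients are made up):
 *
 *   echo "8:16 ctrl=user model=linear rbps=2706339840 rseqiops=89698 rrandiops=110036 wbps=1063126016 wseqiops=135560 wrandiops=130734" > io.cost.model
 *
 * The rbps/wbps and *iops values describe the device's sequential bandwidth
 * and sequential/random IOPS; ioc_refresh_params() converts them into the
 * per-IO and per-page lcoefs consumed by calc_vtime_cost_builtin() above.
 */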
2887
2888static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2889 size_t nbytes, loff_t off)
2890{
2891 struct gendisk *disk;
2892 struct ioc *ioc;
2893 u64 u[NR_I_LCOEFS];
2894 bool user;
2895 char *p;
2896 int ret;
2897
2898 disk = blkcg_conf_get_disk(&input);
2899 if (IS_ERR(disk))
2900 return PTR_ERR(disk);
2901
2902 ioc = q_to_ioc(disk->queue);
2903 if (!ioc) {
2904 ret = blk_iocost_init(disk->queue);
2905 if (ret)
2906 goto err;
2907 ioc = q_to_ioc(disk->queue);
2908 }
2909
2910 spin_lock_irq(&ioc->lock);
2911 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2912 user = ioc->user_cost_model;
2913 spin_unlock_irq(&ioc->lock);
2914
2915 while ((p = strsep(&input, " \t\n"))) {
2916 substring_t args[MAX_OPT_ARGS];
2917 char buf[32];
2918 int tok;
2919 u64 v;
2920
2921 if (!*p)
2922 continue;
2923
2924 switch (match_token(p, cost_ctrl_tokens, args)) {
2925 case COST_CTRL:
2926 match_strlcpy(buf, &args[0], sizeof(buf));
2927 if (!strcmp(buf, "auto"))
2928 user = false;
2929 else if (!strcmp(buf, "user"))
2930 user = true;
2931 else
2932 goto einval;
2933 continue;
2934 case COST_MODEL:
2935 match_strlcpy(buf, &args[0], sizeof(buf));
2936 if (strcmp(buf, "linear"))
2937 goto einval;
2938 continue;
2939 }
2940
2941 tok = match_token(p, i_lcoef_tokens, args);
2942 if (tok == NR_I_LCOEFS)
2943 goto einval;
2944 if (match_u64(&args[0], &v))
2945 goto einval;
2946 u[tok] = v;
2947 user = true;
2948 }
2949
2950 spin_lock_irq(&ioc->lock);
2951 if (user) {
2952 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2953 ioc->user_cost_model = true;
2954 } else {
2955 ioc->user_cost_model = false;
2956 }
2957 ioc_refresh_params(ioc, true);
2958 spin_unlock_irq(&ioc->lock);
2959
2960 put_disk_and_module(disk);
2961 return nbytes;
2962
2963einval:
2964 ret = -EINVAL;
2965err:
2966 put_disk_and_module(disk);
2967 return ret;
2968}
2969
2970static struct cftype ioc_files[] = {
2971 {
2972 .name = "weight",
2973 .flags = CFTYPE_NOT_ON_ROOT,
2974 .seq_show = ioc_weight_show,
2975 .write = ioc_weight_write,
2976 },
2977 {
2978 .name = "cost.qos",
2979 .flags = CFTYPE_ONLY_ON_ROOT,
2980 .seq_show = ioc_qos_show,
2981 .write = ioc_qos_write,
2982 },
2983 {
2984 .name = "cost.model",
2985 .flags = CFTYPE_ONLY_ON_ROOT,
2986 .seq_show = ioc_cost_model_show,
2987 .write = ioc_cost_model_write,
2988 },
2989 {}
2990};
2991
2992static struct blkcg_policy blkcg_policy_iocost = {
2993 .dfl_cftypes = ioc_files,
2994 .cpd_alloc_fn = ioc_cpd_alloc,
2995 .cpd_free_fn = ioc_cpd_free,
2996 .pd_alloc_fn = ioc_pd_alloc,
2997 .pd_init_fn = ioc_pd_init,
2998 .pd_free_fn = ioc_pd_free,
Tejun Heo97eb1972020-09-01 14:52:43 -04002999 .pd_stat_fn = ioc_pd_stat,
Tejun Heo7caa4712019-08-28 15:05:58 -07003000};
3001
3002static int __init ioc_init(void)
3003{
3004 return blkcg_policy_register(&blkcg_policy_iocost);
3005}
3006
3007static void __exit ioc_exit(void)
3008{
3009 return blkcg_policy_unregister(&blkcg_policy_iocost);
3010}
3011
3012module_init(ioc_init);
3013module_exit(ioc_exit);