/* SPDX-License-Identifier: GPL-2.0
2 *
3 * IO cost model based controller.
4 *
5 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6 * Copyright (C) 2019 Andy Newell <newella@fb.com>
7 * Copyright (C) 2019 Facebook
8 *
9 * One challenge of controlling IO resources is the lack of trivially
10 * observable cost metric. This is distinguished from CPU and memory where
11 * wallclock time and the number of bytes can serve as accurate enough
12 * approximations.
13 *
14 * Bandwidth and iops are the most commonly used metrics for IO devices but
15 * depending on the type and specifics of the device, different IO patterns
16 * easily lead to multiple orders of magnitude variations rendering them
17 * useless for the purpose of IO capacity distribution. While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
19 * non-queued rotational devices, this is no longer viable with modern
20 * devices, even the rotational ones.
21 *
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
24 * dominates while a contiguous transfer contributes a smaller amount
25 * proportional to the size. If we can characterize at least the relative
26 * costs of these different types of IOs, it should be possible to
27 * implement a reasonable work-conserving proportional IO resource
28 * distribution.
29 *
30 * 1. IO Cost Model
31 *
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
35 * be able to process ~100 of those IOs in a second.
36 *
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
40 * added. While simple, this model captures the operational
41 * characteristics of a wide varienty of devices well enough. Default
42 * paramters for several different classes of devices are provided and the
43 * parameters can be configured from userspace via
44 * /sys/fs/cgroup/io.cost.model.
45 *
46 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47 * device-specific coefficients.
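 *
 * For example (illustrative, using this file's SSD defaults and an
 * assumed MAJ:MIN of 8:16), a hand-tuned linear model could be installed
 * by writing a line like the following (as a single line) to
 * /sys/fs/cgroup/io.cost.model:
 *
 *  8:16 ctrl=user model=linear rbps=488636629 rseqiops=8932
 *      rrandiops=8518 wbps=427891549 wseqiops=28755 wrandiops=21940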
48 *
49 * 2. Control Strategy
50 *
51 * The device virtual time (vtime) is used as the primary control metric.
52 * The control strategy is composed of the following three parts.
53 *
54 * 2-1. Vtime Distribution
55 *
56 * When a cgroup becomes active in terms of IOs, its hierarchical share is
57 * calculated. Please consider the following hierarchy where the numbers
58 * inside parentheses denote the configured weights.
59 *
60 * root
61 * / \
62 * A (w:100) B (w:300)
63 * / \
64 * A0 (w:100) A1 (w:100)
65 *
66 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69 * 12.5% each. The distribution mechanism only cares about these flattened
70 * shares. They're called hweights (hierarchical weights) and always add
 * up to 1 (WEIGHT_ONE).
 *
73 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75 * against the device vtime - an IO which takes 10ms on the underlying
76 * device is considered to take 80ms on A0.
77 *
78 * This constitutes the basis of IO capacity distribution. Each cgroup's
79 * vtime is running at a rate determined by its hweight. A cgroup tracks
80 * the vtime consumed by past IOs and can issue a new IO iff doing so
81 * wouldn't outrun the current device vtime. Otherwise, the IO is
82 * suspended until the vtime has progressed enough to cover it.
83 *
84 * 2-2. Vrate Adjustment
85 *
86 * It's unrealistic to expect the cost model to be perfect. There are too
87 * many devices and even on the same device the overall performance
88 * fluctuates depending on numerous factors such as IO mixture and device
89 * internal garbage collection. The controller needs to adapt dynamically.
90 *
91 * This is achieved by adjusting the overall IO rate according to how busy
92 * the device is. If the device becomes overloaded, we're sending down too
93 * many IOs and should generally slow down. If there are waiting issuers
94 * but the device isn't saturated, we're issuing too few and should
95 * generally speed up.
96 *
97 * To slow down, we lower the vrate - the rate at which the device vtime
98 * passes compared to the wall clock. For example, if the vtime is running
99 * at the vrate of 75%, all cgroups added up would only be able to issue
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
101 *
 * Device busyness is determined using two criteria - rq wait and
103 * completion latencies.
104 *
105 * When a device gets saturated, the on-device and then the request queues
106 * fill up and a bio which is ready to be issued has to wait for a request
107 * to become available. When this delay becomes noticeable, it's a clear
108 * indication that the device is saturated and we lower the vrate. This
109 * saturation signal is fairly conservative as it only triggers when both
110 * hardware and software queues are filled up, and is used as the default
111 * busy signal.
112 *
113 * As devices can have deep queues and be unfair in how the queued commands
 * are executed, solely depending on rq wait may not result in satisfactory
115 * control quality. For a better control quality, completion latency QoS
116 * parameters can be configured so that the device is considered saturated
117 * if N'th percentile completion latency rises above the set point.
118 *
119 * The completion latency requirements are a function of both the
120 * underlying device characteristics and the desired IO latency quality of
121 * service. There is an inherent trade-off - the tighter the latency QoS,
 * the higher the bandwidth loss. Latency QoS is disabled by default
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
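 *
 * For example (illustrative syntax), a 95th percentile completion
 * latency target of 25ms for both reads and writes on device 8:16 could
 * be expressed by writing the following single line to
 * /sys/fs/cgroup/io.cost.qos:
 *
 *  8:16 enable=1 ctrl=user rpct=95.00 rlat=25000 wpct=95.00 wlat=25000
 *      min=50.00 max=150.00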
124 *
125 * 2-3. Work Conservation
126 *
127 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
128 * periodically while B is sending out enough parallel IOs to saturate the
129 * device on its own. Let's say A's usage amounts to 100ms worth of IO
130 * cost per second, i.e., 10% of the device capacity. The naive
131 * distribution of half and half would lead to 60% utilization of the
132 * device, a significant reduction in the total amount of work done
133 * compared to free-for-all competition. This is too high a cost to pay
134 * for IO control.
135 *
136 * To conserve the total amount of work done, we keep track of how much
137 * each active cgroup is actually using and yield part of its weight if
138 * there are other cgroups which can make use of it. In the above case,
139 * A's weight will be lowered so that it hovers above the actual usage and
140 * B would be able to use the rest.
141 *
142 * As we don't want to penalize a cgroup for donating its weight, the
143 * surplus weight adjustment factors in a margin and has an immediate
144 * snapback mechanism in case the cgroup needs more IO vtime for itself.
145 *
146 * Note that adjusting down surplus weights has the same effects as
147 * accelerating vtime for other cgroups and work conservation can also be
 * implemented by adjusting vrate dynamically. However, working out who
 * can donate how much and who should take back how much requires hweight
 * propagation anyway, so it is easier to implement and understand as a
 * separate mechanism.
 *
153 * 3. Monitoring
154 *
155 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn-based monitoring script -
 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
 * https://github.com/osandov/drgn. The output looks like the following.
159 *
160 * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 * active weight hweight% inflt% dbt delay usages%
162 * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
163 * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
 *
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
 * - delay : Deferred issuer delay induction level and duration
172 * - usages : Usage history
 */
174
175#include <linux/kernel.h>
176#include <linux/module.h>
177#include <linux/timer.h>
178#include <linux/time64.h>
179#include <linux/parser.h>
180#include <linux/sched/signal.h>
181#include <linux/blk-cgroup.h>
Tejun Heo5e124f72020-09-01 14:52:33 -0400182#include <asm/local.h>
183#include <asm/local64.h>
Tejun Heo7caa4712019-08-28 15:05:58 -0700184#include "blk-rq-qos.h"
185#include "blk-stat.h"
186#include "blk-wbt.h"
187
188#ifdef CONFIG_TRACEPOINTS
189
190/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191#define TRACE_IOCG_PATH_LEN 1024
192static DEFINE_SPINLOCK(trace_iocg_path_lock);
193static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194
195#define TRACE_IOCG_PATH(type, iocg, ...) \
196 do { \
197 unsigned long flags; \
198 if (trace_iocost_##type##_enabled()) { \
199 spin_lock_irqsave(&trace_iocg_path_lock, flags); \
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
201 trace_iocg_path, TRACE_IOCG_PATH_LEN); \
202 trace_iocost_##type(iocg, trace_iocg_path, \
203 ##__VA_ARGS__); \
204 spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
205 } \
206 } while (0)
207
#else /* CONFIG_TRACEPOINTS */
#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
#endif /* CONFIG_TRACEPOINTS */
211
212enum {
213 MILLION = 1000000,
214
215 /* timer period is calculated from latency requirements, bound it */
216 MIN_PERIOD = USEC_PER_MSEC,
217 MAX_PERIOD = USEC_PER_SEC,
218
219 /*
220 * A cgroup's vtime can run 50% behind the device vtime, which
221 * serves as its IO credit buffer. Surplus weight adjustment is
222 * immediately canceled if the vtime margin runs below 10%.
223 */
224 MARGIN_PCT = 50,
225 INUSE_MARGIN_PCT = 10,
226
227 /* Have some play in waitq timer operations */
228 WAITQ_TIMER_MARGIN_PCT = 5,
229
230 /*
231 * vtime can wrap well within a reasonable uptime when vrate is
232 * consistently raised. Don't trust recorded cgroup vtime if the
233 * period counter indicates that it's older than 5mins.
234 */
235 VTIME_VALID_DUR = 300 * USEC_PER_SEC,
236
237 /*
238 * Remember the past three non-zero usages and use the max for
239 * surplus calculation. Three slots guarantee that we remember one
240 * full period usage from the last active stretch even after
241 * partial deactivation and re-activation periods. Don't start
242 * giving away weight before collecting two data points to prevent
243 * hweight adjustments based on one partial activation period.
244 */
245 NR_USAGE_SLOTS = 3,
246 MIN_VALID_USAGES = 2,
247
248 /* 1/64k is granular enough and can easily be handled w/ u32 */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400249 WEIGHT_ONE = 1 << 16,
Tejun Heo7caa4712019-08-28 15:05:58 -0700250
251 /*
252 * As vtime is used to calculate the cost of each IO, it needs to
253 * be fairly high precision. For example, it should be able to
254 * represent the cost of a single page worth of discard with
	 * sufficient accuracy. At the same time, it should be able to
256 * represent reasonably long enough durations to be useful and
257 * convenient during operation.
258 *
259 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
260 * granularity and days of wrap-around time even at extreme vrates.
261 */
262 VTIME_PER_SEC_SHIFT = 37,
263 VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
264 VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
Tejun Heocd006502020-04-13 12:27:56 -0400265 VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
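	/*
	 * Illustrative check of the above: 2^37 / 10^9 is ~137 vtime
	 * units per nanosecond, and at 100% vrate a u64 vtime wraps only
	 * after 2^(64 - 37) seconds, roughly four years.
	 */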
Tejun Heo7caa4712019-08-28 15:05:58 -0700266
267 /* bound vrate adjustments within two orders of magnitude */
268 VRATE_MIN_PPM = 10000, /* 1% */
269 VRATE_MAX_PPM = 100000000, /* 10000% */
270
271 VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 VRATE_CLAMP_ADJ_PCT = 4,
273
274 /* if IOs end up waiting for requests, issue less */
275 RQ_WAIT_BUSY_PCT = 5,
276
	/* unbusy hysteresis */
278 UNBUSY_THR_PCT = 75,
279
280 /* don't let cmds which take a very long time pin lagging for too long */
281 MAX_LAGGING_PERIODS = 10,
282
283 /*
284 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 * donate the surplus.
286 */
287 SURPLUS_SCALE_PCT = 125, /* * 125% */
Tejun Heofe20cdb52020-09-01 14:52:38 -0400288 SURPLUS_SCALE_ABS = WEIGHT_ONE / 50, /* + 2% */
289 SURPLUS_MIN_ADJ_DELTA = WEIGHT_ONE / 33, /* 3% */
Tejun Heo7caa4712019-08-28 15:05:58 -0700290
291 /* switch iff the conditions are met for longer than this */
292 AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
293
294 /*
295 * Count IO size in 4k pages. The 12bit shift helps keeping
296 * size-proportional components of cost calculation in closer
297 * numbers of digits to per-IO cost components.
298 */
299 IOC_PAGE_SHIFT = 12,
300 IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
301 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
302
303 /* if apart further than 16M, consider randio for linear model */
304 LCOEF_RANDIO_PAGES = 4096,
305};
306
307enum ioc_running {
308 IOC_IDLE,
309 IOC_RUNNING,
310 IOC_STOP,
311};
312
313/* io.cost.qos controls including per-dev enable of the whole controller */
314enum {
315 QOS_ENABLE,
316 QOS_CTRL,
317 NR_QOS_CTRL_PARAMS,
318};
319
320/* io.cost.qos params */
321enum {
322 QOS_RPPM,
323 QOS_RLAT,
324 QOS_WPPM,
325 QOS_WLAT,
326 QOS_MIN,
327 QOS_MAX,
328 NR_QOS_PARAMS,
329};
330
331/* io.cost.model controls */
332enum {
333 COST_CTRL,
334 COST_MODEL,
335 NR_COST_CTRL_PARAMS,
336};
337
338/* builtin linear cost model coefficients */
339enum {
340 I_LCOEF_RBPS,
341 I_LCOEF_RSEQIOPS,
342 I_LCOEF_RRANDIOPS,
343 I_LCOEF_WBPS,
344 I_LCOEF_WSEQIOPS,
345 I_LCOEF_WRANDIOPS,
346 NR_I_LCOEFS,
347};
348
349enum {
350 LCOEF_RPAGE,
351 LCOEF_RSEQIO,
352 LCOEF_RRANDIO,
353 LCOEF_WPAGE,
354 LCOEF_WSEQIO,
355 LCOEF_WRANDIO,
356 NR_LCOEFS,
357};
358
359enum {
360 AUTOP_INVALID,
361 AUTOP_HDD,
362 AUTOP_SSD_QD1,
363 AUTOP_SSD_DFL,
364 AUTOP_SSD_FAST,
365};
366
367struct ioc_gq;
368
369struct ioc_params {
370 u32 qos[NR_QOS_PARAMS];
371 u64 i_lcoefs[NR_I_LCOEFS];
372 u64 lcoefs[NR_LCOEFS];
373 u32 too_fast_vrate_pct;
374 u32 too_slow_vrate_pct;
375};
376
377struct ioc_missed {
Tejun Heo5e124f72020-09-01 14:52:33 -0400378 local_t nr_met;
379 local_t nr_missed;
Tejun Heo7caa4712019-08-28 15:05:58 -0700380 u32 last_met;
381 u32 last_missed;
382};
383
384struct ioc_pcpu_stat {
385 struct ioc_missed missed[2];
386
Tejun Heo5e124f72020-09-01 14:52:33 -0400387 local64_t rq_wait_ns;
Tejun Heo7caa4712019-08-28 15:05:58 -0700388 u64 last_rq_wait_ns;
389};
390
391/* per device */
392struct ioc {
393 struct rq_qos rqos;
394
395 bool enabled;
396
397 struct ioc_params params;
398 u32 period_us;
399 u32 margin_us;
400 u64 vrate_min;
401 u64 vrate_max;
402
403 spinlock_t lock;
404 struct timer_list timer;
405 struct list_head active_iocgs; /* active cgroups */
406 struct ioc_pcpu_stat __percpu *pcpu_stat;
407
408 enum ioc_running running;
409 atomic64_t vtime_rate;
410
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +0200411 seqcount_spinlock_t period_seqcount;
Tejun Heoce955702020-09-01 14:52:40 -0400412 u64 period_at; /* wallclock starttime */
Tejun Heo7caa4712019-08-28 15:05:58 -0700413 u64 period_at_vtime; /* vtime starttime */
414
415 atomic64_t cur_period; /* inc'd each period */
416 int busy_level; /* saturation history */
417
418 u64 inuse_margin_vtime;
419 bool weights_updated;
420 atomic_t hweight_gen; /* for lazy hweights */
421
422 u64 autop_too_fast_at;
423 u64 autop_too_slow_at;
424 int autop_idx;
425 bool user_qos_params:1;
426 bool user_cost_model:1;
427};
428
429/* per device-cgroup pair */
430struct ioc_gq {
431 struct blkg_policy_data pd;
432 struct ioc *ioc;
433
434 /*
	 * An iocg can get its weight from two sources - an explicit
436 * per-device-cgroup configuration or the default weight of the
437 * cgroup. `cfg_weight` is the explicit per-device-cgroup
	 * configuration. `weight` is the effective weight considering
	 * both sources.
440 *
441 * When an idle cgroup becomes active its `active` goes from 0 to
442 * `weight`. `inuse` is the surplus adjusted active weight.
443 * `active` and `inuse` are used to calculate `hweight_active` and
444 * `hweight_inuse`.
445 *
446 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 * surplus adjustments.
448 */
449 u32 cfg_weight;
450 u32 weight;
451 u32 active;
452 u32 inuse;
453 u32 last_inuse;
454
455 sector_t cursor; /* to detect randio */
456
457 /*
458 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 * issued. If lagging behind device vtime, the delta represents
	 * the currently available IO budget. If running ahead, the
461 * overage.
462 *
463 * `vtime_done` is the same but progressed on completion rather
464 * than issue. The delta behind `vtime` represents the cost of
465 * currently in-flight IOs.
466 *
467 * `last_vtime` is used to remember `vtime` at the end of the last
468 * period to calculate utilization.
469 */
470 atomic64_t vtime;
471 atomic64_t done_vtime;
Tejun Heo0b80f982020-05-04 19:27:54 -0400472 u64 abs_vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -0700473 u64 last_vtime;
474
475 /*
476 * The period this iocg was last active in. Used for deactivation
477 * and invalidating `vtime`.
478 */
479 atomic64_t active_period;
480 struct list_head active_list;
481
Tejun Heo00410f12020-09-01 14:52:34 -0400482 /* see __propagate_weights() and current_hweight() for details */
Tejun Heo7caa4712019-08-28 15:05:58 -0700483 u64 child_active_sum;
484 u64 child_inuse_sum;
485 int hweight_gen;
486 u32 hweight_active;
487 u32 hweight_inuse;
488 bool has_surplus;
489
490 struct wait_queue_head waitq;
491 struct hrtimer waitq_timer;
492 struct hrtimer delay_timer;
493
Tejun Heofe20cdb52020-09-01 14:52:38 -0400494 /* usage is recorded as fractions of WEIGHT_ONE */
Tejun Heo7caa4712019-08-28 15:05:58 -0700495 int usage_idx;
496 u32 usages[NR_USAGE_SLOTS];
497
498 /* this iocg's depth in the hierarchy and ancestors including self */
499 int level;
500 struct ioc_gq *ancestors[];
501};
502
503/* per cgroup */
504struct ioc_cgrp {
505 struct blkcg_policy_data cpd;
506 unsigned int dfl_weight;
507};
508
509struct ioc_now {
510 u64 now_ns;
Tejun Heoce955702020-09-01 14:52:40 -0400511 u64 now;
Tejun Heo7caa4712019-08-28 15:05:58 -0700512 u64 vnow;
513 u64 vrate;
514};
515
516struct iocg_wait {
517 struct wait_queue_entry wait;
518 struct bio *bio;
519 u64 abs_cost;
520 bool committed;
521};
522
523struct iocg_wake_ctx {
524 struct ioc_gq *iocg;
525 u32 hw_inuse;
526 s64 vbudget;
527};
528
529static const struct ioc_params autop[] = {
530 [AUTOP_HDD] = {
531 .qos = {
Tejun Heo7afccca2019-09-25 16:03:35 -0700532 [QOS_RLAT] = 250000, /* 250ms */
533 [QOS_WLAT] = 250000,
Tejun Heo7caa4712019-08-28 15:05:58 -0700534 [QOS_MIN] = VRATE_MIN_PPM,
535 [QOS_MAX] = VRATE_MAX_PPM,
536 },
537 .i_lcoefs = {
538 [I_LCOEF_RBPS] = 174019176,
539 [I_LCOEF_RSEQIOPS] = 41708,
540 [I_LCOEF_RRANDIOPS] = 370,
541 [I_LCOEF_WBPS] = 178075866,
542 [I_LCOEF_WSEQIOPS] = 42705,
543 [I_LCOEF_WRANDIOPS] = 378,
544 },
545 },
546 [AUTOP_SSD_QD1] = {
547 .qos = {
548 [QOS_RLAT] = 25000, /* 25ms */
549 [QOS_WLAT] = 25000,
550 [QOS_MIN] = VRATE_MIN_PPM,
551 [QOS_MAX] = VRATE_MAX_PPM,
552 },
553 .i_lcoefs = {
554 [I_LCOEF_RBPS] = 245855193,
555 [I_LCOEF_RSEQIOPS] = 61575,
556 [I_LCOEF_RRANDIOPS] = 6946,
557 [I_LCOEF_WBPS] = 141365009,
558 [I_LCOEF_WSEQIOPS] = 33716,
559 [I_LCOEF_WRANDIOPS] = 26796,
560 },
561 },
562 [AUTOP_SSD_DFL] = {
563 .qos = {
564 [QOS_RLAT] = 25000, /* 25ms */
565 [QOS_WLAT] = 25000,
566 [QOS_MIN] = VRATE_MIN_PPM,
567 [QOS_MAX] = VRATE_MAX_PPM,
568 },
569 .i_lcoefs = {
570 [I_LCOEF_RBPS] = 488636629,
571 [I_LCOEF_RSEQIOPS] = 8932,
572 [I_LCOEF_RRANDIOPS] = 8518,
573 [I_LCOEF_WBPS] = 427891549,
574 [I_LCOEF_WSEQIOPS] = 28755,
575 [I_LCOEF_WRANDIOPS] = 21940,
576 },
577 .too_fast_vrate_pct = 500,
578 },
579 [AUTOP_SSD_FAST] = {
580 .qos = {
581 [QOS_RLAT] = 5000, /* 5ms */
582 [QOS_WLAT] = 5000,
583 [QOS_MIN] = VRATE_MIN_PPM,
584 [QOS_MAX] = VRATE_MAX_PPM,
585 },
586 .i_lcoefs = {
587 [I_LCOEF_RBPS] = 3102524156LLU,
588 [I_LCOEF_RSEQIOPS] = 724816,
589 [I_LCOEF_RRANDIOPS] = 778122,
590 [I_LCOEF_WBPS] = 1742780862LLU,
591 [I_LCOEF_WSEQIOPS] = 425702,
592 [I_LCOEF_WRANDIOPS] = 443193,
593 },
594 .too_slow_vrate_pct = 10,
595 },
596};
597
598/*
599 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
600 * vtime credit shortage and down on device saturation.
601 */
602static u32 vrate_adj_pct[] =
603 { 0, 0, 0, 0,
604 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
605 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
606 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
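/*
 * Illustrative example: at busy_level 40, vrate_adj_pct[40] is 4, so the
 * period timer scales vrate by 96% for that period (or by 104% at
 * busy_level -40 when the device looks underutilized).
 */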
607
608static struct blkcg_policy blkcg_policy_iocost;
609
610/* accessors and helpers */
611static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612{
613 return container_of(rqos, struct ioc, rqos);
614}
615
616static struct ioc *q_to_ioc(struct request_queue *q)
617{
618 return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
619}
620
621static const char *q_name(struct request_queue *q)
622{
623 if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
624 return kobject_name(q->kobj.parent);
625 else
626 return "<unknown>";
627}
628
629static const char __maybe_unused *ioc_name(struct ioc *ioc)
630{
631 return q_name(ioc->rqos.q);
632}
633
634static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635{
636 return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
637}
638
639static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640{
641 return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
642}
643
644static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645{
646 return pd_to_blkg(&iocg->pd);
647}
648
649static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650{
651 return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
652 struct ioc_cgrp, cpd);
653}
654
655/*
656 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
Tejun Heo36a52482019-09-04 12:45:52 -0700657 * weight, the more expensive each IO. Must round up.
Tejun Heo7caa4712019-08-28 15:05:58 -0700658 */
659static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400661 return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -0700662}
663
Tejun Heo36a52482019-09-04 12:45:52 -0700664/*
665 * The inverse of abs_cost_to_cost(). Must round up.
666 */
667static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668{
Tejun Heofe20cdb52020-09-01 14:52:38 -0400669 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
Tejun Heo36a52482019-09-04 12:45:52 -0700670}
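/*
 * Illustrative numbers: with hw_inuse at 50% (WEIGHT_ONE / 2),
 * abs_cost_to_cost(100, hw_inuse) == 200 and cost_to_abs_cost(200,
 * hw_inuse) == 100 - vtime is charged at the inverse of the
 * hierarchical share.
 */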
671
Tejun Heo7caa4712019-08-28 15:05:58 -0700672static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
673{
674 bio->bi_iocost_cost = cost;
675 atomic64_add(cost, &iocg->vtime);
676}
677
678#define CREATE_TRACE_POINTS
679#include <trace/events/iocost.h>
680
681/* latency Qos params changed, update period_us and all the dependent params */
682static void ioc_refresh_period_us(struct ioc *ioc)
683{
684 u32 ppm, lat, multi, period_us;
685
686 lockdep_assert_held(&ioc->lock);
687
688 /* pick the higher latency target */
689 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
690 ppm = ioc->params.qos[QOS_RPPM];
691 lat = ioc->params.qos[QOS_RLAT];
692 } else {
693 ppm = ioc->params.qos[QOS_WPPM];
694 lat = ioc->params.qos[QOS_WLAT];
695 }
696
697 /*
698 * We want the period to be long enough to contain a healthy number
699 * of IOs while short enough for granular control. Define it as a
700 * multiple of the latency target. Ideally, the multiplier should
701 * be scaled according to the percentile so that it would nominally
702 * contain a certain number of requests. Let's be simpler and
703 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
704 */
705 if (ppm)
706 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
707 else
708 multi = 2;
709 period_us = multi * lat;
710 period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
711
712 /* calculate dependent params */
713 ioc->period_us = period_us;
714 ioc->margin_us = period_us * MARGIN_PCT / 100;
715 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
716 period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
717}
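/*
 * Worked example (illustrative numbers): with a 95th percentile read
 * latency target of 25ms (ppm = 950000, lat = 25000), multi becomes
 * max((MILLION - 950000) / 50000, 2) == 2 and period_us ends up at
 * 50000us, comfortably within [MIN_PERIOD, MAX_PERIOD].
 */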
718
719static int ioc_autop_idx(struct ioc *ioc)
720{
721 int idx = ioc->autop_idx;
722 const struct ioc_params *p = &autop[idx];
723 u32 vrate_pct;
724 u64 now_ns;
725
726 /* rotational? */
727 if (!blk_queue_nonrot(ioc->rqos.q))
728 return AUTOP_HDD;
729
730 /* handle SATA SSDs w/ broken NCQ */
731 if (blk_queue_depth(ioc->rqos.q) == 1)
732 return AUTOP_SSD_QD1;
733
734 /* use one of the normal ssd sets */
735 if (idx < AUTOP_SSD_DFL)
736 return AUTOP_SSD_DFL;
737
738 /* if user is overriding anything, maintain what was there */
739 if (ioc->user_qos_params || ioc->user_cost_model)
740 return idx;
741
742 /* step up/down based on the vrate */
743 vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
744 VTIME_PER_USEC);
745 now_ns = ktime_get_ns();
746
747 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
748 if (!ioc->autop_too_fast_at)
749 ioc->autop_too_fast_at = now_ns;
750 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
751 return idx + 1;
752 } else {
753 ioc->autop_too_fast_at = 0;
754 }
755
756 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
757 if (!ioc->autop_too_slow_at)
758 ioc->autop_too_slow_at = now_ns;
759 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
760 return idx - 1;
761 } else {
762 ioc->autop_too_slow_at = 0;
763 }
764
765 return idx;
766}
767
768/*
 * Take the following as inputs
770 *
771 * @bps maximum sequential throughput
772 * @seqiops maximum sequential 4k iops
773 * @randiops maximum random 4k iops
774 *
775 * and calculate the linear model cost coefficients.
776 *
777 * *@page per-page cost 1s / (@bps / 4096)
778 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
780 */
781static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
782 u64 *page, u64 *seqio, u64 *randio)
783{
784 u64 v;
785
786 *page = *seqio = *randio = 0;
787
788 if (bps)
789 *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
790 DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
791
792 if (seqiops) {
793 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
794 if (v > *page)
795 *seqio = v - *page;
796 }
797
798 if (randiops) {
799 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
800 if (v > *page)
801 *randio = v - *page;
802 }
803}
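/*
 * Rough illustration (assumed numbers): a device sustaining ~400MB/s of
 * sequential bandwidth (about 102400 4k pages/s) gets
 * *page = VTIME_PER_SEC / 102400, i.e. ~1.34e6 vtime units per page;
 * *seqio and *randio then only carry whatever per-IO cost remains above
 * that per-page baseline.
 */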
804
805static void ioc_refresh_lcoefs(struct ioc *ioc)
806{
807 u64 *u = ioc->params.i_lcoefs;
808 u64 *c = ioc->params.lcoefs;
809
810 calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
811 &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
812 calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
813 &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
814}
815
816static bool ioc_refresh_params(struct ioc *ioc, bool force)
817{
818 const struct ioc_params *p;
819 int idx;
820
821 lockdep_assert_held(&ioc->lock);
822
823 idx = ioc_autop_idx(ioc);
824 p = &autop[idx];
825
826 if (idx == ioc->autop_idx && !force)
827 return false;
828
829 if (idx != ioc->autop_idx)
830 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
831
832 ioc->autop_idx = idx;
833 ioc->autop_too_fast_at = 0;
834 ioc->autop_too_slow_at = 0;
835
836 if (!ioc->user_qos_params)
837 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
838 if (!ioc->user_cost_model)
839 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
840
841 ioc_refresh_period_us(ioc);
842 ioc_refresh_lcoefs(ioc);
843
844 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
845 VTIME_PER_USEC, MILLION);
846 ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
847 VTIME_PER_USEC, MILLION);
848
849 return true;
850}
851
852/* take a snapshot of the current [v]time and vrate */
853static void ioc_now(struct ioc *ioc, struct ioc_now *now)
854{
855 unsigned seq;
856
857 now->now_ns = ktime_get();
858 now->now = ktime_to_us(now->now_ns);
859 now->vrate = atomic64_read(&ioc->vtime_rate);
860
861 /*
862 * The current vtime is
863 *
864 * vtime at period start + (wallclock time since the start) * vrate
865 *
866 * As a consistent snapshot of `period_at_vtime` and `period_at` is
867 * needed, they're seqcount protected.
868 */
869 do {
870 seq = read_seqcount_begin(&ioc->period_seqcount);
871 now->vnow = ioc->period_at_vtime +
872 (now->now - ioc->period_at) * now->vrate;
873 } while (read_seqcount_retry(&ioc->period_seqcount, seq));
874}
875
876static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
877{
Tejun Heo7caa4712019-08-28 15:05:58 -0700878 WARN_ON_ONCE(ioc->running != IOC_RUNNING);
879
880 write_seqcount_begin(&ioc->period_seqcount);
881 ioc->period_at = now->now;
882 ioc->period_at_vtime = now->vnow;
883 write_seqcount_end(&ioc->period_seqcount);
884
885 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
886 add_timer(&ioc->timer);
887}
888
889/*
890 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
891 * weight sums and propagate upwards accordingly.
892 */
Tejun Heo00410f12020-09-01 14:52:34 -0400893static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700894{
895 struct ioc *ioc = iocg->ioc;
896 int lvl;
897
898 lockdep_assert_held(&ioc->lock);
899
Tejun Heodb84a722020-09-01 14:52:35 -0400900 inuse = clamp_t(u32, inuse, 1, active);
901
902 if (active == iocg->active && inuse == iocg->inuse)
903 return;
Tejun Heo7caa4712019-08-28 15:05:58 -0700904
905 for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
906 struct ioc_gq *parent = iocg->ancestors[lvl];
907 struct ioc_gq *child = iocg->ancestors[lvl + 1];
908 u32 parent_active = 0, parent_inuse = 0;
909
910 /* update the level sums */
911 parent->child_active_sum += (s32)(active - child->active);
912 parent->child_inuse_sum += (s32)(inuse - child->inuse);
		/* apply the updates */
914 child->active = active;
915 child->inuse = inuse;
916
917 /*
918 * The delta between inuse and active sums indicates that
919 * that much of weight is being given away. Parent's inuse
920 * and active should reflect the ratio.
921 */
922 if (parent->child_active_sum) {
923 parent_active = parent->weight;
924 parent_inuse = DIV64_U64_ROUND_UP(
925 parent_active * parent->child_inuse_sum,
926 parent->child_active_sum);
927 }
928
929 /* do we need to keep walking up? */
930 if (parent_active == parent->active &&
931 parent_inuse == parent->inuse)
932 break;
933
934 active = parent_active;
935 inuse = parent_inuse;
936 }
937
938 ioc->weights_updated = true;
939}
940
Tejun Heo00410f12020-09-01 14:52:34 -0400941static void commit_weights(struct ioc *ioc)
Tejun Heo7caa4712019-08-28 15:05:58 -0700942{
943 lockdep_assert_held(&ioc->lock);
944
945 if (ioc->weights_updated) {
946 /* paired with rmb in current_hweight(), see there */
947 smp_wmb();
948 atomic_inc(&ioc->hweight_gen);
949 ioc->weights_updated = false;
950 }
951}
952
Tejun Heo00410f12020-09-01 14:52:34 -0400953static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse)
Tejun Heo7caa4712019-08-28 15:05:58 -0700954{
Tejun Heo00410f12020-09-01 14:52:34 -0400955 __propagate_weights(iocg, active, inuse);
956 commit_weights(iocg->ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -0700957}
958
959static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
960{
961 struct ioc *ioc = iocg->ioc;
962 int lvl;
963 u32 hwa, hwi;
964 int ioc_gen;
965
966 /* hot path - if uptodate, use cached */
967 ioc_gen = atomic_read(&ioc->hweight_gen);
968 if (ioc_gen == iocg->hweight_gen)
969 goto out;
970
971 /*
Tejun Heo00410f12020-09-01 14:52:34 -0400972 * Paired with wmb in commit_weights(). If we saw the updated
973 * hweight_gen, all the weight updates from __propagate_weights() are
974 * visible too.
Tejun Heo7caa4712019-08-28 15:05:58 -0700975 *
976 * We can race with weight updates during calculation and get it
977 * wrong. However, hweight_gen would have changed and a future
978 * reader will recalculate and we're guaranteed to discard the
979 * wrong result soon.
980 */
981 smp_rmb();
982
Tejun Heofe20cdb52020-09-01 14:52:38 -0400983 hwa = hwi = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -0700984 for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
985 struct ioc_gq *parent = iocg->ancestors[lvl];
986 struct ioc_gq *child = iocg->ancestors[lvl + 1];
Tejun Heobd0adb92020-09-01 14:52:39 -0400987 u64 active_sum = READ_ONCE(parent->child_active_sum);
988 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -0700989 u32 active = READ_ONCE(child->active);
990 u32 inuse = READ_ONCE(child->inuse);
991
992 /* we can race with deactivations and either may read as zero */
993 if (!active_sum || !inuse_sum)
994 continue;
995
Tejun Heobd0adb92020-09-01 14:52:39 -0400996 active_sum = max_t(u64, active, active_sum);
997 hwa = div64_u64((u64)hwa * active, active_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -0700998
Tejun Heobd0adb92020-09-01 14:52:39 -0400999 inuse_sum = max_t(u64, inuse, inuse_sum);
1000 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
Tejun Heo7caa4712019-08-28 15:05:58 -07001001 }
1002
1003 iocg->hweight_active = max_t(u32, hwa, 1);
1004 iocg->hweight_inuse = max_t(u32, hwi, 1);
1005 iocg->hweight_gen = ioc_gen;
1006out:
1007 if (hw_activep)
1008 *hw_activep = iocg->hweight_active;
1009 if (hw_inusep)
1010 *hw_inusep = iocg->hweight_inuse;
1011}
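/*
 * Tying this back to the hierarchy example in the header comment: for
 * A0 (active weight 100 under A:100 vs B:300, sibling A1:100), the walk
 * yields hwa = WEIGHT_ONE * 100/400 * 100/200, i.e. 12.5% of WEIGHT_ONE,
 * matching the 12.5% share described there.
 */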
1012
1013static void weight_updated(struct ioc_gq *iocg)
1014{
1015 struct ioc *ioc = iocg->ioc;
1016 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1017 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1018 u32 weight;
1019
1020 lockdep_assert_held(&ioc->lock);
1021
1022 weight = iocg->cfg_weight ?: iocc->dfl_weight;
1023 if (weight != iocg->weight && iocg->active)
Tejun Heo00410f12020-09-01 14:52:34 -04001024 propagate_weights(iocg, weight,
Tejun Heobd0adb92020-09-01 14:52:39 -04001025 DIV64_U64_ROUND_UP((u64)iocg->inuse * weight,
1026 iocg->weight));
Tejun Heo7caa4712019-08-28 15:05:58 -07001027 iocg->weight = weight;
1028}
1029
1030static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1031{
1032 struct ioc *ioc = iocg->ioc;
1033 u64 last_period, cur_period, max_period_delta;
1034 u64 vtime, vmargin, vmin;
1035 int i;
1036
1037 /*
	 * If we seem to be already active, just update the stamp to tell
	 * the timer that we're still active. We don't mind occasional races.
1040 */
1041 if (!list_empty(&iocg->active_list)) {
1042 ioc_now(ioc, now);
1043 cur_period = atomic64_read(&ioc->cur_period);
1044 if (atomic64_read(&iocg->active_period) != cur_period)
1045 atomic64_set(&iocg->active_period, cur_period);
1046 return true;
1047 }
1048
1049 /* racy check on internal node IOs, treat as root level IOs */
1050 if (iocg->child_active_sum)
1051 return false;
1052
1053 spin_lock_irq(&ioc->lock);
1054
1055 ioc_now(ioc, now);
1056
1057 /* update period */
1058 cur_period = atomic64_read(&ioc->cur_period);
1059 last_period = atomic64_read(&iocg->active_period);
1060 atomic64_set(&iocg->active_period, cur_period);
1061
1062 /* already activated or breaking leaf-only constraint? */
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001063 if (!list_empty(&iocg->active_list))
1064 goto succeed_unlock;
1065 for (i = iocg->level - 1; i > 0; i--)
1066 if (!list_empty(&iocg->ancestors[i]->active_list))
Tejun Heo7caa4712019-08-28 15:05:58 -07001067 goto fail_unlock;
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001068
Tejun Heo7caa4712019-08-28 15:05:58 -07001069 if (iocg->child_active_sum)
1070 goto fail_unlock;
1071
1072 /*
1073 * vtime may wrap when vrate is raised substantially due to
1074 * underestimated IO costs. Look at the period and ignore its
1075 * vtime if the iocg has been idle for too long. Also, cap the
1076 * budget it can start with to the margin.
1077 */
1078 max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1079 vtime = atomic64_read(&iocg->vtime);
1080 vmargin = ioc->margin_us * now->vrate;
1081 vmin = now->vnow - vmargin;
1082
1083 if (last_period + max_period_delta < cur_period ||
1084 time_before64(vtime, vmin)) {
1085 atomic64_add(vmin - vtime, &iocg->vtime);
1086 atomic64_add(vmin - vtime, &iocg->done_vtime);
1087 vtime = vmin;
1088 }
1089
1090 /*
1091 * Activate, propagate weight and start period timer if not
1092 * running. Reset hweight_gen to avoid accidental match from
1093 * wrapping.
1094 */
1095 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1096 list_add(&iocg->active_list, &ioc->active_iocgs);
Tejun Heo00410f12020-09-01 14:52:34 -04001097 propagate_weights(iocg, iocg->weight,
1098 iocg->last_inuse ?: iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001099
1100 TRACE_IOCG_PATH(iocg_activate, iocg, now,
1101 last_period, cur_period, vtime);
1102
1103 iocg->last_vtime = vtime;
1104
1105 if (ioc->running == IOC_IDLE) {
1106 ioc->running = IOC_RUNNING;
1107 ioc_start_period(ioc, now);
1108 }
1109
Jiufei Xue8b37bc22019-11-13 15:21:31 +08001110succeed_unlock:
Tejun Heo7caa4712019-08-28 15:05:58 -07001111 spin_unlock_irq(&ioc->lock);
1112 return true;
1113
1114fail_unlock:
1115 spin_unlock_irq(&ioc->lock);
1116 return false;
1117}
1118
Tejun Heo6ef20f72020-09-01 14:52:36 -04001119static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1120{
1121 struct ioc *ioc = iocg->ioc;
1122 struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1123 u64 vtime = atomic64_read(&iocg->vtime);
1124 u64 vmargin = ioc->margin_us * now->vrate;
1125 u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1126 u64 delta_ns, expires, oexpires;
1127 u32 hw_inuse;
1128
1129 lockdep_assert_held(&iocg->waitq.lock);
1130
1131 /* debt-adjust vtime */
1132 current_hweight(iocg, NULL, &hw_inuse);
1133 vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1134
1135 /*
1136 * Clear or maintain depending on the overage. Non-zero vdebt is what
1137 * guarantees that @iocg is online and future iocg_kick_delay() will
1138 * clear use_delay. Don't leave it on when there's no vdebt.
1139 */
1140 if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1141 blkcg_clear_delay(blkg);
1142 return false;
1143 }
1144 if (!atomic_read(&blkg->use_delay) &&
1145 time_before_eq64(vtime, now->vnow + vmargin))
1146 return false;
1147
1148 /* use delay */
1149 delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1150 now->vrate) * NSEC_PER_USEC;
1151 blkcg_set_delay(blkg, delta_ns);
1152 expires = now->now_ns + delta_ns;
1153
1154 /* if already active and close enough, don't bother */
1155 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1156 if (hrtimer_is_queued(&iocg->delay_timer) &&
1157 abs(oexpires - expires) <= margin_ns / 4)
1158 return true;
1159
1160 hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1161 margin_ns / 4, HRTIMER_MODE_ABS);
1162 return true;
1163}
1164
1165static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1166{
1167 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1168 struct ioc_now now;
1169 unsigned long flags;
1170
1171 spin_lock_irqsave(&iocg->waitq.lock, flags);
1172 ioc_now(iocg->ioc, &now);
1173 iocg_kick_delay(iocg, &now);
1174 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1175
1176 return HRTIMER_NORESTART;
1177}
1178
Tejun Heo7caa4712019-08-28 15:05:58 -07001179static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1180 int flags, void *key)
1181{
1182 struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1183 struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1184 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1185
1186 ctx->vbudget -= cost;
1187
1188 if (ctx->vbudget < 0)
1189 return -1;
1190
1191 iocg_commit_bio(ctx->iocg, wait->bio, cost);
1192
1193 /*
1194 * autoremove_wake_function() removes the wait entry only when it
1195 * actually changed the task state. We want the wait always
1196 * removed. Remove explicitly and use default_wake_function().
1197 */
1198 list_del_init(&wq_entry->entry);
1199 wait->committed = true;
1200
1201 default_wake_function(wq_entry, mode, flags, key);
1202 return 0;
1203}
1204
1205static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1206{
1207 struct ioc *ioc = iocg->ioc;
1208 struct iocg_wake_ctx ctx = { .iocg = iocg };
1209 u64 margin_ns = (u64)(ioc->period_us *
1210 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
Tejun Heo0b80f982020-05-04 19:27:54 -04001211 u64 vdebt, vshortage, expires, oexpires;
Tejun Heo36a52482019-09-04 12:45:52 -07001212 s64 vbudget;
1213 u32 hw_inuse;
Tejun Heo7caa4712019-08-28 15:05:58 -07001214
1215 lockdep_assert_held(&iocg->waitq.lock);
1216
Tejun Heo36a52482019-09-04 12:45:52 -07001217 current_hweight(iocg, NULL, &hw_inuse);
1218 vbudget = now->vnow - atomic64_read(&iocg->vtime);
1219
1220 /* pay off debt */
Tejun Heo0b80f982020-05-04 19:27:54 -04001221 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
Tejun Heo36a52482019-09-04 12:45:52 -07001222 if (vdebt && vbudget > 0) {
1223 u64 delta = min_t(u64, vbudget, vdebt);
1224 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
Tejun Heo0b80f982020-05-04 19:27:54 -04001225 iocg->abs_vdebt);
Tejun Heo36a52482019-09-04 12:45:52 -07001226
1227 atomic64_add(delta, &iocg->vtime);
1228 atomic64_add(delta, &iocg->done_vtime);
Tejun Heo0b80f982020-05-04 19:27:54 -04001229 iocg->abs_vdebt -= abs_delta;
Tejun Heo7b84b492020-09-01 14:52:37 -04001230
1231 iocg_kick_delay(iocg, now);
Tejun Heo36a52482019-09-04 12:45:52 -07001232 }
1233
Tejun Heo7caa4712019-08-28 15:05:58 -07001234 /*
1235 * Wake up the ones which are due and see how much vtime we'll need
1236 * for the next one.
1237 */
Tejun Heo36a52482019-09-04 12:45:52 -07001238 ctx.hw_inuse = hw_inuse;
1239 ctx.vbudget = vbudget - vdebt;
Tejun Heo7caa4712019-08-28 15:05:58 -07001240 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1241 if (!waitqueue_active(&iocg->waitq))
1242 return;
1243 if (WARN_ON_ONCE(ctx.vbudget >= 0))
1244 return;
1245
1246 /* determine next wakeup, add a quarter margin to guarantee chunking */
1247 vshortage = -ctx.vbudget;
1248 expires = now->now_ns +
1249 DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1250 expires += margin_ns / 4;
1251
1252 /* if already active and close enough, don't bother */
1253 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1254 if (hrtimer_is_queued(&iocg->waitq_timer) &&
1255 abs(oexpires - expires) <= margin_ns / 4)
1256 return;
1257
1258 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1259 margin_ns / 4, HRTIMER_MODE_ABS);
1260}
1261
1262static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1263{
1264 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1265 struct ioc_now now;
1266 unsigned long flags;
1267
1268 ioc_now(iocg->ioc, &now);
1269
1270 spin_lock_irqsave(&iocg->waitq.lock, flags);
1271 iocg_kick_waitq(iocg, &now);
1272 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1273
1274 return HRTIMER_NORESTART;
1275}
1276
Tejun Heo7caa4712019-08-28 15:05:58 -07001277static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1278{
1279 u32 nr_met[2] = { };
1280 u32 nr_missed[2] = { };
1281 u64 rq_wait_ns = 0;
1282 int cpu, rw;
1283
1284 for_each_online_cpu(cpu) {
1285 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1286 u64 this_rq_wait_ns;
1287
1288 for (rw = READ; rw <= WRITE; rw++) {
Tejun Heo5e124f72020-09-01 14:52:33 -04001289 u32 this_met = local_read(&stat->missed[rw].nr_met);
1290 u32 this_missed = local_read(&stat->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001291
1292 nr_met[rw] += this_met - stat->missed[rw].last_met;
1293 nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1294 stat->missed[rw].last_met = this_met;
1295 stat->missed[rw].last_missed = this_missed;
1296 }
1297
Tejun Heo5e124f72020-09-01 14:52:33 -04001298 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
Tejun Heo7caa4712019-08-28 15:05:58 -07001299 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1300 stat->last_rq_wait_ns = this_rq_wait_ns;
1301 }
1302
1303 for (rw = READ; rw <= WRITE; rw++) {
1304 if (nr_met[rw] + nr_missed[rw])
1305 missed_ppm_ar[rw] =
1306 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1307 nr_met[rw] + nr_missed[rw]);
1308 else
1309 missed_ppm_ar[rw] = 0;
1310 }
1311
1312 *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1313 ioc->period_us * NSEC_PER_USEC);
1314}
1315
1316/* was iocg idle this period? */
1317static bool iocg_is_idle(struct ioc_gq *iocg)
1318{
1319 struct ioc *ioc = iocg->ioc;
1320
1321 /* did something get issued this period? */
1322 if (atomic64_read(&iocg->active_period) ==
1323 atomic64_read(&ioc->cur_period))
1324 return false;
1325
1326 /* is something in flight? */
Tejun Heodcd65892020-03-10 13:07:46 -04001327 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
Tejun Heo7caa4712019-08-28 15:05:58 -07001328 return false;
1329
1330 return true;
1331}
1332
1333/* returns usage with margin added if surplus is large enough */
1334static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1335{
1336 /* add margin */
1337 usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1338 usage += SURPLUS_SCALE_ABS;
1339
1340 /* don't bother if the surplus is too small */
1341 if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1342 return 0;
1343
1344 return usage;
1345}
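/*
 * Illustrative numbers: with usage at 20% and hw_inuse at 50% of
 * WEIGHT_ONE, the margined usage is 20% * 1.25 + 2% = 27%; as that is
 * more than 3% below 50%, 27% is returned and the difference can be
 * donated.
 */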
1346
1347static void ioc_timer_fn(struct timer_list *timer)
1348{
1349 struct ioc *ioc = container_of(timer, struct ioc, timer);
1350 struct ioc_gq *iocg, *tiocg;
1351 struct ioc_now now;
1352 int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1353 u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1354 u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1355 u32 missed_ppm[2], rq_wait_pct;
1356 u64 period_vtime;
Tejun Heo25d41e42019-09-25 16:02:07 -07001357 int prev_busy_level, i;
Tejun Heo7caa4712019-08-28 15:05:58 -07001358
1359 /* how were the latencies during the period? */
1360 ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1361
1362 /* take care of active iocgs */
1363 spin_lock_irq(&ioc->lock);
1364
1365 ioc_now(ioc, &now);
1366
1367 period_vtime = now.vnow - ioc->period_at_vtime;
1368 if (WARN_ON_ONCE(!period_vtime)) {
1369 spin_unlock_irq(&ioc->lock);
1370 return;
1371 }
1372
1373 /*
1374 * Waiters determine the sleep durations based on the vrate they
1375 * saw at the time of sleep. If vrate has increased, some waiters
1376 * could be sleeping for too long. Wake up tardy waiters which
1377 * should have woken up in the last period and expire idle iocgs.
1378 */
1379 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
Chengming Zhoud9012a52020-07-30 17:03:21 +08001380 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001381 !iocg_is_idle(iocg))
Tejun Heo7caa4712019-08-28 15:05:58 -07001382 continue;
1383
1384 spin_lock(&iocg->waitq.lock);
1385
Tejun Heo0b80f982020-05-04 19:27:54 -04001386 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001387 /* might be oversleeping vtime / hweight changes, kick */
1388 iocg_kick_waitq(iocg, &now);
Tejun Heo7caa4712019-08-28 15:05:58 -07001389 } else if (iocg_is_idle(iocg)) {
1390 /* no waiter and idle, deactivate */
1391 iocg->last_inuse = iocg->inuse;
Tejun Heo00410f12020-09-01 14:52:34 -04001392 __propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07001393 list_del_init(&iocg->active_list);
1394 }
1395
1396 spin_unlock(&iocg->waitq.lock);
1397 }
Tejun Heo00410f12020-09-01 14:52:34 -04001398 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001399
1400 /* calc usages and see whether some weights need to be moved around */
1401 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1402 u64 vdone, vtime, vusage, vmargin, vmin;
1403 u32 hw_active, hw_inuse, usage;
1404
1405 /*
1406 * Collect unused and wind vtime closer to vnow to prevent
1407 * iocgs from accumulating a large amount of budget.
1408 */
1409 vdone = atomic64_read(&iocg->done_vtime);
1410 vtime = atomic64_read(&iocg->vtime);
1411 current_hweight(iocg, &hw_active, &hw_inuse);
1412
1413 /*
1414 * Latency QoS detection doesn't account for IOs which are
1415 * in-flight for longer than a period. Detect them by
1416 * comparing vdone against period start. If lagging behind
1417 * IOs from past periods, don't increase vrate.
1418 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001419 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1420 !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001421 time_after64(vtime, vdone) &&
1422 time_after64(vtime, now.vnow -
1423 MAX_LAGGING_PERIODS * period_vtime) &&
1424 time_before64(vdone, now.vnow - period_vtime))
1425 nr_lagging++;
1426
1427 if (waitqueue_active(&iocg->waitq))
1428 vusage = now.vnow - iocg->last_vtime;
1429 else if (time_before64(iocg->last_vtime, vtime))
1430 vusage = vtime - iocg->last_vtime;
1431 else
1432 vusage = 0;
1433
1434 iocg->last_vtime += vusage;
1435 /*
1436 * Factor in in-flight vtime into vusage to avoid
1437 * high-latency completions appearing as idle. This should
		 * be done after the above ->last_vtime adjustment.
1439 */
1440 vusage = max(vusage, vtime - vdone);
1441
1442 /* calculate hweight based usage ratio and record */
1443 if (vusage) {
1444 usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1445 period_vtime);
1446 iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1447 iocg->usages[iocg->usage_idx] = usage;
1448 } else {
1449 usage = 0;
1450 }
1451
1452 /* see whether there's surplus vtime */
1453 vmargin = ioc->margin_us * now.vrate;
1454 vmin = now.vnow - vmargin;
1455
1456 iocg->has_surplus = false;
1457
1458 if (!waitqueue_active(&iocg->waitq) &&
1459 time_before64(vtime, vmin)) {
1460 u64 delta = vmin - vtime;
1461
1462 /* throw away surplus vtime */
1463 atomic64_add(delta, &iocg->vtime);
1464 atomic64_add(delta, &iocg->done_vtime);
1465 iocg->last_vtime += delta;
1466 /* if usage is sufficiently low, maybe it can donate */
1467 if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1468 iocg->has_surplus = true;
1469 nr_surpluses++;
1470 }
1471 } else if (hw_inuse < hw_active) {
1472 u32 new_hwi, new_inuse;
1473
1474 /* was donating but might need to take back some */
1475 if (waitqueue_active(&iocg->waitq)) {
1476 new_hwi = hw_active;
1477 } else {
1478 new_hwi = max(hw_inuse,
1479 usage * SURPLUS_SCALE_PCT / 100 +
1480 SURPLUS_SCALE_ABS);
1481 }
1482
1483 new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1484 hw_inuse);
1485 new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1486
1487 if (new_inuse > iocg->inuse) {
1488 TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1489 iocg->inuse, new_inuse,
1490 hw_inuse, new_hwi);
Tejun Heo00410f12020-09-01 14:52:34 -04001491 __propagate_weights(iocg, iocg->weight,
1492 new_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -07001493 }
1494 } else {
			/* genuinely out of vtime */
1496 nr_shortages++;
1497 }
1498 }
1499
1500 if (!nr_shortages || !nr_surpluses)
1501 goto skip_surplus_transfers;
1502
1503 /* there are both shortages and surpluses, transfer surpluses */
1504 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1505 u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1506 int nr_valid = 0;
1507
1508 if (!iocg->has_surplus)
1509 continue;
1510
1511 /* base the decision on max historical usage */
1512 for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1513 if (iocg->usages[i]) {
1514 usage = max(usage, iocg->usages[i]);
1515 nr_valid++;
1516 }
1517 }
1518 if (nr_valid < MIN_VALID_USAGES)
1519 continue;
1520
1521 current_hweight(iocg, &hw_active, &hw_inuse);
1522 new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1523 if (!new_hwi)
1524 continue;
1525
1526 new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1527 hw_inuse);
1528 if (new_inuse < iocg->inuse) {
1529 TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1530 iocg->inuse, new_inuse,
1531 hw_inuse, new_hwi);
Tejun Heo00410f12020-09-01 14:52:34 -04001532 __propagate_weights(iocg, iocg->weight, new_inuse);
Tejun Heo7caa4712019-08-28 15:05:58 -07001533 }
1534 }
1535skip_surplus_transfers:
Tejun Heo00410f12020-09-01 14:52:34 -04001536 commit_weights(ioc);
Tejun Heo7caa4712019-08-28 15:05:58 -07001537
1538 /*
1539 * If q is getting clogged or we're missing too much, we're issuing
1540 * too much IO and should lower vtime rate. If we're not missing
1541 * and experiencing shortages but not surpluses, we're too stingy
1542 * and should increase vtime rate.
1543 */
Tejun Heo25d41e42019-09-25 16:02:07 -07001544 prev_busy_level = ioc->busy_level;
Tejun Heo7caa4712019-08-28 15:05:58 -07001545 if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1546 missed_ppm[READ] > ppm_rthr ||
1547 missed_ppm[WRITE] > ppm_wthr) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001548 /* clearly missing QoS targets, slow down vrate */
Tejun Heo7caa4712019-08-28 15:05:58 -07001549 ioc->busy_level = max(ioc->busy_level, 0);
1550 ioc->busy_level++;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001551 } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001552 missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1553 missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
Tejun Heo81ca6272019-10-14 17:18:11 -07001554 /* QoS targets are being met with >25% margin */
1555 if (nr_shortages) {
1556 /*
1557 * We're throttling while the device has spare
1558 * capacity. If vrate was being slowed down, stop.
1559 */
Tejun Heo7cd806a2019-09-25 16:03:09 -07001560 ioc->busy_level = min(ioc->busy_level, 0);
Tejun Heo81ca6272019-10-14 17:18:11 -07001561
1562 /*
1563 * If there are IOs spanning multiple periods, wait
1564 * them out before pushing the device harder. If
1565 * there are surpluses, let redistribution work it
1566 * out first.
1567 */
1568 if (!nr_lagging && !nr_surpluses)
Tejun Heo7cd806a2019-09-25 16:03:09 -07001569 ioc->busy_level--;
Tejun Heo81ca6272019-10-14 17:18:11 -07001570 } else {
1571 /*
1572 * Nobody is being throttled and the users aren't
1573 * issuing enough IOs to saturate the device. We
1574 * simply don't know how close the device is to
1575 * saturation. Coast.
1576 */
1577 ioc->busy_level = 0;
Tejun Heo7cd806a2019-09-25 16:03:09 -07001578 }
Tejun Heo7caa4712019-08-28 15:05:58 -07001579 } else {
		/* inside the hysteresis margin, we're good */
Tejun Heo7caa4712019-08-28 15:05:58 -07001581 ioc->busy_level = 0;
1582 }
1583
1584 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1585
Tejun Heo7cd806a2019-09-25 16:03:09 -07001586 if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
Tejun Heo7caa4712019-08-28 15:05:58 -07001587 u64 vrate = atomic64_read(&ioc->vtime_rate);
1588 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1589
1590 /* rq_wait signal is always reliable, ignore user vrate_min */
1591 if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1592 vrate_min = VRATE_MIN;
1593
1594 /*
1595 * If vrate is out of bounds, apply clamp gradually as the
1596 * bounds can change abruptly. Otherwise, apply busy_level
1597 * based adjustment.
1598 */
1599 if (vrate < vrate_min) {
1600 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1601 100);
1602 vrate = min(vrate, vrate_min);
1603 } else if (vrate > vrate_max) {
1604 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1605 100);
1606 vrate = max(vrate, vrate_max);
1607 } else {
1608 int idx = min_t(int, abs(ioc->busy_level),
1609 ARRAY_SIZE(vrate_adj_pct) - 1);
1610 u32 adj_pct = vrate_adj_pct[idx];
1611
1612 if (ioc->busy_level > 0)
1613 adj_pct = 100 - adj_pct;
1614 else
1615 adj_pct = 100 + adj_pct;
1616
1617 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1618 vrate_min, vrate_max);
1619 }
1620
Waiman Longd6c8e942020-04-21 09:07:55 -04001621 trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
Tejun Heo7caa4712019-08-28 15:05:58 -07001622 nr_lagging, nr_shortages,
1623 nr_surpluses);
1624
1625 atomic64_set(&ioc->vtime_rate, vrate);
1626 ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1627 ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
Tejun Heo25d41e42019-09-25 16:02:07 -07001628 } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1629 trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
Waiman Longd6c8e942020-04-21 09:07:55 -04001630 missed_ppm, rq_wait_pct, nr_lagging,
Tejun Heo25d41e42019-09-25 16:02:07 -07001631 nr_shortages, nr_surpluses);
Tejun Heo7caa4712019-08-28 15:05:58 -07001632 }
1633
1634 ioc_refresh_params(ioc, false);
1635
1636 /*
1637	 * This period is done. Move on to the next one. If nothing's
1638 * going on with the device, stop the timer.
1639 */
1640 atomic64_inc(&ioc->cur_period);
1641
1642 if (ioc->running != IOC_STOP) {
1643 if (!list_empty(&ioc->active_iocgs)) {
1644 ioc_start_period(ioc, &now);
1645 } else {
1646 ioc->busy_level = 0;
1647 ioc->running = IOC_IDLE;
1648 }
1649 }
1650
1651 spin_unlock_irq(&ioc->lock);
1652}
1653
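/*
 * Builtin linear cost model - a rough summary of what the function below
 * computes.  An IO is charged a base cost depending on whether it's
 * sequential or random (decided by how far its start sector is from
 * iocg->cursor, the end of the last IO) plus a size cost proportional to
 * its length in pages:
 *
 *	cost = coef_{seq|rand}io + nr_pages * coef_page
 *
 * Merges skip the base cost as they don't issue a new IO of their own.
 */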
1654static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1655 bool is_merge, u64 *costp)
1656{
1657 struct ioc *ioc = iocg->ioc;
1658 u64 coef_seqio, coef_randio, coef_page;
1659 u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1660 u64 seek_pages = 0;
1661 u64 cost = 0;
1662
1663 switch (bio_op(bio)) {
1664 case REQ_OP_READ:
1665 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
1666 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
1667 coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
1668 break;
1669 case REQ_OP_WRITE:
1670 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
1671 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
1672 coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
1673 break;
1674 default:
1675 goto out;
1676 }
1677
1678 if (iocg->cursor) {
1679 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1680 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1681 }
1682
1683 if (!is_merge) {
1684 if (seek_pages > LCOEF_RANDIO_PAGES) {
1685 cost += coef_randio;
1686 } else {
1687 cost += coef_seqio;
1688 }
1689 }
1690 cost += pages * coef_page;
1691out:
1692 *costp = cost;
1693}
1694
1695static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1696{
1697 u64 cost;
1698
1699 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1700 return cost;
1701}
1702
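/*
 * Size-only cost charged at completion time.  Unlike the issue path above,
 * only the per-page linear coefficient is applied, based on the sectors
 * that were actually transferred (blk_rq_stats_sectors()).  ioc_rqos_done()
 * uses it to discount the expected transfer time when judging whether a
 * request met its latency target.
 */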
Tejun Heocd006502020-04-13 12:27:56 -04001703static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1704 u64 *costp)
1705{
1706 unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1707
1708 switch (req_op(rq)) {
1709 case REQ_OP_READ:
1710 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1711 break;
1712 case REQ_OP_WRITE:
1713 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1714 break;
1715 default:
1716 *costp = 0;
1717 }
1718}
1719
1720static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1721{
1722 u64 cost;
1723
1724 calc_size_vtime_cost_builtin(rq, ioc, &cost);
1725 return cost;
1726}
1727
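/*
 * Issue-side throttling - a short roadmap of the function below.  The bio's
 * absolute cost is scaled by the cgroup's hweight_inuse into device vtime.
 * If the cgroup is within its vtime budget and nobody is already waiting,
 * the bio is charged and issued immediately.  Otherwise the cost is either
 * recorded as debt (for IOs which can't block - root blkg issue or pending
 * fatal signal) or the task sleeps on iocg->waitq until the waker commits
 * the charge.
 */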
Tejun Heo7caa4712019-08-28 15:05:58 -07001728static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1729{
1730 struct blkcg_gq *blkg = bio->bi_blkg;
1731 struct ioc *ioc = rqos_to_ioc(rqos);
1732 struct ioc_gq *iocg = blkg_to_iocg(blkg);
1733 struct ioc_now now;
1734 struct iocg_wait wait;
1735 u32 hw_active, hw_inuse;
1736 u64 abs_cost, cost, vtime;
1737
1738 /* bypass IOs if disabled or for root cgroup */
1739 if (!ioc->enabled || !iocg->level)
1740 return;
1741
1742 /* always activate so that even 0 cost IOs get protected to some level */
1743 if (!iocg_activate(iocg, &now))
1744 return;
1745
1746 /* calculate the absolute vtime cost */
1747 abs_cost = calc_vtime_cost(bio, iocg, false);
1748 if (!abs_cost)
1749 return;
1750
1751 iocg->cursor = bio_end_sector(bio);
1752
1753 vtime = atomic64_read(&iocg->vtime);
1754 current_hweight(iocg, &hw_active, &hw_inuse);
1755
1756 if (hw_inuse < hw_active &&
1757 time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1758 TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1759 iocg->inuse, iocg->weight, hw_inuse, hw_active);
1760 spin_lock_irq(&ioc->lock);
Tejun Heo00410f12020-09-01 14:52:34 -04001761 propagate_weights(iocg, iocg->weight, iocg->weight);
Tejun Heo7caa4712019-08-28 15:05:58 -07001762 spin_unlock_irq(&ioc->lock);
1763 current_hweight(iocg, &hw_active, &hw_inuse);
1764 }
1765
1766 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1767
1768 /*
1769 * If no one's waiting and within budget, issue right away. The
1770 * tests are racy but the races aren't systemic - we only miss once
1771 * in a while which is fine.
1772 */
Tejun Heo0b80f982020-05-04 19:27:54 -04001773 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
Tejun Heo7caa4712019-08-28 15:05:58 -07001774 time_before_eq64(vtime + cost, now.vnow)) {
1775 iocg_commit_bio(iocg, bio, cost);
1776 return;
1777 }
1778
Tejun Heo36a52482019-09-04 12:45:52 -07001779 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04001780 * We activated above but w/o any synchronization. Deactivation is
1781 * synchronized with waitq.lock and we won't get deactivated as long
1782	 * as we're waiting or have debt, so we're good if we're activated
1783 * here. In the unlikely case that we aren't, just issue the IO.
1784 */
1785 spin_lock_irq(&iocg->waitq.lock);
1786
1787 if (unlikely(list_empty(&iocg->active_list))) {
1788 spin_unlock_irq(&iocg->waitq.lock);
1789 iocg_commit_bio(iocg, bio, cost);
1790 return;
1791 }
1792
1793 /*
1794 * We're over budget. If @bio has to be issued regardless, remember
1795 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1796 * off the debt before waking more IOs.
1797 *
Tejun Heo36a52482019-09-04 12:45:52 -07001798 * This way, the debt is continuously paid off each period with the
Tejun Heo0b80f982020-05-04 19:27:54 -04001799 * actual budget available to the cgroup. If we just wound vtime, we
1800 * would incorrectly use the current hw_inuse for the entire amount
1801 * which, for example, can lead to the cgroup staying blocked for a
1802 * long time even with substantially raised hw_inuse.
1803 *
1804 * An iocg with vdebt should stay online so that the timer can keep
1805	 * deducting its vdebt and [de]activating the use_delay mechanism
1806 * accordingly. We don't want to race against the timer trying to
1807 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1808 * penalizing the cgroup and its descendants.
Tejun Heo36a52482019-09-04 12:45:52 -07001809 */
Tejun Heo7caa4712019-08-28 15:05:58 -07001810 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
Tejun Heo0b80f982020-05-04 19:27:54 -04001811 iocg->abs_vdebt += abs_cost;
Tejun Heo54c52e12020-04-13 12:27:55 -04001812 if (iocg_kick_delay(iocg, &now))
Tejun Heod7bd15a2019-12-16 13:34:00 -08001813 blkcg_schedule_throttle(rqos->q,
1814 (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
Tejun Heo0b80f982020-05-04 19:27:54 -04001815 spin_unlock_irq(&iocg->waitq.lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07001816 return;
1817 }
1818
1819 /*
1820 * Append self to the waitq and schedule the wakeup timer if we're
1821 * the first waiter. The timer duration is calculated based on the
1822 * current vrate. vtime and hweight changes can make it too short
1823 * or too long. Each wait entry records the absolute cost it's
1824 * waiting for to allow re-evaluation using a custom wait entry.
1825 *
1826 * If too short, the timer simply reschedules itself. If too long,
1827 * the period timer will notice and trigger wakeups.
1828 *
1829 * All waiters are on iocg->waitq and the wait states are
1830 * synchronized using waitq.lock.
1831 */
Tejun Heo7caa4712019-08-28 15:05:58 -07001832 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1833 wait.wait.private = current;
1834 wait.bio = bio;
1835 wait.abs_cost = abs_cost;
1836 wait.committed = false; /* will be set true by waker */
1837
1838 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1839 iocg_kick_waitq(iocg, &now);
1840
1841 spin_unlock_irq(&iocg->waitq.lock);
1842
1843 while (true) {
1844 set_current_state(TASK_UNINTERRUPTIBLE);
1845 if (wait.committed)
1846 break;
1847 io_schedule();
1848 }
1849
1850 /* waker already committed us, proceed */
1851 finish_wait(&iocg->waitq, &wait.wait);
1852}
1853
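/*
 * Merges don't issue a new IO of their own, so only the size cost is
 * charged here (calc_vtime_cost() with is_merge == true skips the seq/rand
 * base cost).  If there's budget and the existing request already carries a
 * cost, the merge is charged immediately; otherwise it's accounted as debt
 * while the cgroup is online.
 */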
1854static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1855 struct bio *bio)
1856{
1857 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
Tejun Heoe1518f62019-09-04 12:45:53 -07001858 struct ioc *ioc = iocg->ioc;
Tejun Heo7caa4712019-08-28 15:05:58 -07001859 sector_t bio_end = bio_end_sector(bio);
Tejun Heoe1518f62019-09-04 12:45:53 -07001860 struct ioc_now now;
Tejun Heo7caa4712019-08-28 15:05:58 -07001861 u32 hw_inuse;
1862 u64 abs_cost, cost;
Tejun Heo0b80f982020-05-04 19:27:54 -04001863 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07001864
Tejun Heoe1518f62019-09-04 12:45:53 -07001865 /* bypass if disabled or for root cgroup */
1866 if (!ioc->enabled || !iocg->level)
Tejun Heo7caa4712019-08-28 15:05:58 -07001867 return;
1868
1869 abs_cost = calc_vtime_cost(bio, iocg, true);
1870 if (!abs_cost)
1871 return;
1872
Tejun Heoe1518f62019-09-04 12:45:53 -07001873 ioc_now(ioc, &now);
1874 current_hweight(iocg, NULL, &hw_inuse);
1875 cost = abs_cost_to_cost(abs_cost, hw_inuse);
1876
Tejun Heo7caa4712019-08-28 15:05:58 -07001877 /* update cursor if backmerging into the request at the cursor */
1878 if (blk_rq_pos(rq) < bio_end &&
1879 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1880 iocg->cursor = bio_end;
1881
Tejun Heoe1518f62019-09-04 12:45:53 -07001882 /*
Tejun Heo0b80f982020-05-04 19:27:54 -04001883 * Charge if there's enough vtime budget and the existing request has
1884 * cost assigned.
Tejun Heoe1518f62019-09-04 12:45:53 -07001885 */
1886 if (rq->bio && rq->bio->bi_iocost_cost &&
Tejun Heo0b80f982020-05-04 19:27:54 -04001887 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
Tejun Heoe1518f62019-09-04 12:45:53 -07001888 iocg_commit_bio(iocg, bio, cost);
Tejun Heo0b80f982020-05-04 19:27:54 -04001889 return;
1890 }
1891
1892 /*
1893 * Otherwise, account it as debt if @iocg is online, which it should
1894 * be for the vast majority of cases. See debt handling in
1895 * ioc_rqos_throttle() for details.
1896 */
1897 spin_lock_irqsave(&iocg->waitq.lock, flags);
1898 if (likely(!list_empty(&iocg->active_list))) {
1899 iocg->abs_vdebt += abs_cost;
Jens Axboe873f1c82020-05-09 16:13:58 -06001900 iocg_kick_delay(iocg, &now);
Tejun Heo0b80f982020-05-04 19:27:54 -04001901 } else {
1902 iocg_commit_bio(iocg, bio, cost);
1903 }
1904 spin_unlock_irqrestore(&iocg->waitq.lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07001905}
1906
1907static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1908{
1909 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1910
1911 if (iocg && bio->bi_iocost_cost)
1912 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1913}
1914
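/*
 * Completion accounting.  Per-cpu counters track, separately for reads and
 * writes, whether each request met its latency target once the expected
 * transfer time is discounted, plus the total time spent waiting for
 * request allocation.  ioc_timer_fn() folds these into missed_ppm and
 * rq_wait_pct to drive the vrate adjustments above.
 */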
1915static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1916{
1917 struct ioc *ioc = rqos_to_ioc(rqos);
Tejun Heo5e124f72020-09-01 14:52:33 -04001918 struct ioc_pcpu_stat *ccs;
Tejun Heocd006502020-04-13 12:27:56 -04001919 u64 on_q_ns, rq_wait_ns, size_nsec;
Tejun Heo7caa4712019-08-28 15:05:58 -07001920 int pidx, rw;
1921
1922 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1923 return;
1924
1925 switch (req_op(rq) & REQ_OP_MASK) {
1926 case REQ_OP_READ:
1927 pidx = QOS_RLAT;
1928 rw = READ;
1929 break;
1930 case REQ_OP_WRITE:
1931 pidx = QOS_WLAT;
1932 rw = WRITE;
1933 break;
1934 default:
1935 return;
1936 }
1937
1938 on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1939 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
Tejun Heocd006502020-04-13 12:27:56 -04001940 size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
Tejun Heo7caa4712019-08-28 15:05:58 -07001941
Tejun Heo5e124f72020-09-01 14:52:33 -04001942 ccs = get_cpu_ptr(ioc->pcpu_stat);
1943
Tejun Heocd006502020-04-13 12:27:56 -04001944 if (on_q_ns <= size_nsec ||
1945 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
Tejun Heo5e124f72020-09-01 14:52:33 -04001946 local_inc(&ccs->missed[rw].nr_met);
Tejun Heo7caa4712019-08-28 15:05:58 -07001947 else
Tejun Heo5e124f72020-09-01 14:52:33 -04001948 local_inc(&ccs->missed[rw].nr_missed);
Tejun Heo7caa4712019-08-28 15:05:58 -07001949
Tejun Heo5e124f72020-09-01 14:52:33 -04001950 local64_add(rq_wait_ns, &ccs->rq_wait_ns);
1951
1952 put_cpu_ptr(ccs);
Tejun Heo7caa4712019-08-28 15:05:58 -07001953}
1954
1955static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1956{
1957 struct ioc *ioc = rqos_to_ioc(rqos);
1958
1959 spin_lock_irq(&ioc->lock);
1960 ioc_refresh_params(ioc, false);
1961 spin_unlock_irq(&ioc->lock);
1962}
1963
1964static void ioc_rqos_exit(struct rq_qos *rqos)
1965{
1966 struct ioc *ioc = rqos_to_ioc(rqos);
1967
1968 blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1969
1970 spin_lock_irq(&ioc->lock);
1971 ioc->running = IOC_STOP;
1972 spin_unlock_irq(&ioc->lock);
1973
1974 del_timer_sync(&ioc->timer);
1975 free_percpu(ioc->pcpu_stat);
1976 kfree(ioc);
1977}
1978
1979static struct rq_qos_ops ioc_rqos_ops = {
1980 .throttle = ioc_rqos_throttle,
1981 .merge = ioc_rqos_merge,
1982 .done_bio = ioc_rqos_done_bio,
1983 .done = ioc_rqos_done,
1984 .queue_depth_changed = ioc_rqos_queue_depth_changed,
1985 .exit = ioc_rqos_exit,
1986};
1987
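/*
 * Lazy per-queue initialization, called the first time io.cost.qos or
 * io.cost.model is written for a device.  Allocates the ioc and per-cpu
 * stats, registers the rq_qos ops and activates the iocost blkcg policy on
 * the queue.  The controller itself stays disabled until "enable=1" is
 * written to io.cost.qos.
 */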
1988static int blk_iocost_init(struct request_queue *q)
1989{
1990 struct ioc *ioc;
1991 struct rq_qos *rqos;
Tejun Heo5e124f72020-09-01 14:52:33 -04001992 int i, cpu, ret;
Tejun Heo7caa4712019-08-28 15:05:58 -07001993
1994 ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1995 if (!ioc)
1996 return -ENOMEM;
1997
1998 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1999 if (!ioc->pcpu_stat) {
2000 kfree(ioc);
2001 return -ENOMEM;
2002 }
2003
Tejun Heo5e124f72020-09-01 14:52:33 -04002004 for_each_possible_cpu(cpu) {
2005 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2006
2007 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2008 local_set(&ccs->missed[i].nr_met, 0);
2009 local_set(&ccs->missed[i].nr_missed, 0);
2010 }
2011 local64_set(&ccs->rq_wait_ns, 0);
2012 }
2013
Tejun Heo7caa4712019-08-28 15:05:58 -07002014 rqos = &ioc->rqos;
2015 rqos->id = RQ_QOS_COST;
2016 rqos->ops = &ioc_rqos_ops;
2017 rqos->q = q;
2018
2019 spin_lock_init(&ioc->lock);
2020 timer_setup(&ioc->timer, ioc_timer_fn, 0);
2021 INIT_LIST_HEAD(&ioc->active_iocgs);
2022
2023 ioc->running = IOC_IDLE;
2024 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
Ahmed S. Darwish67b7b642020-07-20 17:55:26 +02002025 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002026 ioc->period_at = ktime_to_us(ktime_get());
2027 atomic64_set(&ioc->cur_period, 0);
2028 atomic_set(&ioc->hweight_gen, 0);
2029
2030 spin_lock_irq(&ioc->lock);
2031 ioc->autop_idx = AUTOP_INVALID;
2032 ioc_refresh_params(ioc, true);
2033 spin_unlock_irq(&ioc->lock);
2034
2035 rq_qos_add(q, rqos);
2036 ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2037 if (ret) {
2038 rq_qos_del(q, rqos);
Tejun Heo3532e722019-08-29 08:53:06 -07002039 free_percpu(ioc->pcpu_stat);
Tejun Heo7caa4712019-08-28 15:05:58 -07002040 kfree(ioc);
2041 return ret;
2042 }
2043 return 0;
2044}
2045
2046static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2047{
2048 struct ioc_cgrp *iocc;
2049
2050 iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
Tejun Heoe916ad22019-08-30 06:10:58 -07002051 if (!iocc)
2052 return NULL;
Tejun Heo7caa4712019-08-28 15:05:58 -07002053
Tejun Heobd0adb92020-09-01 14:52:39 -04002054 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002055 return &iocc->cpd;
2056}
2057
2058static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2059{
2060 kfree(container_of(cpd, struct ioc_cgrp, cpd));
2061}
2062
2063static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2064 struct blkcg *blkcg)
2065{
2066 int levels = blkcg->css.cgroup->level + 1;
2067 struct ioc_gq *iocg;
2068
Gustavo A. R. Silvaf61d6e22020-06-19 18:08:30 -05002069 iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
Tejun Heo7caa4712019-08-28 15:05:58 -07002070 if (!iocg)
2071 return NULL;
2072
2073 return &iocg->pd;
2074}
2075
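/*
 * Per-cgroup (iocg) initialization.  The iocg starts with vtime and
 * done_vtime caught up to the current device vtime, full hierarchical
 * weights, an empty waitq, and its ancestors[] table filled in so hweight
 * propagation can walk the hierarchy without extra lookups.
 */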
2076static void ioc_pd_init(struct blkg_policy_data *pd)
2077{
2078 struct ioc_gq *iocg = pd_to_iocg(pd);
2079 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2080 struct ioc *ioc = q_to_ioc(blkg->q);
2081 struct ioc_now now;
2082 struct blkcg_gq *tblkg;
2083 unsigned long flags;
2084
2085 ioc_now(ioc, &now);
2086
2087 iocg->ioc = ioc;
2088 atomic64_set(&iocg->vtime, now.vnow);
2089 atomic64_set(&iocg->done_vtime, now.vnow);
2090 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2091 INIT_LIST_HEAD(&iocg->active_list);
Tejun Heofe20cdb52020-09-01 14:52:38 -04002092 iocg->hweight_active = WEIGHT_ONE;
2093 iocg->hweight_inuse = WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002094
2095 init_waitqueue_head(&iocg->waitq);
2096 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2097 iocg->waitq_timer.function = iocg_waitq_timer_fn;
2098 hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2099 iocg->delay_timer.function = iocg_delay_timer_fn;
2100
2101 iocg->level = blkg->blkcg->css.cgroup->level;
2102
2103 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2104 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2105 iocg->ancestors[tiocg->level] = tiocg;
2106 }
2107
2108 spin_lock_irqsave(&ioc->lock, flags);
2109 weight_updated(iocg);
2110 spin_unlock_irqrestore(&ioc->lock, flags);
2111}
2112
2113static void ioc_pd_free(struct blkg_policy_data *pd)
2114{
2115 struct ioc_gq *iocg = pd_to_iocg(pd);
2116 struct ioc *ioc = iocg->ioc;
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002117 unsigned long flags;
Tejun Heo7caa4712019-08-28 15:05:58 -07002118
2119 if (ioc) {
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002120 spin_lock_irqsave(&ioc->lock, flags);
Tejun Heo7caa4712019-08-28 15:05:58 -07002121 if (!list_empty(&iocg->active_list)) {
Tejun Heo00410f12020-09-01 14:52:34 -04002122 propagate_weights(iocg, 0, 0);
Tejun Heo7caa4712019-08-28 15:05:58 -07002123 list_del_init(&iocg->active_list);
2124 }
Tejun Heo5aeac7c2020-09-01 14:52:31 -04002125 spin_unlock_irqrestore(&ioc->lock, flags);
Tejun Heoe036c4c2019-09-10 09:15:25 -07002126
2127 hrtimer_cancel(&iocg->waitq_timer);
2128 hrtimer_cancel(&iocg->delay_timer);
Tejun Heo7caa4712019-08-28 15:05:58 -07002129 }
2130 kfree(iocg);
2131}
2132
2133static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2134 int off)
2135{
2136 const char *dname = blkg_dev_name(pd->blkg);
2137 struct ioc_gq *iocg = pd_to_iocg(pd);
2138
2139 if (dname && iocg->cfg_weight)
Tejun Heobd0adb92020-09-01 14:52:39 -04002140 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002141 return 0;
2142}
2143
2144
2145static int ioc_weight_show(struct seq_file *sf, void *v)
2146{
2147 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2148 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2149
Tejun Heobd0adb92020-09-01 14:52:39 -04002150 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
Tejun Heo7caa4712019-08-28 15:05:58 -07002151 blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2152 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2153 return 0;
2154}
2155
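/*
 * io.weight accepts either a cgroup-wide default ("default $WEIGHT" or just
 * "$WEIGHT") or a per-device override ("$MAJ:$MIN $WEIGHT", or
 * "$MAJ:$MIN default" to drop the override).  A hypothetical example - the
 * device numbers are illustrative only:
 *
 *	# echo "default 100" > io.weight
 *	# echo "8:16 200" > io.weight
 */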
2156static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2157 size_t nbytes, loff_t off)
2158{
2159 struct blkcg *blkcg = css_to_blkcg(of_css(of));
2160 struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2161 struct blkg_conf_ctx ctx;
2162 struct ioc_gq *iocg;
2163 u32 v;
2164 int ret;
2165
2166 if (!strchr(buf, ':')) {
2167 struct blkcg_gq *blkg;
2168
2169 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2170 return -EINVAL;
2171
2172 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2173 return -EINVAL;
2174
2175 spin_lock(&blkcg->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002176 iocc->dfl_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002177 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2178 struct ioc_gq *iocg = blkg_to_iocg(blkg);
2179
2180 if (iocg) {
2181 spin_lock_irq(&iocg->ioc->lock);
2182 weight_updated(iocg);
2183 spin_unlock_irq(&iocg->ioc->lock);
2184 }
2185 }
2186 spin_unlock(&blkcg->lock);
2187
2188 return nbytes;
2189 }
2190
2191 ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2192 if (ret)
2193 return ret;
2194
2195 iocg = blkg_to_iocg(ctx.blkg);
2196
2197 if (!strncmp(ctx.body, "default", 7)) {
2198 v = 0;
2199 } else {
2200 if (!sscanf(ctx.body, "%u", &v))
2201 goto einval;
2202 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2203 goto einval;
2204 }
2205
Dan Carpenter41591a52019-10-31 13:53:41 +03002206 spin_lock(&iocg->ioc->lock);
Tejun Heobd0adb92020-09-01 14:52:39 -04002207 iocg->cfg_weight = v * WEIGHT_ONE;
Tejun Heo7caa4712019-08-28 15:05:58 -07002208 weight_updated(iocg);
Dan Carpenter41591a52019-10-31 13:53:41 +03002209 spin_unlock(&iocg->ioc->lock);
Tejun Heo7caa4712019-08-28 15:05:58 -07002210
2211 blkg_conf_finish(&ctx);
2212 return nbytes;
2213
2214einval:
2215 blkg_conf_finish(&ctx);
2216 return -EINVAL;
2217}
2218
2219static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2220 int off)
2221{
2222 const char *dname = blkg_dev_name(pd->blkg);
2223 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2224
2225 if (!dname)
2226 return 0;
2227
2228 seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2229 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2230 ioc->params.qos[QOS_RPPM] / 10000,
2231 ioc->params.qos[QOS_RPPM] % 10000 / 100,
2232 ioc->params.qos[QOS_RLAT],
2233 ioc->params.qos[QOS_WPPM] / 10000,
2234 ioc->params.qos[QOS_WPPM] % 10000 / 100,
2235 ioc->params.qos[QOS_WLAT],
2236 ioc->params.qos[QOS_MIN] / 10000,
2237 ioc->params.qos[QOS_MIN] % 10000 / 100,
2238 ioc->params.qos[QOS_MAX] / 10000,
2239 ioc->params.qos[QOS_MAX] % 10000 / 100);
2240 return 0;
2241}
2242
2243static int ioc_qos_show(struct seq_file *sf, void *v)
2244{
2245 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2246
2247 blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2248 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2249 return 0;
2250}
2251
2252static const match_table_t qos_ctrl_tokens = {
2253 { QOS_ENABLE, "enable=%u" },
2254 { QOS_CTRL, "ctrl=%s" },
2255 { NR_QOS_CTRL_PARAMS, NULL },
2256};
2257
2258static const match_table_t qos_tokens = {
2259 { QOS_RPPM, "rpct=%s" },
2260 { QOS_RLAT, "rlat=%u" },
2261 { QOS_WPPM, "wpct=%s" },
2262 { QOS_WLAT, "wlat=%u" },
2263 { QOS_MIN, "min=%s" },
2264 { QOS_MAX, "max=%s" },
2265 { NR_QOS_PARAMS, NULL },
2266};
2267
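/*
 * io.cost.qos takes a device followed by any of the key=value pairs from
 * the two token tables above.  A hypothetical example enabling the
 * controller with user-set latency targets (device numbers and values are
 * illustrative only):
 *
 *	# echo "8:16 enable=1 rpct=95.00 rlat=10000 wlat=20000" > io.cost.qos
 */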
2268static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2269 size_t nbytes, loff_t off)
2270{
2271 struct gendisk *disk;
2272 struct ioc *ioc;
2273 u32 qos[NR_QOS_PARAMS];
2274 bool enable, user;
2275 char *p;
2276 int ret;
2277
2278 disk = blkcg_conf_get_disk(&input);
2279 if (IS_ERR(disk))
2280 return PTR_ERR(disk);
2281
2282 ioc = q_to_ioc(disk->queue);
2283 if (!ioc) {
2284 ret = blk_iocost_init(disk->queue);
2285 if (ret)
2286 goto err;
2287 ioc = q_to_ioc(disk->queue);
2288 }
2289
2290 spin_lock_irq(&ioc->lock);
2291 memcpy(qos, ioc->params.qos, sizeof(qos));
2292 enable = ioc->enabled;
2293 user = ioc->user_qos_params;
2294 spin_unlock_irq(&ioc->lock);
2295
2296 while ((p = strsep(&input, " \t\n"))) {
2297 substring_t args[MAX_OPT_ARGS];
2298 char buf[32];
2299 int tok;
2300 s64 v;
2301
2302 if (!*p)
2303 continue;
2304
2305 switch (match_token(p, qos_ctrl_tokens, args)) {
2306 case QOS_ENABLE:
2307 match_u64(&args[0], &v);
2308 enable = v;
2309 continue;
2310 case QOS_CTRL:
2311 match_strlcpy(buf, &args[0], sizeof(buf));
2312 if (!strcmp(buf, "auto"))
2313 user = false;
2314 else if (!strcmp(buf, "user"))
2315 user = true;
2316 else
2317 goto einval;
2318 continue;
2319 }
2320
2321 tok = match_token(p, qos_tokens, args);
2322 switch (tok) {
2323 case QOS_RPPM:
2324 case QOS_WPPM:
2325 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2326 sizeof(buf))
2327 goto einval;
2328 if (cgroup_parse_float(buf, 2, &v))
2329 goto einval;
2330 if (v < 0 || v > 10000)
2331 goto einval;
2332 qos[tok] = v * 100;
2333 break;
2334 case QOS_RLAT:
2335 case QOS_WLAT:
2336 if (match_u64(&args[0], &v))
2337 goto einval;
2338 qos[tok] = v;
2339 break;
2340 case QOS_MIN:
2341 case QOS_MAX:
2342 if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2343 sizeof(buf))
2344 goto einval;
2345 if (cgroup_parse_float(buf, 2, &v))
2346 goto einval;
2347 if (v < 0)
2348 goto einval;
2349 qos[tok] = clamp_t(s64, v * 100,
2350 VRATE_MIN_PPM, VRATE_MAX_PPM);
2351 break;
2352 default:
2353 goto einval;
2354 }
2355 user = true;
2356 }
2357
2358 if (qos[QOS_MIN] > qos[QOS_MAX])
2359 goto einval;
2360
2361 spin_lock_irq(&ioc->lock);
2362
2363 if (enable) {
Tejun Heocd006502020-04-13 12:27:56 -04002364 blk_stat_enable_accounting(ioc->rqos.q);
Tejun Heo7caa4712019-08-28 15:05:58 -07002365 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2366 ioc->enabled = true;
2367 } else {
2368 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2369 ioc->enabled = false;
2370 }
2371
2372 if (user) {
2373 memcpy(ioc->params.qos, qos, sizeof(qos));
2374 ioc->user_qos_params = true;
2375 } else {
2376 ioc->user_qos_params = false;
2377 }
2378
2379 ioc_refresh_params(ioc, true);
2380 spin_unlock_irq(&ioc->lock);
2381
2382 put_disk_and_module(disk);
2383 return nbytes;
2384einval:
2385 ret = -EINVAL;
2386err:
2387 put_disk_and_module(disk);
2388 return ret;
2389}
2390
2391static u64 ioc_cost_model_prfill(struct seq_file *sf,
2392 struct blkg_policy_data *pd, int off)
2393{
2394 const char *dname = blkg_dev_name(pd->blkg);
2395 struct ioc *ioc = pd_to_iocg(pd)->ioc;
2396 u64 *u = ioc->params.i_lcoefs;
2397
2398 if (!dname)
2399 return 0;
2400
2401 seq_printf(sf, "%s ctrl=%s model=linear "
2402 "rbps=%llu rseqiops=%llu rrandiops=%llu "
2403 "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2404 dname, ioc->user_cost_model ? "user" : "auto",
2405 u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2406 u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2407 return 0;
2408}
2409
2410static int ioc_cost_model_show(struct seq_file *sf, void *v)
2411{
2412 struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2413
2414 blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2415 &blkcg_policy_iocost, seq_cft(sf)->private, false);
2416 return 0;
2417}
2418
2419static const match_table_t cost_ctrl_tokens = {
2420 { COST_CTRL, "ctrl=%s" },
2421 { COST_MODEL, "model=%s" },
2422 { NR_COST_CTRL_PARAMS, NULL },
2423};
2424
2425static const match_table_t i_lcoef_tokens = {
2426 { I_LCOEF_RBPS, "rbps=%u" },
2427 { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
2428 { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
2429 { I_LCOEF_WBPS, "wbps=%u" },
2430 { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
2431 { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
2432 { NR_I_LCOEFS, NULL },
2433};
2434
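/*
 * io.cost.model takes a device followed by ctrl=, model= and the linear
 * coefficients from the token table above.  A hypothetical example (device
 * number and coefficients are illustrative only; the w* keys take the same
 * form):
 *
 *	# echo "8:16 rbps=200000000 rseqiops=50000 rrandiops=10000" \
 *		> io.cost.model
 */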
2435static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2436 size_t nbytes, loff_t off)
2437{
2438 struct gendisk *disk;
2439 struct ioc *ioc;
2440 u64 u[NR_I_LCOEFS];
2441 bool user;
2442 char *p;
2443 int ret;
2444
2445 disk = blkcg_conf_get_disk(&input);
2446 if (IS_ERR(disk))
2447 return PTR_ERR(disk);
2448
2449 ioc = q_to_ioc(disk->queue);
2450 if (!ioc) {
2451 ret = blk_iocost_init(disk->queue);
2452 if (ret)
2453 goto err;
2454 ioc = q_to_ioc(disk->queue);
2455 }
2456
2457 spin_lock_irq(&ioc->lock);
2458 memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2459 user = ioc->user_cost_model;
2460 spin_unlock_irq(&ioc->lock);
2461
2462 while ((p = strsep(&input, " \t\n"))) {
2463 substring_t args[MAX_OPT_ARGS];
2464 char buf[32];
2465 int tok;
2466 u64 v;
2467
2468 if (!*p)
2469 continue;
2470
2471 switch (match_token(p, cost_ctrl_tokens, args)) {
2472 case COST_CTRL:
2473 match_strlcpy(buf, &args[0], sizeof(buf));
2474 if (!strcmp(buf, "auto"))
2475 user = false;
2476 else if (!strcmp(buf, "user"))
2477 user = true;
2478 else
2479 goto einval;
2480 continue;
2481 case COST_MODEL:
2482 match_strlcpy(buf, &args[0], sizeof(buf));
2483 if (strcmp(buf, "linear"))
2484 goto einval;
2485 continue;
2486 }
2487
2488 tok = match_token(p, i_lcoef_tokens, args);
2489 if (tok == NR_I_LCOEFS)
2490 goto einval;
2491 if (match_u64(&args[0], &v))
2492 goto einval;
2493 u[tok] = v;
2494 user = true;
2495 }
2496
2497 spin_lock_irq(&ioc->lock);
2498 if (user) {
2499 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2500 ioc->user_cost_model = true;
2501 } else {
2502 ioc->user_cost_model = false;
2503 }
2504 ioc_refresh_params(ioc, true);
2505 spin_unlock_irq(&ioc->lock);
2506
2507 put_disk_and_module(disk);
2508 return nbytes;
2509
2510einval:
2511 ret = -EINVAL;
2512err:
2513 put_disk_and_module(disk);
2514 return ret;
2515}
2516
2517static struct cftype ioc_files[] = {
2518 {
2519 .name = "weight",
2520 .flags = CFTYPE_NOT_ON_ROOT,
2521 .seq_show = ioc_weight_show,
2522 .write = ioc_weight_write,
2523 },
2524 {
2525 .name = "cost.qos",
2526 .flags = CFTYPE_ONLY_ON_ROOT,
2527 .seq_show = ioc_qos_show,
2528 .write = ioc_qos_write,
2529 },
2530 {
2531 .name = "cost.model",
2532 .flags = CFTYPE_ONLY_ON_ROOT,
2533 .seq_show = ioc_cost_model_show,
2534 .write = ioc_cost_model_write,
2535 },
2536 {}
2537};
2538
2539static struct blkcg_policy blkcg_policy_iocost = {
2540 .dfl_cftypes = ioc_files,
2541 .cpd_alloc_fn = ioc_cpd_alloc,
2542 .cpd_free_fn = ioc_cpd_free,
2543 .pd_alloc_fn = ioc_pd_alloc,
2544 .pd_init_fn = ioc_pd_init,
2545 .pd_free_fn = ioc_pd_free,
2546};
2547
2548static int __init ioc_init(void)
2549{
2550 return blkcg_policy_register(&blkcg_policy_iocost);
2551}
2552
2553static void __exit ioc_exit(void)
2554{
2555 return blkcg_policy_unregister(&blkcg_policy_iocost);
2556}
2557
2558module_init(ioc_init);
2559module_exit(ioc_exit);