| /* |
| * The Kyber I/O scheduler. Controls latency by throttling queue depths using |
| * scalable techniques. |
| * |
| * Copyright (C) 2017 Facebook |
| * |
| * This program is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU General Public |
| * License v2 as published by the Free Software Foundation. |
| * |
| * This program is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * General Public License for more details. |
| * |
| * You should have received a copy of the GNU General Public License |
| * along with this program. If not, see <https://www.gnu.org/licenses/>. |
| */ |
| |
| #include <linux/kernel.h> |
| #include <linux/blkdev.h> |
| #include <linux/blk-mq.h> |
| #include <linux/elevator.h> |
| #include <linux/module.h> |
| #include <linux/sbitmap.h> |
| |
| #include "blk.h" |
| #include "blk-mq.h" |
| #include "blk-mq-sched.h" |
| #include "blk-mq-tag.h" |
| #include "blk-stat.h" |
| |
| /* Scheduling domains. */ |
| enum { |
| KYBER_READ, |
| KYBER_SYNC_WRITE, |
| KYBER_OTHER, /* Async writes, discard, etc. */ |
| KYBER_NUM_DOMAINS, |
| }; |
| |
| enum { |
| KYBER_MIN_DEPTH = 256, |
| |
| /* |
| * In order to prevent starvation of synchronous requests by a flood of |
| * asynchronous requests, we reserve 25% of requests for synchronous |
| * operations. |
| */ |
| KYBER_ASYNC_PERCENT = 75, |
| }; |
| |
| /* |
| * Initial device-wide depths for each scheduling domain. |
| * |
| * Even for fast devices with lots of tags like NVMe, you can saturate |
| * the device with only a fraction of the maximum possible queue depth. |
| * So, we cap these to a reasonable value. |
| */ |
| static const unsigned int kyber_depth[] = { |
| [KYBER_READ] = 256, |
| [KYBER_SYNC_WRITE] = 128, |
| [KYBER_OTHER] = 64, |
| }; |
| |
| /* |
| * Scheduling domain batch sizes. We favor reads. |
| */ |
| static const unsigned int kyber_batch_size[] = { |
| [KYBER_READ] = 16, |
| [KYBER_SYNC_WRITE] = 8, |
| [KYBER_OTHER] = 8, |
| }; |
| |
| struct kyber_queue_data { |
| struct request_queue *q; |
| |
| struct blk_stat_callback *cb; |
| |
| /* |
| * The device is divided into multiple scheduling domains based on the |
| * request type. Each domain has a fixed number of in-flight requests of |
| * that type device-wide, limited by these tokens. |
| */ |
| struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; |
| |
| /* |
| * Async request percentage, converted to per-word depth for |
| * sbitmap_get_shallow(). |
| */ |
| unsigned int async_depth; |
| |
| /* Target latencies in nanoseconds. */ |
| u64 read_lat_nsec, write_lat_nsec; |
| }; |
| |
| struct kyber_hctx_data { |
| spinlock_t lock; |
| struct list_head rqs[KYBER_NUM_DOMAINS]; |
| unsigned int cur_domain; |
| unsigned int batching; |
| wait_queue_t domain_wait[KYBER_NUM_DOMAINS]; |
| atomic_t wait_index[KYBER_NUM_DOMAINS]; |
| }; |
| |
| static unsigned int rq_sched_domain(const struct request *rq) |
| { |
| unsigned int op = rq->cmd_flags; |
| |
| if ((op & REQ_OP_MASK) == REQ_OP_READ) |
| return KYBER_READ; |
| else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op)) |
| return KYBER_SYNC_WRITE; |
| else |
| return KYBER_OTHER; |
| } |
| |
| enum { |
| NONE = 0, |
| GOOD = 1, |
| GREAT = 2, |
| BAD = -1, |
| AWFUL = -2, |
| }; |
| |
| #define IS_GOOD(status) ((status) > 0) |
| #define IS_BAD(status) ((status) < 0) |
| |
| static int kyber_lat_status(struct blk_stat_callback *cb, |
| unsigned int sched_domain, u64 target) |
| { |
| u64 latency; |
| |
| if (!cb->stat[sched_domain].nr_samples) |
| return NONE; |
| |
| latency = cb->stat[sched_domain].mean; |
| if (latency >= 2 * target) |
| return AWFUL; |
| else if (latency > target) |
| return BAD; |
| else if (latency <= target / 2) |
| return GREAT; |
| else /* (latency <= target) */ |
| return GOOD; |
| } |
| |
| /* |
| * Adjust the read or synchronous write depth given the status of reads and |
| * writes. The goal is that the latencies of the two domains are fair (i.e., if |
| * one is good, then the other is good). |
| */ |
| static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd, |
| unsigned int sched_domain, int this_status, |
| int other_status) |
| { |
| unsigned int orig_depth, depth; |
| |
| /* |
| * If this domain had no samples, or reads and writes are both good or |
| * both bad, don't adjust the depth. |
| */ |
| if (this_status == NONE || |
| (IS_GOOD(this_status) && IS_GOOD(other_status)) || |
| (IS_BAD(this_status) && IS_BAD(other_status))) |
| return; |
| |
| orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth; |
| |
| if (other_status == NONE) { |
| depth++; |
| } else { |
| switch (this_status) { |
| case GOOD: |
| if (other_status == AWFUL) |
| depth -= max(depth / 4, 1U); |
| else |
| depth -= max(depth / 8, 1U); |
| break; |
| case GREAT: |
| if (other_status == AWFUL) |
| depth /= 2; |
| else |
| depth -= max(depth / 4, 1U); |
| break; |
| case BAD: |
| depth++; |
| break; |
| case AWFUL: |
| if (other_status == GREAT) |
| depth += 2; |
| else |
| depth++; |
| break; |
| } |
| } |
| |
| depth = clamp(depth, 1U, kyber_depth[sched_domain]); |
| if (depth != orig_depth) |
| sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); |
| } |
| |
| /* |
| * Adjust the depth of other requests given the status of reads and synchronous |
| * writes. As long as either domain is doing fine, we don't throttle, but if |
| * both domains are doing badly, we throttle heavily. |
| */ |
| static void kyber_adjust_other_depth(struct kyber_queue_data *kqd, |
| int read_status, int write_status, |
| bool have_samples) |
| { |
| unsigned int orig_depth, depth; |
| int status; |
| |
| orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth; |
| |
| if (read_status == NONE && write_status == NONE) { |
| depth += 2; |
| } else if (have_samples) { |
| if (read_status == NONE) |
| status = write_status; |
| else if (write_status == NONE) |
| status = read_status; |
| else |
| status = max(read_status, write_status); |
| switch (status) { |
| case GREAT: |
| depth += 2; |
| break; |
| case GOOD: |
| depth++; |
| break; |
| case BAD: |
| depth -= max(depth / 4, 1U); |
| break; |
| case AWFUL: |
| depth /= 2; |
| break; |
| } |
| } |
| |
| depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]); |
| if (depth != orig_depth) |
| sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth); |
| } |
| |
| /* |
| * Apply heuristics for limiting queue depths based on gathered latency |
| * statistics. |
| */ |
| static void kyber_stat_timer_fn(struct blk_stat_callback *cb) |
| { |
| struct kyber_queue_data *kqd = cb->data; |
| int read_status, write_status; |
| |
| read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec); |
| write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec); |
| |
| kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status); |
| kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status); |
| kyber_adjust_other_depth(kqd, read_status, write_status, |
| cb->stat[KYBER_OTHER].nr_samples != 0); |
| |
| /* |
| * Continue monitoring latencies if we aren't hitting the targets or |
| * we're still throttling other requests. |
| */ |
| if (!blk_stat_is_active(kqd->cb) && |
| ((IS_BAD(read_status) || IS_BAD(write_status) || |
| kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER]))) |
| blk_stat_activate_msecs(kqd->cb, 100); |
| } |
| |
| static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd) |
| { |
| /* |
| * All of the hardware queues have the same depth, so we can just grab |
| * the shift of the first one. |
| */ |
| return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; |
| } |
| |
| static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) |
| { |
| struct kyber_queue_data *kqd; |
| unsigned int max_tokens; |
| unsigned int shift; |
| int ret = -ENOMEM; |
| int i; |
| |
| kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); |
| if (!kqd) |
| goto err; |
| kqd->q = q; |
| |
| kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain, |
| KYBER_NUM_DOMAINS, kqd); |
| if (!kqd->cb) |
| goto err_kqd; |
| |
| /* |
| * The maximum number of tokens for any scheduling domain is at least |
| * the queue depth of a single hardware queue. If the hardware doesn't |
| * have many tags, still provide a reasonable number. |
| */ |
| max_tokens = max_t(unsigned int, q->tag_set->queue_depth, |
| KYBER_MIN_DEPTH); |
| for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
| WARN_ON(!kyber_depth[i]); |
| WARN_ON(!kyber_batch_size[i]); |
| ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], |
| max_tokens, -1, false, GFP_KERNEL, |
| q->node); |
| if (ret) { |
| while (--i >= 0) |
| sbitmap_queue_free(&kqd->domain_tokens[i]); |
| goto err_cb; |
| } |
| sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]); |
| } |
| |
| shift = kyber_sched_tags_shift(kqd); |
| kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; |
| |
| kqd->read_lat_nsec = 2000000ULL; |
| kqd->write_lat_nsec = 10000000ULL; |
| |
| return kqd; |
| |
| err_cb: |
| blk_stat_free_callback(kqd->cb); |
| err_kqd: |
| kfree(kqd); |
| err: |
| return ERR_PTR(ret); |
| } |
| |
| static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) |
| { |
| struct kyber_queue_data *kqd; |
| struct elevator_queue *eq; |
| |
| eq = elevator_alloc(q, e); |
| if (!eq) |
| return -ENOMEM; |
| |
| kqd = kyber_queue_data_alloc(q); |
| if (IS_ERR(kqd)) { |
| kobject_put(&eq->kobj); |
| return PTR_ERR(kqd); |
| } |
| |
| eq->elevator_data = kqd; |
| q->elevator = eq; |
| |
| blk_stat_add_callback(q, kqd->cb); |
| |
| return 0; |
| } |
| |
| static void kyber_exit_sched(struct elevator_queue *e) |
| { |
| struct kyber_queue_data *kqd = e->elevator_data; |
| struct request_queue *q = kqd->q; |
| int i; |
| |
| blk_stat_remove_callback(q, kqd->cb); |
| |
| for (i = 0; i < KYBER_NUM_DOMAINS; i++) |
| sbitmap_queue_free(&kqd->domain_tokens[i]); |
| blk_stat_free_callback(kqd->cb); |
| kfree(kqd); |
| } |
| |
| static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) |
| { |
| struct kyber_hctx_data *khd; |
| int i; |
| |
| khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); |
| if (!khd) |
| return -ENOMEM; |
| |
| spin_lock_init(&khd->lock); |
| |
| for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
| INIT_LIST_HEAD(&khd->rqs[i]); |
| INIT_LIST_HEAD(&khd->domain_wait[i].task_list); |
| atomic_set(&khd->wait_index[i], 0); |
| } |
| |
| khd->cur_domain = 0; |
| khd->batching = 0; |
| |
| hctx->sched_data = khd; |
| |
| return 0; |
| } |
| |
| static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) |
| { |
| kfree(hctx->sched_data); |
| } |
| |
| static int rq_get_domain_token(struct request *rq) |
| { |
| return (long)rq->elv.priv[0]; |
| } |
| |
| static void rq_set_domain_token(struct request *rq, int token) |
| { |
| rq->elv.priv[0] = (void *)(long)token; |
| } |
| |
| static void rq_clear_domain_token(struct kyber_queue_data *kqd, |
| struct request *rq) |
| { |
| unsigned int sched_domain; |
| int nr; |
| |
| nr = rq_get_domain_token(rq); |
| if (nr != -1) { |
| sched_domain = rq_sched_domain(rq); |
| sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, |
| rq->mq_ctx->cpu); |
| } |
| } |
| |
| static struct request *kyber_get_request(struct request_queue *q, |
| unsigned int op, |
| struct blk_mq_alloc_data *data) |
| { |
| struct kyber_queue_data *kqd = q->elevator->elevator_data; |
| struct request *rq; |
| |
| /* |
| * We use the scheduler tags as per-hardware queue queueing tokens. |
| * Async requests can be limited at this stage. |
| */ |
| if (!op_is_sync(op)) |
| data->shallow_depth = kqd->async_depth; |
| |
| rq = __blk_mq_alloc_request(data, op); |
| if (rq) |
| rq_set_domain_token(rq, -1); |
| return rq; |
| } |
| |
| static void kyber_put_request(struct request *rq) |
| { |
| struct request_queue *q = rq->q; |
| struct kyber_queue_data *kqd = q->elevator->elevator_data; |
| |
| rq_clear_domain_token(kqd, rq); |
| blk_mq_finish_request(rq); |
| } |
| |
| static void kyber_completed_request(struct request *rq) |
| { |
| struct request_queue *q = rq->q; |
| struct kyber_queue_data *kqd = q->elevator->elevator_data; |
| unsigned int sched_domain; |
| u64 now, latency, target; |
| |
| /* |
| * Check if this request met our latency goal. If not, quickly gather |
| * some statistics and start throttling. |
| */ |
| sched_domain = rq_sched_domain(rq); |
| switch (sched_domain) { |
| case KYBER_READ: |
| target = kqd->read_lat_nsec; |
| break; |
| case KYBER_SYNC_WRITE: |
| target = kqd->write_lat_nsec; |
| break; |
| default: |
| return; |
| } |
| |
| /* If we are already monitoring latencies, don't check again. */ |
| if (blk_stat_is_active(kqd->cb)) |
| return; |
| |
| now = __blk_stat_time(ktime_to_ns(ktime_get())); |
| if (now < blk_stat_time(&rq->issue_stat)) |
| return; |
| |
| latency = now - blk_stat_time(&rq->issue_stat); |
| |
| if (latency > target) |
| blk_stat_activate_msecs(kqd->cb, 10); |
| } |
| |
| static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd, |
| struct blk_mq_hw_ctx *hctx) |
| { |
| LIST_HEAD(rq_list); |
| struct request *rq, *next; |
| |
| blk_mq_flush_busy_ctxs(hctx, &rq_list); |
| list_for_each_entry_safe(rq, next, &rq_list, queuelist) { |
| unsigned int sched_domain; |
| |
| sched_domain = rq_sched_domain(rq); |
| list_move_tail(&rq->queuelist, &khd->rqs[sched_domain]); |
| } |
| } |
| |
| static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags, |
| void *key) |
| { |
| struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); |
| |
| list_del_init(&wait->task_list); |
| blk_mq_run_hw_queue(hctx, true); |
| return 1; |
| } |
| |
| static int kyber_get_domain_token(struct kyber_queue_data *kqd, |
| struct kyber_hctx_data *khd, |
| struct blk_mq_hw_ctx *hctx) |
| { |
| unsigned int sched_domain = khd->cur_domain; |
| struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; |
| wait_queue_t *wait = &khd->domain_wait[sched_domain]; |
| struct sbq_wait_state *ws; |
| int nr; |
| |
| nr = __sbitmap_queue_get(domain_tokens); |
| if (nr >= 0) |
| return nr; |
| |
| /* |
| * If we failed to get a domain token, make sure the hardware queue is |
| * run when one becomes available. Note that this is serialized on |
| * khd->lock, but we still need to be careful about the waker. |
| */ |
| if (list_empty_careful(&wait->task_list)) { |
| init_waitqueue_func_entry(wait, kyber_domain_wake); |
| wait->private = hctx; |
| ws = sbq_wait_ptr(domain_tokens, |
| &khd->wait_index[sched_domain]); |
| add_wait_queue(&ws->wait, wait); |
| |
| /* |
| * Try again in case a token was freed before we got on the wait |
| * queue. |
| */ |
| nr = __sbitmap_queue_get(domain_tokens); |
| } |
| return nr; |
| } |
| |
| static struct request * |
| kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, |
| struct kyber_hctx_data *khd, |
| struct blk_mq_hw_ctx *hctx, |
| bool *flushed) |
| { |
| struct list_head *rqs; |
| struct request *rq; |
| int nr; |
| |
| rqs = &khd->rqs[khd->cur_domain]; |
| rq = list_first_entry_or_null(rqs, struct request, queuelist); |
| |
| /* |
| * If there wasn't already a pending request and we haven't flushed the |
| * software queues yet, flush the software queues and check again. |
| */ |
| if (!rq && !*flushed) { |
| kyber_flush_busy_ctxs(khd, hctx); |
| *flushed = true; |
| rq = list_first_entry_or_null(rqs, struct request, queuelist); |
| } |
| |
| if (rq) { |
| nr = kyber_get_domain_token(kqd, khd, hctx); |
| if (nr >= 0) { |
| khd->batching++; |
| rq_set_domain_token(rq, nr); |
| list_del_init(&rq->queuelist); |
| return rq; |
| } |
| } |
| |
| /* There were either no pending requests or no tokens. */ |
| return NULL; |
| } |
| |
| static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) |
| { |
| struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; |
| struct kyber_hctx_data *khd = hctx->sched_data; |
| bool flushed = false; |
| struct request *rq; |
| int i; |
| |
| spin_lock(&khd->lock); |
| |
| /* |
| * First, if we are still entitled to batch, try to dispatch a request |
| * from the batch. |
| */ |
| if (khd->batching < kyber_batch_size[khd->cur_domain]) { |
| rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); |
| if (rq) |
| goto out; |
| } |
| |
| /* |
| * Either, |
| * 1. We were no longer entitled to a batch. |
| * 2. The domain we were batching didn't have any requests. |
| * 3. The domain we were batching was out of tokens. |
| * |
| * Start another batch. Note that this wraps back around to the original |
| * domain if no other domains have requests or tokens. |
| */ |
| khd->batching = 0; |
| for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
| if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) |
| khd->cur_domain = 0; |
| else |
| khd->cur_domain++; |
| |
| rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed); |
| if (rq) |
| goto out; |
| } |
| |
| rq = NULL; |
| out: |
| spin_unlock(&khd->lock); |
| return rq; |
| } |
| |
| static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) |
| { |
| struct kyber_hctx_data *khd = hctx->sched_data; |
| int i; |
| |
| for (i = 0; i < KYBER_NUM_DOMAINS; i++) { |
| if (!list_empty_careful(&khd->rqs[i])) |
| return true; |
| } |
| return false; |
| } |
| |
| #define KYBER_LAT_SHOW_STORE(op) \ |
| static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \ |
| char *page) \ |
| { \ |
| struct kyber_queue_data *kqd = e->elevator_data; \ |
| \ |
| return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \ |
| } \ |
| \ |
| static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \ |
| const char *page, size_t count) \ |
| { \ |
| struct kyber_queue_data *kqd = e->elevator_data; \ |
| unsigned long long nsec; \ |
| int ret; \ |
| \ |
| ret = kstrtoull(page, 10, &nsec); \ |
| if (ret) \ |
| return ret; \ |
| \ |
| kqd->op##_lat_nsec = nsec; \ |
| \ |
| return count; \ |
| } |
| KYBER_LAT_SHOW_STORE(read); |
| KYBER_LAT_SHOW_STORE(write); |
| #undef KYBER_LAT_SHOW_STORE |
| |
| #define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) |
| static struct elv_fs_entry kyber_sched_attrs[] = { |
| KYBER_LAT_ATTR(read), |
| KYBER_LAT_ATTR(write), |
| __ATTR_NULL |
| }; |
| #undef KYBER_LAT_ATTR |
| |
| static struct elevator_type kyber_sched = { |
| .ops.mq = { |
| .init_sched = kyber_init_sched, |
| .exit_sched = kyber_exit_sched, |
| .init_hctx = kyber_init_hctx, |
| .exit_hctx = kyber_exit_hctx, |
| .get_request = kyber_get_request, |
| .put_request = kyber_put_request, |
| .completed_request = kyber_completed_request, |
| .dispatch_request = kyber_dispatch_request, |
| .has_work = kyber_has_work, |
| }, |
| .uses_mq = true, |
| .elevator_attrs = kyber_sched_attrs, |
| .elevator_name = "kyber", |
| .elevator_owner = THIS_MODULE, |
| }; |
| |
| static int __init kyber_init(void) |
| { |
| return elv_register(&kyber_sched); |
| } |
| |
| static void __exit kyber_exit(void) |
| { |
| elv_unregister(&kyber_sched); |
| } |
| |
| module_init(kyber_init); |
| module_exit(kyber_exit); |
| |
| MODULE_AUTHOR("Omar Sandoval"); |
| MODULE_LICENSE("GPL"); |
| MODULE_DESCRIPTION("Kyber I/O scheduler"); |