blk-mq: drain I/O when all CPUs in a hctx are offline
Most of blk-mq drivers depend on managed IRQ's auto-affinity to setup
up queue mapping. Thomas mentioned the following point[1]:
"That was the constraint of managed interrupts from the very beginning:
The driver/subsystem has to quiesce the interrupt line and the associated
queue _before_ it gets shutdown in CPU unplug and not fiddle with it
until it's restarted by the core when the CPU is plugged in again."
However, current blk-mq implementation doesn't quiesce hw queue before
the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is a
cpuhp state handled after the CPU is down, so there isn't any chance to
quiesce the hctx before shutting down the CPU.
Add new CPUHP_AP_BLK_MQ_ONLINE state to stop allocating from blk-mq hctxs
where the last CPU goes away, and wait for completion of in-flight
requests. This guarantees that there is no inflight I/O before shutting
down the managed IRQ.
Add a BLK_MQ_F_STACKING and set it for dm-rq and loop, so we don't need
to wait for completion of in-flight requests from these drivers to avoid
a potential dead-lock. It is safe to do this for stacking drivers as those
do not use interrupts at all and their I/O completions are triggered by
underlying devices I/O completion.
[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/
[hch: different retry mechanism, merged two patches, minor cleanups]
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Daniel Wagner <dwagner@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 560ef5d..9a36ac1c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -375,14 +375,30 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
e->type->ops.limit_depth(data->cmd_flags, data);
}
+retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
if (!(data->flags & BLK_MQ_REQ_INTERNAL))
blk_mq_tag_busy(data->hctx);
+ /*
+ * Waiting allocations only fail because of an inactive hctx. In that
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
+ * should have migrated us to an online CPU by now.
+ */
tag = blk_mq_get_tag(data);
- if (tag == BLK_MQ_NO_TAG)
- return NULL;
+ if (tag == BLK_MQ_NO_TAG) {
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return NULL;
+
+ /*
+ * Give up the CPU and sleep for a random short time to ensure
+ * that thread using a realtime scheduling class are migrated
+ * off the the CPU, and thus off the hctx that is going away.
+ */
+ msleep(3);
+ goto retry;
+ }
return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
}
@@ -2335,6 +2351,86 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
return -ENOMEM;
}
+struct rq_iter_data {
+ struct blk_mq_hw_ctx *hctx;
+ bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+ struct rq_iter_data *iter_data = data;
+
+ if (rq->mq_hctx != iter_data->hctx)
+ return true;
+ iter_data->has_rq = true;
+ return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_tags *tags = hctx->sched_tags ?
+ hctx->sched_tags : hctx->tags;
+ struct rq_iter_data data = {
+ .hctx = hctx,
+ };
+
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+ return false;
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+ return false;
+ return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ return 0;
+
+ /*
+ * Prevent new request from being allocated on the current hctx.
+ *
+ * The smp_mb__after_atomic() Pairs with the implied barrier in
+ * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is
+ * seen once we return from the tag allocator.
+ */
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ smp_mb__after_atomic();
+
+ /*
+ * Try to grab a reference to the queue and wait for any outstanding
+ * requests. If we could not grab a reference the queue has been
+ * frozen and there are no requests.
+ */
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+ while (blk_mq_hctx_has_requests(hctx))
+ msleep(5);
+ percpu_ref_put(&hctx->queue->q_usage_counter);
+ }
+
+ return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ return 0;
+}
+
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
@@ -2348,6 +2444,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0;
+
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
@@ -2371,6 +2470,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
}
@@ -2430,6 +2532,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
{
hctx->queue_num = hctx_idx;
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
@@ -3684,6 +3789,9 @@ static int __init blk_mq_init(void)
{
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+ blk_mq_hctx_notify_online,
+ blk_mq_hctx_notify_offline);
return 0;
}
subsys_initcall(blk_mq_init);