blk-mq: move srcu from blk_mq_hw_ctx to request_queue
In case of BLK_MQ_F_BLOCKING, a per-hctx srcu instance is used to protect
the dispatch critical area. However, this srcu instance sits at the end of
hctx, so it usually occupies a standalone cacheline, and often a cold one.
Inside srcu_read_lock() and srcu_read_unlock(), the write always goes to
an indirect percpu variable that is allocated from the heap rather than
embedded in the srcu structure, and srcu->srcu_idx is only read in
srcu_read_lock(). So it doesn't matter whether the srcu structure lives in
the hctx or in the request queue.
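For illustration only, a dispatch critical section protected by the
per-queue srcu could be bracketed as in the sketch below. Here
blk_queue_has_srcu() is assumed to be a helper testing QUEUE_FLAG_HAS_SRCU
(set in the hunk below), and do_dispatch() merely stands in for the real
dispatch work:

	/* Sketch: protect dispatch with q->srcu for blocking queues */
	static void dispatch_with_queue_srcu(struct request_queue *q)
	{
		if (blk_queue_has_srcu(q)) {
			/* BLK_MQ_F_BLOCKING: dispatch may sleep */
			int srcu_idx = srcu_read_lock(q->srcu);

			do_dispatch(q);
			srcu_read_unlock(q->srcu, srcu_idx);
		} else {
			rcu_read_lock();
			do_dispatch(q);
			rcu_read_unlock();
		}
	}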
So switch to a per-request-queue srcu for protecting dispatch. This
simplifies quiesce a lot, not to mention that quiesce is always done
queue-wide anyway.
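As a sketch of the simplification, a queue-wide quiesce wait such as
blk_mq_wait_quiesce_done() now needs a single grace-period wait instead of
iterating over every hctx (blk_queue_has_srcu() assumed as above):

	/* Sketch: one queue-wide grace period replaces the per-hctx loop */
	void blk_mq_wait_quiesce_done(struct request_queue *q)
	{
		if (blk_queue_has_srcu(q))
			synchronize_srcu(q->srcu);
		else
			synchronize_rcu();
	}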
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20211203131534.3668411-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/block/blk-core.c b/block/blk-core.c
index b0660c9..10619fd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
/*
* Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep,
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
+ if (alloc_srcu) {
+ blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+ if (init_srcu_struct(q->srcu) != 0)
+ goto fail_q;
+ }
+
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
- goto fail_q;
+ goto fail_srcu;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
@@ -508,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id)
bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+ if (alloc_srcu)
+ cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_requestq_cachep, q);
+ kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
return NULL;
}
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
+ BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+ __alignof__(struct request_queue)) !=
+ sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+ sizeof(struct request_queue) +
+ sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
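The BUILD_BUG_ON added to blk_dev_init() only holds if srcu is the last
member of struct request_queue, declared as a flexible array so that
non-blocking queues pay no space for it. Below is a sketch of the assumed
struct tail and of blk_get_queue_kmem_cache(), which the hunks above call
but do not show:

	/* Sketch: tail of struct request_queue (include/linux/blkdev.h) */
	struct request_queue {
		/* ... all other members ... */

		/*
		 * Allocated only when blk_alloc_queue() is called with
		 * alloc_srcu, from the larger request_queue_srcu cache.
		 * Must stay the last member (see the BUILD_BUG_ON).
		 */
		struct srcu_struct	srcu[];
	};

	/* Sketch: pick the cache matching the alloc_srcu choice */
	struct kmem_cache *blk_get_queue_kmem_cache(bool alloc_srcu)
	{
		if (alloc_srcu)
			return blk_requestq_srcu_cachep;
		return blk_requestq_cachep;
	}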