blk-mq: add blk_mq_free_hctx_request()

It's silly to use blk_mq_free_request() which in turn maps the
request to the hardware queue, for places where we already know
what the hardware queue is. This saves us an extra mapping of a
hardware queue on request completion, if the caller knows this
information already.

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fdf1215..4347aa2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -269,16 +269,23 @@
 	blk_mq_queue_exit(q);
 }
 
-void blk_mq_free_request(struct request *rq)
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
+
+	ctx->rq_completed[rq_is_sync(rq)]++;
+	__blk_mq_free_request(hctx, ctx, rq);
+
+}
+EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
+
+void blk_mq_free_request(struct request *rq)
+{
 	struct blk_mq_hw_ctx *hctx;
 	struct request_queue *q = rq->q;
 
-	ctx->rq_completed[rq_is_sync(rq)]++;
-
-	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-	__blk_mq_free_request(hctx, ctx, rq);
+	hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+	blk_mq_free_hctx_request(hctx, rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index c3b64ec..fb0a4fb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -169,6 +169,7 @@
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_run_queues(struct request_queue *q, bool async);
 void blk_mq_free_request(struct request *rq);
+void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);