blk-mq: rework flush sequencing logic

Witch to using a preallocated flush_rq for blk-mq similar to what's done
with the old request path.  This allows us to set up the request properly
with a tag from the actually allowed range and ->rq_disk as needed by
some drivers.  To make life easier we also switch to dynamic allocation
of ->flush_rq for the old path.

This effectively reverts most of

    "blk-mq: fix for flush deadlock"

and

    "blk-mq: Don't reserve a tag for flush request"

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9143e85..66e2b69 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -130,20 +130,26 @@
 	blk_clear_rq_complete(rq);
 }
 
-static void mq_flush_data_run(struct work_struct *work)
+static void mq_flush_run(struct work_struct *work)
 {
 	struct request *rq;
 
-	rq = container_of(work, struct request, mq_flush_data);
+	rq = container_of(work, struct request, mq_flush_work);
 
 	memset(&rq->csd, 0, sizeof(rq->csd));
 	blk_mq_run_request(rq, true, false);
 }
 
-static void blk_mq_flush_data_insert(struct request *rq)
+static bool blk_flush_queue_rq(struct request *rq)
 {
-	INIT_WORK(&rq->mq_flush_data, mq_flush_data_run);
-	kblockd_schedule_work(rq->q, &rq->mq_flush_data);
+	if (rq->q->mq_ops) {
+		INIT_WORK(&rq->mq_flush_work, mq_flush_run);
+		kblockd_schedule_work(rq->q, &rq->mq_flush_work);
+		return false;
+	} else {
+		list_add_tail(&rq->queuelist, &rq->q->queue_head);
+		return true;
+	}
 }
 
 /**
@@ -187,12 +193,7 @@
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &q->flush_data_in_flight);
-		if (q->mq_ops)
-			blk_mq_flush_data_insert(rq);
-		else {
-			list_add(&rq->queuelist, &q->queue_head);
-			queued = true;
-		}
+		queued = blk_flush_queue_rq(rq);
 		break;
 
 	case REQ_FSEQ_DONE:
@@ -216,9 +217,6 @@
 	}
 
 	kicked = blk_kick_flush(q);
-	/* blk_mq_run_flush will run queue */
-	if (q->mq_ops)
-		return queued;
 	return kicked | queued;
 }
 
@@ -230,10 +228,9 @@
 	struct request *rq, *n;
 	unsigned long flags = 0;
 
-	if (q->mq_ops) {
-		blk_mq_free_request(flush_rq);
+	if (q->mq_ops)
 		spin_lock_irqsave(&q->mq_flush_lock, flags);
-	}
+
 	running = &q->flush_queue[q->flush_running_idx];
 	BUG_ON(q->flush_pending_idx == q->flush_running_idx);
 
@@ -263,48 +260,14 @@
 	 * kblockd.
 	 */
 	if (queued || q->flush_queue_delayed) {
-		if (!q->mq_ops)
-			blk_run_queue_async(q);
-		else
-		/*
-		 * This can be optimized to only run queues with requests
-		 * queued if necessary.
-		 */
-			blk_mq_run_queues(q, true);
+		WARN_ON(q->mq_ops);
+		blk_run_queue_async(q);
 	}
 	q->flush_queue_delayed = 0;
 	if (q->mq_ops)
 		spin_unlock_irqrestore(&q->mq_flush_lock, flags);
 }
 
-static void mq_flush_work(struct work_struct *work)
-{
-	struct request_queue *q;
-	struct request *rq;
-
-	q = container_of(work, struct request_queue, mq_flush_work);
-
-	rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
-		__GFP_WAIT|GFP_ATOMIC, false);
-	rq->cmd_type = REQ_TYPE_FS;
-	rq->end_io = flush_end_io;
-
-	blk_mq_run_request(rq, true, false);
-}
-
-/*
- * We can't directly use q->flush_rq, because it doesn't have tag and is not in
- * hctx->rqs[]. so we must allocate a new request, since we can't sleep here,
- * so offload the work to workqueue.
- *
- * Note: we assume a flush request finished in any hardware queue will flush
- * the whole disk cache.
- */
-static void mq_run_flush(struct request_queue *q)
-{
-	kblockd_schedule_work(q, &q->mq_flush_work);
-}
-
 /**
  * blk_kick_flush - consider issuing flush request
  * @q: request_queue being kicked
@@ -339,19 +302,31 @@
 	 * different from running_idx, which means flush is in flight.
 	 */
 	q->flush_pending_idx ^= 1;
+
 	if (q->mq_ops) {
-		mq_run_flush(q);
-		return true;
+		struct blk_mq_ctx *ctx = first_rq->mq_ctx;
+		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
+
+		blk_mq_rq_init(hctx, q->flush_rq);
+		q->flush_rq->mq_ctx = ctx;
+
+		/*
+		 * Reuse the tag value from the fist waiting request,
+		 * with blk-mq the tag is generated during request
+		 * allocation and drivers can rely on it being inside
+		 * the range they asked for.
+		 */
+		q->flush_rq->tag = first_rq->tag;
+	} else {
+		blk_rq_init(q, q->flush_rq);
 	}
 
-	blk_rq_init(q, &q->flush_rq);
-	q->flush_rq.cmd_type = REQ_TYPE_FS;
-	q->flush_rq.cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
-	q->flush_rq.rq_disk = first_rq->rq_disk;
-	q->flush_rq.end_io = flush_end_io;
+	q->flush_rq->cmd_type = REQ_TYPE_FS;
+	q->flush_rq->cmd_flags = WRITE_FLUSH | REQ_FLUSH_SEQ;
+	q->flush_rq->rq_disk = first_rq->rq_disk;
+	q->flush_rq->end_io = flush_end_io;
 
-	list_add_tail(&q->flush_rq.queuelist, &q->queue_head);
-	return true;
+	return blk_flush_queue_rq(q->flush_rq);
 }
 
 static void flush_data_end_io(struct request *rq, int error)
@@ -407,11 +382,8 @@
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_FLUSH and FUA for the driver.
-	 * We keep REQ_FLUSH for mq to track flush requests. For !FUA,
-	 * we never dispatch the request directly.
 	 */
-	if (rq->cmd_flags & REQ_FUA)
-		rq->cmd_flags &= ~REQ_FLUSH;
+	rq->cmd_flags &= ~REQ_FLUSH;
 	if (!(fflags & REQ_FUA))
 		rq->cmd_flags &= ~REQ_FUA;
 
@@ -560,5 +532,4 @@
 void blk_mq_init_flush(struct request_queue *q)
 {
 	spin_lock_init(&q->mq_flush_lock);
-	INIT_WORK(&q->mq_flush_work, mq_flush_work);
 }