Blame - block/blk-mq.c - SHIFTPHONES/mainline/linux

blob: 3b1c425a935ee76c72d1edb7f8782774b941d2d0 [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);
				26	static LIST_HEAD(all_q_list);
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
				29
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	30	static struct blk_mq_ctx __blk_mq_get_ctx(struct request_queue q,
				31	unsigned int cpu)
				32	{
				33	return per_cpu_ptr(q->queue_ctx, cpu);
				34	}
				35
				36	/*
				37	* This assumes per-cpu software queueing queues. They could be per-node
				38	* as well, for instance. For now this is hardcoded as-is. Note that we don't
				39	* care about preemption, since we know the ctx's are persistent. This does
				40	* mean that we can't rely on ctx always matching the currently running CPU.
				41	*/
				42	static struct blk_mq_ctx blk_mq_get_ctx(struct request_queue q)
				43	{
				44	return __blk_mq_get_ctx(q, get_cpu());
				45	}
				46
				47	static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				48	{
				49	put_cpu();
				50	}
				51
				52	/*
				53	* Check if any of the ctx's have pending work in this hardware queue
				54	*/
				55	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				56	{
				57	unsigned int i;
				58
				59	for (i = 0; i < hctx->nr_ctx_map; i++)
				60	if (hctx->ctx_map[i])
				61	return true;
				62
				63	return false;
				64	}
				65
				66	/*
				67	* Mark this ctx as having pending work in this hardware queue
				68	*/
				69	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				70	struct blk_mq_ctx *ctx)
				71	{
				72	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				73	set_bit(ctx->index_hw, hctx->ctx_map);
				74	}
				75
				76	static struct request blk_mq_alloc_rq(struct blk_mq_hw_ctx hctx, gfp_t gfp,
				77	bool reserved)
				78	{
				79	struct request *rq;
				80	unsigned int tag;
				81
				82	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				83	if (tag != BLK_MQ_TAG_FAIL) {
				84	rq = hctx->rqs[tag];
				85	rq->tag = tag;
				86
				87	return rq;
				88	}
				89
				90	return NULL;
				91	}
				92
				93	static int blk_mq_queue_enter(struct request_queue *q)
				94	{
				95	int ret;
				96
				97	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				98	smp_wmb();
				99	/* we have problems to freeze the queue if it's initializing */
				100	if (!blk_queue_bypass(q) \|\| !blk_queue_init_done(q))
				101	return 0;
				102
				103	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				104
				105	spin_lock_irq(q->queue_lock);
				106	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	107	!blk_queue_bypass(q) \|\| blk_queue_dying(q),
				108	*q->queue_lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	109	/* inc usage with lock hold to avoid freeze_queue runs here */
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	110	if (!ret && !blk_queue_dying(q))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	111	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	112	else if (blk_queue_dying(q))
				113	ret = -ENODEV;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	114	spin_unlock_irq(q->queue_lock);
				115
				116	return ret;
				117	}
				118
				119	static void blk_mq_queue_exit(struct request_queue *q)
				120	{
				121	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				122	}
				123
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	124	static void __blk_mq_drain_queue(struct request_queue *q)
				125	{
				126	while (true) {
				127	s64 count;
				128
				129	spin_lock_irq(q->queue_lock);
				130	count = percpu_counter_sum(&q->mq_usage_counter);
				131	spin_unlock_irq(q->queue_lock);
				132
				133	if (count == 0)
				134	break;
				135	blk_mq_run_queues(q, false);
				136	msleep(10);
				137	}
				138	}
				139
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	140	/*
				141	* Guarantee no request is in use, so we can change any data structure of
				142	* the queue afterward.
				143	*/
				144	static void blk_mq_freeze_queue(struct request_queue *q)
				145	{
				146	bool drain;
				147
				148	spin_lock_irq(q->queue_lock);
				149	drain = !q->bypass_depth++;
				150	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				151	spin_unlock_irq(q->queue_lock);
				152
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	153	if (drain)
				154	__blk_mq_drain_queue(q);
				155	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	156
/*
 * Non-static entry point for draining @q; simply forwards to
 * __blk_mq_drain_queue() without touching the bypass state.
 */
void blk_mq_drain_queue(struct request_queue *q)
{
	__blk_mq_drain_queue(q);
}
				161
				162	static void blk_mq_unfreeze_queue(struct request_queue *q)
				163	{
				164	bool wake = false;
				165
				166	spin_lock_irq(q->queue_lock);
				167	if (!--q->bypass_depth) {
				168	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				169	wake = true;
				170	}
				171	WARN_ON_ONCE(q->bypass_depth < 0);
				172	spin_unlock_irq(q->queue_lock);
				173	if (wake)
				174	wake_up_all(&q->mq_freeze_wq);
				175	}
				176
				177	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				178	{
				179	return blk_mq_has_free_tags(hctx->tags);
				180	}
				181	EXPORT_SYMBOL(blk_mq_can_queue);
				182
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	183	static void blk_mq_rq_ctx_init(struct request_queue q, struct blk_mq_ctx ctx,
				184	struct request *rq, unsigned int rw_flags)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	185	{
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	186	if (blk_queue_io_stat(q))
				187	rw_flags \|= REQ_IO_STAT;
				188
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	189	rq->mq_ctx = ctx;
				190	rq->cmd_flags = rw_flags;
Ming Lei	0fec08b	2014-01-03 10:00:08 -0700	[diff] [blame]	191	rq->start_time = jiffies;
				192	set_start_time_ns(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	193	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				194	}
				195
				196	static struct request __blk_mq_alloc_request(struct blk_mq_hw_ctx hctx,
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	197	gfp_t gfp, bool reserved)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	198	{
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	199	return blk_mq_alloc_rq(hctx, gfp, reserved);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	200	}
				201
				202	static struct request blk_mq_alloc_request_pinned(struct request_queue q,
				203	int rw, gfp_t gfp,
				204	bool reserved)
				205	{
				206	struct request *rq;
				207
				208	do {
				209	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				210	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				211
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	212	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	213	if (rq) {
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	214	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	215	break;
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	216	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	217
				218	blk_mq_put_ctx(ctx);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	219	if (!(gfp & __GFP_WAIT))
				220	break;
				221
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	222	__blk_mq_run_hw_queue(hctx);
				223	blk_mq_wait_for_tags(hctx->tags);
				224	} while (1);
				225
				226	return rq;
				227	}
				228
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	229	struct request blk_mq_alloc_request(struct request_queue q, int rw, gfp_t gfp)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	230	{
				231	struct request *rq;
				232
				233	if (blk_mq_queue_enter(q))
				234	return NULL;
				235
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	236	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	237	if (rq)
				238	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	239	return rq;
				240	}
				241
				242	struct request blk_mq_alloc_reserved_request(struct request_queue q, int rw,
				243	gfp_t gfp)
				244	{
				245	struct request *rq;
				246
				247	if (blk_mq_queue_enter(q))
				248	return NULL;
				249
				250	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	251	if (rq)
				252	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	253	return rq;
				254	}
				255	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				256
				257	/*
				258	* Re-init and set pdu, if we have it
				259	*/
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	260	void blk_mq_rq_init(struct blk_mq_hw_ctx hctx, struct request rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	261	{
				262	blk_rq_init(hctx->queue, rq);
				263
				264	if (hctx->cmd_size)
				265	rq->special = blk_mq_rq_to_pdu(rq);
				266	}
				267
				268	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				269	struct blk_mq_ctx ctx, struct request rq)
				270	{
				271	const int tag = rq->tag;
				272	struct request_queue *q = rq->q;
				273
				274	blk_mq_rq_init(hctx, rq);
				275	blk_mq_put_tag(hctx->tags, tag);
				276
				277	blk_mq_queue_exit(q);
				278	}
				279
				280	void blk_mq_free_request(struct request *rq)
				281	{
				282	struct blk_mq_ctx *ctx = rq->mq_ctx;
				283	struct blk_mq_hw_ctx *hctx;
				284	struct request_queue *q = rq->q;
				285
				286	ctx->rq_completed[rq_is_sync(rq)]++;
				287
				288	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				289	__blk_mq_free_request(hctx, ctx, rq);
				290	}
				291
				292	static void blk_mq_bio_endio(struct request rq, struct bio bio, int error)
				293	{
				294	if (error)
				295	clear_bit(BIO_UPTODATE, &bio->bi_flags);
				296	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				297	error = -EIO;
				298
				299	if (unlikely(rq->cmd_flags & REQ_QUIET))
				300	set_bit(BIO_QUIET, &bio->bi_flags);
				301
				302	/* don't actually finish bio if it's part of flush sequence */
				303	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
				304	bio_endio(bio, error);
				305	}
				306
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	307	void blk_mq_end_io(struct request *rq, int error)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	308	{
				309	struct bio *bio = rq->bio;
				310	unsigned int bytes = 0;
				311
Roman Pen	af5040d	2014-03-04 23:13:10 +0900	[diff] [blame]	312	trace_block_rq_complete(rq->q, rq, blk_rq_bytes(rq));
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	313
				314	while (bio) {
				315	struct bio *next = bio->bi_next;
				316
				317	bio->bi_next = NULL;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	318	bytes += bio->bi_iter.bi_size;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	319	blk_mq_bio_endio(rq, bio, error);
				320	bio = next;
				321	}
				322
				323	blk_account_io_completion(rq, bytes);
				324
Ming Lei	0d11e6a	2013-12-05 10:50:39 -0700	[diff] [blame]	325	blk_account_io_done(rq);
				326
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	327	if (rq->end_io)
				328	rq->end_io(rq, error);
				329	else
				330	blk_mq_free_request(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	331	}
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	332	EXPORT_SYMBOL(blk_mq_end_io);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	333
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	334	static void __blk_mq_complete_request_remote(void *data)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	335	{
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	336	struct request *rq = data;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	337
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	338	rq->q->softirq_done_fn(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	339	}
				340
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	341	void __blk_mq_complete_request(struct request *rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	342	{
				343	struct blk_mq_ctx *ctx = rq->mq_ctx;
				344	int cpu;
				345
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	346	if (!ctx->ipi_redirect) {
				347	rq->q->softirq_done_fn(rq);
				348	return;
				349	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	350
				351	cpu = get_cpu();
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	352	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	353	rq->csd.func = __blk_mq_complete_request_remote;
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	354	rq->csd.info = rq;
				355	rq->csd.flags = 0;
Frederic Weisbecker	c46fff2	2014-02-24 16:40:02 +0100	[diff] [blame]	356	smp_call_function_single_async(ctx->cpu, &rq->csd);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	357	} else {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	358	rq->q->softirq_done_fn(rq);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	359	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	360	put_cpu();
				361	}
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	362
				363	/**
				364	* blk_mq_complete_request - end I/O on a request
				365	* @rq: the request being processed
				366	*
				367	* Description:
				368	* Ends all I/O on a request. It does not handle partial completions.
				369	* The actual completion happens out-of-order, through a IPI handler.
				370	**/
				371	void blk_mq_complete_request(struct request *rq)
				372	{
				373	if (unlikely(blk_should_fake_timeout(rq->q)))
				374	return;
				375	if (!blk_mark_rq_complete(rq))
				376	__blk_mq_complete_request(rq);
				377	}
				378	EXPORT_SYMBOL(blk_mq_complete_request);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	379
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	380	static void blk_mq_start_request(struct request *rq, bool last)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	381	{
				382	struct request_queue *q = rq->q;
				383
				384	trace_block_rq_issue(q, rq);
				385
				386	/*
				387	* Just mark start time and set the started bit. Due to memory
				388	* ordering, we know we'll see the correct deadline as long as
				389	* REQ_ATOMIC_STARTED is seen.
				390	*/
				391	rq->deadline = jiffies + q->rq_timeout;
				392	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	393
				394	if (q->dma_drain_size && blk_rq_bytes(rq)) {
				395	/*
				396	* Make sure space for the drain appears. We know we can do
				397	* this because max_hw_segments has been adjusted to be one
				398	* fewer than the device can handle.
				399	*/
				400	rq->nr_phys_segments++;
				401	}
				402
				403	/*
				404	* Flag the last request in the series so that drivers know when IO
				405	* should be kicked off, if they don't do it on a per-request basis.
				406	*
				407	* Note: the flag isn't the only condition drivers should do kick off.
				408	* If drive is busy, the last request might not have the bit set.
				409	*/
				410	if (last)
				411	rq->cmd_flags \|= REQ_END;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	412	}
				413
				414	static void blk_mq_requeue_request(struct request *rq)
				415	{
				416	struct request_queue *q = rq->q;
				417
				418	trace_block_rq_requeue(q, rq);
				419	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	420
				421	rq->cmd_flags &= ~REQ_END;
				422
				423	if (q->dma_drain_size && blk_rq_bytes(rq))
				424	rq->nr_phys_segments--;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	425	}
				426
				427	struct blk_mq_timeout_data {
				428	struct blk_mq_hw_ctx *hctx;
				429	unsigned long *next;
				430	unsigned int *next_set;
				431	};
				432
				433	static void blk_mq_timeout_check(void __data, unsigned long free_tags)
				434	{
				435	struct blk_mq_timeout_data *data = __data;
				436	struct blk_mq_hw_ctx *hctx = data->hctx;
				437	unsigned int tag;
				438
				439	/* It may not be in flight yet (this is where
				440	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				441	* statically allocated, so we know it's always safe to access the
				442	* memory associated with a bit offset into ->rqs[].
				443	*/
				444	tag = 0;
				445	do {
				446	struct request *rq;
				447
				448	tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
				449	if (tag >= hctx->queue_depth)
				450	break;
				451
				452	rq = hctx->rqs[tag++];
				453
				454	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				455	continue;
				456
				457	blk_rq_check_expired(rq, data->next, data->next_set);
				458	} while (1);
				459	}
				460
				461	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				462	unsigned long *next,
				463	unsigned int *next_set)
				464	{
				465	struct blk_mq_timeout_data data = {
				466	.hctx = hctx,
				467	.next = next,
				468	.next_set = next_set,
				469	};
				470
				471	/*
				472	* Ask the tagging code to iterate busy requests, so we can
				473	* check them for timeout.
				474	*/
				475	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				476	}
				477
				478	static void blk_mq_rq_timer(unsigned long data)
				479	{
				480	struct request_queue q = (struct request_queue ) data;
				481	struct blk_mq_hw_ctx *hctx;
				482	unsigned long next = 0;
				483	int i, next_set = 0;
				484
				485	queue_for_each_hw_ctx(q, hctx, i)
				486	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				487
				488	if (next_set)
				489	mod_timer(&q->timeout, round_jiffies_up(next));
				490	}
				491
				492	/*
				493	* Reverse check our software queue for entries that we could potentially
				494	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				495	* too much time checking for merges.
				496	*/
				497	static bool blk_mq_attempt_merge(struct request_queue *q,
				498	struct blk_mq_ctx ctx, struct bio bio)
				499	{
				500	struct request *rq;
				501	int checked = 8;
				502
				503	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				504	int el_ret;
				505
				506	if (!checked--)
				507	break;
				508
				509	if (!blk_rq_merge_ok(rq, bio))
				510	continue;
				511
				512	el_ret = blk_try_merge(rq, bio);
				513	if (el_ret == ELEVATOR_BACK_MERGE) {
				514	if (bio_attempt_back_merge(q, rq, bio)) {
				515	ctx->rq_merged++;
				516	return true;
				517	}
				518	break;
				519	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				520	if (bio_attempt_front_merge(q, rq, bio)) {
				521	ctx->rq_merged++;
				522	return true;
				523	}
				524	break;
				525	}
				526	}
				527
				528	return false;
				529	}
				530
				531	void blk_mq_add_timer(struct request *rq)
				532	{
				533	__blk_add_timer(rq, NULL);
				534	}
				535
				536	/*
				537	* Run this hardware queue, pulling any software queues mapped to it in.
				538	* Note that this function currently has various problems around ordering
				539	* of IO. In particular, we'd like FIFO behaviour on handling existing
				540	* items on the hctx->dispatch list. Ignore that for now.
				541	*/
				542	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				543	{
				544	struct request_queue *q = hctx->queue;
				545	struct blk_mq_ctx *ctx;
				546	struct request *rq;
				547	LIST_HEAD(rq_list);
				548	int bit, queued;
				549
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	550	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	551	return;
				552
				553	hctx->run++;
				554
				555	/*
				556	* Touch any software queue that has pending entries.
				557	*/
				558	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				559	clear_bit(bit, hctx->ctx_map);
				560	ctx = hctx->ctxs[bit];
				561	BUG_ON(bit != ctx->index_hw);
				562
				563	spin_lock(&ctx->lock);
				564	list_splice_tail_init(&ctx->rq_list, &rq_list);
				565	spin_unlock(&ctx->lock);
				566	}
				567
				568	/*
				569	* If we have previous entries on our dispatch list, grab them
				570	* and stuff them at the front for more fair dispatch.
				571	*/
				572	if (!list_empty_careful(&hctx->dispatch)) {
				573	spin_lock(&hctx->lock);
				574	if (!list_empty(&hctx->dispatch))
				575	list_splice_init(&hctx->dispatch, &rq_list);
				576	spin_unlock(&hctx->lock);
				577	}
				578
				579	/*
				580	* Delete and return all entries from our dispatch list
				581	*/
				582	queued = 0;
				583
				584	/*
				585	* Now process all the entries, sending them to the driver.
				586	*/
				587	while (!list_empty(&rq_list)) {
				588	int ret;
				589
				590	rq = list_first_entry(&rq_list, struct request, queuelist);
				591	list_del_init(&rq->queuelist);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	592
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	593	blk_mq_start_request(rq, list_empty(&rq_list));
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	594
				595	ret = q->mq_ops->queue_rq(hctx, rq);
				596	switch (ret) {
				597	case BLK_MQ_RQ_QUEUE_OK:
				598	queued++;
				599	continue;
				600	case BLK_MQ_RQ_QUEUE_BUSY:
				601	/*
				602	* FIXME: we should have a mechanism to stop the queue
				603	* like blk_stop_queue, otherwise we will waste cpu
				604	* time
				605	*/
				606	list_add(&rq->queuelist, &rq_list);
				607	blk_mq_requeue_request(rq);
				608	break;
				609	default:
				610	pr_err("blk-mq: bad return on queue: %d\n", ret);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	611	case BLK_MQ_RQ_QUEUE_ERROR:
Christoph Hellwig	1e93b8c	2014-02-11 08:27:13 -0800	[diff] [blame]	612	rq->errors = -EIO;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	613	blk_mq_end_io(rq, rq->errors);
				614	break;
				615	}
				616
				617	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				618	break;
				619	}
				620
				621	if (!queued)
				622	hctx->dispatched[0]++;
				623	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				624	hctx->dispatched[ilog2(queued) + 1]++;
				625
				626	/*
				627	* Any items that need requeuing? Stuff them into hctx->dispatch,
				628	* that is where we will continue on next queue run.
				629	*/
				630	if (!list_empty(&rq_list)) {
				631	spin_lock(&hctx->lock);
				632	list_splice(&rq_list, &hctx->dispatch);
				633	spin_unlock(&hctx->lock);
				634	}
				635	}
				636
				637	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				638	{
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	639	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	640	return;
				641
				642	if (!async)
				643	__blk_mq_run_hw_queue(hctx);
				644	else {
				645	struct request_queue *q = hctx->queue;
				646
				647	kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
				648	}
				649	}
				650
				651	void blk_mq_run_queues(struct request_queue *q, bool async)
				652	{
				653	struct blk_mq_hw_ctx *hctx;
				654	int i;
				655
				656	queue_for_each_hw_ctx(q, hctx, i) {
				657	if ((!blk_mq_hctx_has_pending(hctx) &&
				658	list_empty_careful(&hctx->dispatch)) \|\|
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	659	test_bit(BLK_MQ_S_STOPPED, &hctx->state))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	660	continue;
				661
				662	blk_mq_run_hw_queue(hctx, async);
				663	}
				664	}
				665	EXPORT_SYMBOL(blk_mq_run_queues);
				666
				667	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				668	{
				669	cancel_delayed_work(&hctx->delayed_work);
				670	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				671	}
				672	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				673
Christoph Hellwig	280d45f	2013-10-25 14:45:58 +0100	[diff] [blame]	674	void blk_mq_stop_hw_queues(struct request_queue *q)
				675	{
				676	struct blk_mq_hw_ctx *hctx;
				677	int i;
				678
				679	queue_for_each_hw_ctx(q, hctx, i)
				680	blk_mq_stop_hw_queue(hctx);
				681	}
				682	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				683
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	684	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				685	{
				686	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				687	__blk_mq_run_hw_queue(hctx);
				688	}
				689	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				690
				691	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				692	{
				693	struct blk_mq_hw_ctx *hctx;
				694	int i;
				695
				696	queue_for_each_hw_ctx(q, hctx, i) {
				697	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				698	continue;
				699
				700	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				701	blk_mq_run_hw_queue(hctx, true);
				702	}
				703	}
				704	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				705
				706	static void blk_mq_work_fn(struct work_struct *work)
				707	{
				708	struct blk_mq_hw_ctx *hctx;
				709
				710	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
				711	__blk_mq_run_hw_queue(hctx);
				712	}
				713
				714	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	715	struct request *rq, bool at_head)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	716	{
				717	struct blk_mq_ctx *ctx = rq->mq_ctx;
				718
Jens Axboe	01b983c	2013-11-19 18:59:10 -0700	[diff] [blame]	719	trace_block_rq_insert(hctx->queue, rq);
				720
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	721	if (at_head)
				722	list_add(&rq->queuelist, &ctx->rq_list);
				723	else
				724	list_add_tail(&rq->queuelist, &ctx->rq_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	725	blk_mq_hctx_mark_pending(hctx, ctx);
				726
				727	/*
				728	* We do this early, to ensure we are on the right CPU.
				729	*/
				730	blk_mq_add_timer(rq);
				731	}
				732
				733	void blk_mq_insert_request(struct request_queue q, struct request rq,
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	734	bool at_head, bool run_queue)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	735	{
				736	struct blk_mq_hw_ctx *hctx;
				737	struct blk_mq_ctx ctx, current_ctx;
				738
				739	ctx = rq->mq_ctx;
				740	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				741
				742	if (rq->cmd_flags & (REQ_FLUSH \| REQ_FUA)) {
				743	blk_insert_flush(rq);
				744	} else {
				745	current_ctx = blk_mq_get_ctx(q);
				746
				747	if (!cpu_online(ctx->cpu)) {
				748	ctx = current_ctx;
				749	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				750	rq->mq_ctx = ctx;
				751	}
				752	spin_lock(&ctx->lock);
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	753	__blk_mq_insert_request(hctx, rq, at_head);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	754	spin_unlock(&ctx->lock);
				755
				756	blk_mq_put_ctx(current_ctx);
				757	}
				758
				759	if (run_queue)
				760	__blk_mq_run_hw_queue(hctx);
				761	}
				762	EXPORT_SYMBOL(blk_mq_insert_request);
				763
				764	/*
				765	* This is a special version of blk_mq_insert_request to bypass FLUSH request
				766	* check. Should only be used internally.
				767	*/
				768	void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
				769	{
				770	struct request_queue *q = rq->q;
				771	struct blk_mq_hw_ctx *hctx;
				772	struct blk_mq_ctx ctx, current_ctx;
				773
				774	current_ctx = blk_mq_get_ctx(q);
				775
				776	ctx = rq->mq_ctx;
				777	if (!cpu_online(ctx->cpu)) {
				778	ctx = current_ctx;
				779	rq->mq_ctx = ctx;
				780	}
				781	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				782
				783	/* ctx->cpu might be offline */
				784	spin_lock(&ctx->lock);
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	785	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	786	spin_unlock(&ctx->lock);
				787
				788	blk_mq_put_ctx(current_ctx);
				789
				790	if (run_queue)
				791	blk_mq_run_hw_queue(hctx, async);
				792	}
				793
				794	static void blk_mq_insert_requests(struct request_queue *q,
				795	struct blk_mq_ctx *ctx,
				796	struct list_head *list,
				797	int depth,
				798	bool from_schedule)
				799
				800	{
				801	struct blk_mq_hw_ctx *hctx;
				802	struct blk_mq_ctx *current_ctx;
				803
				804	trace_block_unplug(q, depth, !from_schedule);
				805
				806	current_ctx = blk_mq_get_ctx(q);
				807
				808	if (!cpu_online(ctx->cpu))
				809	ctx = current_ctx;
				810	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				811
				812	/*
				813	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				814	* offline now
				815	*/
				816	spin_lock(&ctx->lock);
				817	while (!list_empty(list)) {
				818	struct request *rq;
				819
				820	rq = list_first_entry(list, struct request, queuelist);
				821	list_del_init(&rq->queuelist);
				822	rq->mq_ctx = ctx;
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	823	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	824	}
				825	spin_unlock(&ctx->lock);
				826
				827	blk_mq_put_ctx(current_ctx);
				828
				829	blk_mq_run_hw_queue(hctx, from_schedule);
				830	}
				831
				832	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				833	{
				834	struct request *rqa = container_of(a, struct request, queuelist);
				835	struct request *rqb = container_of(b, struct request, queuelist);
				836
				837	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				838	(rqa->mq_ctx == rqb->mq_ctx &&
				839	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				840	}
				841
				/*
				 * Hand every request parked on plug->mq_list to its software queue.
				 *
				 * The plug list is spliced onto a private list and sorted with
				 * plug_ctx_cmp(), which places requests that share an mq_ctx next to each
				 * other.  Each run of same-ctx requests is gathered on ctx_list and passed
				 * to blk_mq_insert_requests() together with its length ('depth').
				 * 'from_schedule' is forwarded unchanged to blk_mq_insert_requests().
				 */
				842	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				843	{
				844	struct blk_mq_ctx *this_ctx;
				845	struct request_queue *this_q;
				846	struct request *rq;
				847	LIST_HEAD(list);
				848	LIST_HEAD(ctx_list);
				849	unsigned int depth;
				850
				851	list_splice_init(&plug->mq_list, &list);
				852
				853	list_sort(NULL, &list, plug_ctx_cmp);
				854
				855	this_q = NULL;
				856	this_ctx = NULL;
				857	depth = 0;
				858
				859	while (!list_empty(&list)) {
				860	rq = list_entry_rq(list.next);
				861	list_del_init(&rq->queuelist);
				862	BUG_ON(!rq->q);
				/*
				 * The ctx changed: submit the batch collected so far (if any)
				 * and start a new batch keyed on this request's ctx and queue.
				 */
				863	if (rq->mq_ctx != this_ctx) {
				864	if (this_ctx) {
				865	blk_mq_insert_requests(this_q, this_ctx,
				866	&ctx_list, depth,
				867	from_schedule);
				868	}
				869
				870	this_ctx = rq->mq_ctx;
				871	this_q = rq->q;
				872	depth = 0;
				873	}
				874
				875	depth++;
				876	list_add_tail(&rq->queuelist, &ctx_list);
				877	}
				878
				879	/*
				880	* If 'this_ctx' is set, we know we have entries to complete
				881	* on 'ctx_list'. Do those.
				882	*/
				883	if (this_ctx) {
				884	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				885	from_schedule);
				886	}
				887	}
				888
				/*
				 * Prepare 'rq' to carry 'bio': fill the request in from the bio with
				 * init_request_from_bio() and then call blk_account_io_start(rq, 1).
				 * NOTE(review): the meaning of the literal 1 is defined by
				 * blk_account_io_start(), which is not visible here -- confirm.
				 */
				889	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				890	{
				891	init_request_from_bio(rq, bio);
				892	blk_account_io_start(rq, 1);
				893	}
				894
				/*
				 * Bio submission entry point for a blk-mq queue.
				 *
				 * In order, this function:
				 *  - bounces the bio and, if integrity is enabled, runs
				 *    bio_integrity_prep(); a failure there ends the bio with -EIO;
				 *  - when plugging applies, tries blk_attempt_plug_merge() and returns
				 *    if the bio was merged;
				 *  - calls blk_mq_queue_enter(); a non-zero result ends the bio with -EIO;
				 *  - allocates a request on the current CPU's ctx/hctx;
				 *  - routes the request: flush/FUA goes to blk_insert_flush(), plugged IO
				 *    is appended to current->plug->mq_list, everything else is merged
				 *    into or inserted on the software queue under ctx->lock;
				 *  - runs the hardware queue, asynchronously unless the bio is a sync
				 *    non-flush request.
				 */
				895	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				896	{
				897	struct blk_mq_hw_ctx *hctx;
				898	struct blk_mq_ctx *ctx;
				899	const int is_sync = rw_is_sync(bio->bi_rw);
				900	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				901	int rw = bio_data_dir(bio);
				902	struct request *rq;
				903	unsigned int use_plug, request_count = 0;
				904
				905	/*
				906	* If we have multiple hardware queues, just go directly to
				907	* one of those for sync IO.
				908	*/
				909	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				910
				911	blk_queue_bounce(q, &bio);
				912
Nicholas Bellinger	14ec77f	2014-02-07 13:45:39 -0700	[diff] [blame]	913	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
				914	bio_endio(bio, -EIO);
				915	return;
				916	}
				917
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	918	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				919	return;
				920
				921	if (blk_mq_queue_enter(q)) {
				922	bio_endio(bio, -EIO);
				923	return;
				924	}
				925
				926	ctx = blk_mq_get_ctx(q);
				927	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				928
				929	trace_block_getrq(q, bio, rw);
				/*
				 * Try a GFP_ATOMIC allocation first.  If that yields nothing, drop
				 * the ctx and retry with __GFP_WAIT set; ctx and hctx are then
				 * re-derived from the request that comes back.
				 */
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	930	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	931	if (likely(rq))
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	932	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	933	else {
				934	blk_mq_put_ctx(ctx);
				935	trace_block_sleeprq(q, bio, rw);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	936	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				937	false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	938	ctx = rq->mq_ctx;
				939	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				940	}
				941
				942	hctx->queued++;
				943
				/* Flush/FUA requests skip plugging and the software queue. */
				944	if (unlikely(is_flush_fua)) {
				945	blk_mq_bio_to_request(rq, bio);
				946	blk_mq_put_ctx(ctx);
				947	blk_insert_flush(rq);
				948	goto run_queue;
				949	}
				950
				951	/*
				952	* A task plug currently exists. Since this is completely lockless,
				953	* utilize that to temporarily store requests until the task is
				954	* either done or scheduled away.
				955	*/
				956	if (use_plug) {
				957	struct blk_plug *plug = current->plug;
				958
				959	if (plug) {
				960	blk_mq_bio_to_request(rq, bio);
Shaohua Li	92f399c	2013-10-29 12:01:03 -0600	[diff] [blame]	961	if (list_empty(&plug->mq_list))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	962	trace_block_plug(q);
				963	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				964	blk_flush_plug_list(plug, false);
				965	trace_block_plug(q);
				966	}
				967	list_add_tail(&rq->queuelist, &plug->mq_list);
				968	blk_mq_put_ctx(ctx);
				969	return;
				970	}
				971	}
				972
				973	spin_lock(&ctx->lock);
				974
				/*
				 * If blk_mq_attempt_merge() reports success the request allocated
				 * above is released again; otherwise it is filled in from the bio
				 * and inserted.
				 */
				975	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				976	blk_mq_attempt_merge(q, ctx, bio))
				977	__blk_mq_free_request(hctx, ctx, rq);
				978	else {
				979	blk_mq_bio_to_request(rq, bio);
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	980	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	981	}
				982
				983	spin_unlock(&ctx->lock);
				984	blk_mq_put_ctx(ctx);
				985
				986	/*
				987	* For a SYNC request, send it to the hardware immediately. For an
				988	* ASYNC request, just ensure that we run it later on. The latter
				989	* allows for merging opportunities and more efficient dispatching.
				990	*/
				991	run_queue:
				992	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
				993	}
				994
				995	/*
				996	* Default mapping to a software queue, since we use one per CPU.
				997	*/
				/*
				 * Return the hardware context serving 'cpu': q->mq_map[cpu] is used as
				 * the index into q->queue_hw_ctx.  No range check is done on 'cpu'.
				 */
				998	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				999	{
				1000	return q->queue_hw_ctx[q->mq_map[cpu]];
				1001	}
				1002	EXPORT_SYMBOL(blk_mq_map_queue);
				1003
				/*
				 * Allocate one zero-filled struct blk_mq_hw_ctx with kmalloc_node() on
				 * reg->numa_node.  'hctx_index' is not used.  The kmalloc_node() result
				 * is returned as-is, so callers must check it.
				 */
				1004	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_reg reg,
				1005	unsigned int hctx_index)
				1006	{
				1007	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				1008	GFP_KERNEL \| __GFP_ZERO, reg->numa_node);
				1009	}
				1010	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				1011
				/*
				 * Release a hardware context with kfree(); the counterpart of
				 * blk_mq_alloc_single_hw_queue().  'hctx_index' is not used.
				 */
				1012	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				1013	unsigned int hctx_index)
				1014	{
				1015	kfree(hctx);
				1016	}
				1017	EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				1018
				1019	static void blk_mq_hctx_notify(void *data, unsigned long action,
				1020	unsigned int cpu)
				1021	{
				1022	struct blk_mq_hw_ctx *hctx = data;
				1023	struct blk_mq_ctx *ctx;
				1024	LIST_HEAD(tmp);
				1025
				1026	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				1027	return;
				1028
				1029	/*
				1030	* Move ctx entries to new CPU, if this one is going away.
				1031	*/
				1032	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
				1033
				1034	spin_lock(&ctx->lock);
				1035	if (!list_empty(&ctx->rq_list)) {
				1036	list_splice_init(&ctx->rq_list, &tmp);
				1037	clear_bit(ctx->index_hw, hctx->ctx_map);
				1038	}
				1039	spin_unlock(&ctx->lock);
				1040
				1041	if (list_empty(&tmp))
				1042	return;
				1043
				1044	ctx = blk_mq_get_ctx(hctx->queue);
				1045	spin_lock(&ctx->lock);
				1046
				1047	while (!list_empty(&tmp)) {
				1048	struct request *rq;
				1049
				1050	rq = list_first_entry(&tmp, struct request, queuelist);
				1051	rq->mq_ctx = ctx;
				1052	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1053	}
				1054
				1055	blk_mq_hctx_mark_pending(hctx, ctx);
				1056
				1057	spin_unlock(&ctx->lock);
				1058	blk_mq_put_ctx(ctx);
				1059	}
				1060
/*
 * Call 'init' once for every request slot of one hardware context, in index
 * order, passing (data, hctx, request, index).  Iteration stops at the first
 * non-zero return value, which is handed back to the caller; 0 means every
 * call returned 0.  Requests initialised before a failure are not undone here.
 */
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1061	static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				1062	int (init)(void , struct blk_mq_hw_ctx *,
				1063	struct request *, unsigned int),
				1064	void *data)
				1065	{
				1066	unsigned int i;
				1067	int ret = 0;
				1068
				1069	for (i = 0; i < hctx->queue_depth; i++) {
				1070	struct request *rq = hctx->rqs[i];
				1071
				1072	ret = init(data, hctx, rq, i);
				1073	if (ret)
				1074	break;
				1075	}
				1076
				1077	return ret;
				1078	}
				1079
				/*
				 * Run 'init' over every request of every hardware context of 'q' by
				 * calling blk_mq_init_hw_commands() per hctx.  Stops at, and returns, the
				 * first non-zero result; returns 0 if all calls returned 0.
				 */
				1080	int blk_mq_init_commands(struct request_queue *q,
				1081	int (init)(void , struct blk_mq_hw_ctx *,
				1082	struct request *, unsigned int),
				1083	void *data)
				1084	{
				1085	struct blk_mq_hw_ctx *hctx;
				1086	unsigned int i;
				1087	int ret = 0;
				1088
				1089	queue_for_each_hw_ctx(q, hctx, i) {
				1090	ret = blk_mq_init_hw_commands(hctx, init, data);
				1091	if (ret)
				1092	break;
				1093	}
				1094
				1095	return ret;
				1096	}
				1097	EXPORT_SYMBOL(blk_mq_init_commands);
				1098
				/*
				 * Call 'free' once for every request slot of one hardware context, in
				 * index order, passing (data, hctx, request, index).  All queue_depth
				 * slots are visited; there is no early exit.
				 */
				1099	static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
				1100	void (free)(void , struct blk_mq_hw_ctx *,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1101	struct request *, unsigned int),
				1102	void *data)
				1103	{
				1104	unsigned int i;
				1105
				1106	for (i = 0; i < hctx->queue_depth; i++) {
				1107	struct request *rq = hctx->rqs[i];
				1108
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1109	free(data, hctx, rq, i);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1110	}
				1111	}
				1112
/*
 * Run 'free' over every request of every hardware context of 'q' by calling
 * blk_mq_free_hw_commands() per hctx.
 */
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1113	void blk_mq_free_commands(struct request_queue *q,
				1114	void (free)(void , struct blk_mq_hw_ctx *,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1115	struct request *, unsigned int),
				1116	void *data)
				1117	{
				1118	struct blk_mq_hw_ctx *hctx;
				1119	unsigned int i;
				1120
				1121	queue_for_each_hw_ctx(q, hctx, i)
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1122	blk_mq_free_hw_commands(hctx, free, data);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1123	}
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1124	EXPORT_SYMBOL(blk_mq_free_commands);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1125
				/*
				 * Release everything blk_mq_init_rq_map() set up for 'hctx': each page
				 * block on hctx->page_list (its allocation order is read back from
				 * page->private), the hctx->rqs pointer array, and the tag map if one
				 * is set.  The freed pointers are not cleared.
				 */
				1126	static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
				1127	{
				1128	struct page *page;
				1129
				1130	while (!list_empty(&hctx->page_list)) {
Dave Hansen	6753471	2014-01-08 20:17:46 -0700	[diff] [blame]	1131	page = list_first_entry(&hctx->page_list, struct page, lru);
				1132	list_del_init(&page->lru);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1133	__free_pages(page, page->private);
				1134	}
				1135
				1136	kfree(hctx->rqs);
				1137
				1138	if (hctx->tags)
				1139	blk_mq_free_tags(hctx->tags);
				1140	}
				1141
				1142	static size_t order_to_size(unsigned int order)
				1143	{
				1144	size_t ret = PAGE_SIZE;
				1145
				1146	while (order--)
				1147	ret *= 2;
				1148
				1149	return ret;
				1150	}
				1151
				1152	static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
				1153	unsigned int reserved_tags, int node)
				1154	{
				1155	unsigned int i, j, entries_per_page, max_order = 4;
				1156	size_t rq_size, left;
				1157
				1158	INIT_LIST_HEAD(&hctx->page_list);
				1159
				1160	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				1161	GFP_KERNEL, node);
				1162	if (!hctx->rqs)
				1163	return -ENOMEM;
				1164
				1165	/*
				1166	* rq_size is the size of the request plus driver payload, rounded
				1167	* to the cacheline size
				1168	*/
				1169	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				1170	cache_line_size());
				1171	left = rq_size * hctx->queue_depth;
				1172
				1173	for (i = 0; i < hctx->queue_depth;) {
				1174	int this_order = max_order;
				1175	struct page *page;
				1176	int to_do;
				1177	void *p;
				1178
				1179	while (left < order_to_size(this_order - 1) && this_order)
				1180	this_order--;
				1181
				1182	do {
				1183	page = alloc_pages_node(node, GFP_KERNEL, this_order);
				1184	if (page)
				1185	break;
				1186	if (!this_order--)
				1187	break;
				1188	if (order_to_size(this_order) < rq_size)
				1189	break;
				1190	} while (1);
				1191
				1192	if (!page)
				1193	break;
				1194
				1195	page->private = this_order;
Dave Hansen	6753471	2014-01-08 20:17:46 -0700	[diff] [blame]	1196	list_add_tail(&page->lru, &hctx->page_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1197
				1198	p = page_address(page);
				1199	entries_per_page = order_to_size(this_order) / rq_size;
				1200	to_do = min(entries_per_page, hctx->queue_depth - i);
				1201	left -= to_do * rq_size;
				1202	for (j = 0; j < to_do; j++) {
				1203	hctx->rqs[i] = p;
				1204	blk_mq_rq_init(hctx, hctx->rqs[i]);
				1205	p += rq_size;
				1206	i++;
				1207	}
				1208	}
				1209
				1210	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
				1211	goto err_rq_map;
				1212	else if (i != hctx->queue_depth) {
				1213	hctx->queue_depth = i;
				1214	pr_warn("%s: queue depth set to %u because of low memory\n",
				1215	__func__, i);
				1216	}
				1217
				1218	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
				1219	if (!hctx->tags) {
				1220	err_rq_map:
				1221	blk_mq_free_rq_map(hctx);
				1222	return -ENOMEM;
				1223	}
				1224
				1225	return 0;
				1226	}
				1227
				1228	static int blk_mq_init_hw_queues(struct request_queue *q,
				1229	struct blk_mq_reg reg, void driver_data)
				1230	{
				1231	struct blk_mq_hw_ctx *hctx;
				1232	unsigned int i, j;
				1233
				1234	/*
				1235	* Initialize hardware queues
				1236	*/
				1237	queue_for_each_hw_ctx(q, hctx, i) {
				1238	unsigned int num_maps;
				1239	int node;
				1240
				1241	node = hctx->numa_node;
				1242	if (node == NUMA_NO_NODE)
				1243	node = hctx->numa_node = reg->numa_node;
				1244
				1245	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1246	spin_lock_init(&hctx->lock);
				1247	INIT_LIST_HEAD(&hctx->dispatch);
				1248	hctx->queue = q;
				1249	hctx->queue_num = i;
				1250	hctx->flags = reg->flags;
				1251	hctx->queue_depth = reg->queue_depth;
				1252	hctx->cmd_size = reg->cmd_size;
				1253
				1254	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1255	blk_mq_hctx_notify, hctx);
				1256	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1257
				1258	if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
				1259	break;
				1260
				1261	/*
				1262	* Allocate space for all possible cpus to avoid allocation in
				1263	* runtime
				1264	*/
				1265	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1266	GFP_KERNEL, node);
				1267	if (!hctx->ctxs)
				1268	break;
				1269
				1270	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1271	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1272	GFP_KERNEL, node);
				1273	if (!hctx->ctx_map)
				1274	break;
				1275
				1276	hctx->nr_ctx_map = num_maps;
				1277	hctx->nr_ctx = 0;
				1278
				1279	if (reg->ops->init_hctx &&
				1280	reg->ops->init_hctx(hctx, driver_data, i))
				1281	break;
				1282	}
				1283
				1284	if (i == q->nr_hw_queues)
				1285	return 0;
				1286
				1287	/*
				1288	* Init failed
				1289	*/
				1290	queue_for_each_hw_ctx(q, hctx, j) {
				1291	if (i == j)
				1292	break;
				1293
				1294	if (reg->ops->exit_hctx)
				1295	reg->ops->exit_hctx(hctx, j);
				1296
				1297	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1298	blk_mq_free_rq_map(hctx);
				1299	kfree(hctx->ctxs);
				1300	}
				1301
				1302	return 1;
				1303	}
				1304
				/*
				 * Initialise the per-CPU software context of every possible CPU.
				 *
				 * Each ctx is zeroed and given its cpu number, lock, empty rq_list and
				 * back-pointer to 'q'.  The hardware context the CPU maps to has its
				 * nr_ctx incremented.  For online CPUs only, and only when there is more
				 * than one hardware queue, an hctx whose numa_node is still NUMA_NO_NODE
				 * takes the node of that CPU.
				 */
				1305	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1306	unsigned int nr_hw_queues)
				1307	{
				1308	unsigned int i;
				1309
				1310	for_each_possible_cpu(i) {
				1311	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1312	struct blk_mq_hw_ctx *hctx;
				1313
				1314	memset(__ctx, 0, sizeof(*__ctx));
				1315	__ctx->cpu = i;
				1316	spin_lock_init(&__ctx->lock);
				1317	INIT_LIST_HEAD(&__ctx->rq_list);
				1318	__ctx->queue = q;
				1319
				1320	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1321	hctx = q->mq_ops->map_queue(q, i);
				1322	hctx->nr_ctx++;
				1323
				1324	if (!cpu_online(i))
				1325	continue;
				1326
				1327	/*
				1328	* Set local node, IFF we have more than one hw queue. If
				1329	* not, we remain on the home node of the device
				1330	*/
				1331	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1332	hctx->numa_node = cpu_to_node(i);
				1333	}
				1334	}
				1335
				/*
				 * (Re)build the software-to-hardware context mapping of 'q'.
				 *
				 * nr_ctx is reset to 0 on every hardware context.  Then each software
				 * context is appended to the ctxs[] array of the hctx that map_queue()
				 * returns for it, and the slot it received is recorded in ctx->index_hw.
				 */
				1336	static void blk_mq_map_swqueue(struct request_queue *q)
				1337	{
				1338	unsigned int i;
				1339	struct blk_mq_hw_ctx *hctx;
				1340	struct blk_mq_ctx *ctx;
				1341
				1342	queue_for_each_hw_ctx(q, hctx, i) {
				1343	hctx->nr_ctx = 0;
				1344	}
				1345
				1346	/*
				1347	* Map software to hardware queues
				1348	*/
				1349	queue_for_each_ctx(q, ctx, i) {
				1350	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1351	hctx = q->mq_ops->map_queue(q, i);
				1352	ctx->index_hw = hctx->nr_ctx;
				1353	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1354	}
				1355	}
				1356
				/*
				 * Create and set up a blk-mq request queue from a driver registration.
				 *
				 * 'reg' must provide nr_hw_queues and the queue_rq, map_queue, alloc_hctx
				 * and free_hctx operations, otherwise ERR_PTR(-EINVAL) is returned.  A
				 * queue_depth of 0 is replaced by BLK_MQ_MAX_DEPTH and larger values are
				 * clamped to it (note that reg->queue_depth is modified in place).  A depth
				 * below reserved_tags + BLK_MQ_TAG_MIN is rejected with ERR_PTR(-EINVAL).
				 *
				 * On success the new queue has been added to all_q_list and is returned.
				 * Every failure after the argument checks unwinds through the err_*
				 * labels and returns ERR_PTR(-ENOMEM), whatever the underlying cause.
				 *
				 * 'driver_data' is passed through to blk_mq_init_hw_queues().
				 */
				1357	struct request_queue blk_mq_init_queue(struct blk_mq_reg reg,
				1358	void *driver_data)
				1359	{
				1360	struct blk_mq_hw_ctx **hctxs;
				1361	struct blk_mq_ctx *ctx;
				1362	struct request_queue *q;
				1363	int i;
				1364
				1365	if (!reg->nr_hw_queues \|\|
				1366	!reg->ops->queue_rq \|\| !reg->ops->map_queue \|\|
				1367	!reg->ops->alloc_hctx \|\| !reg->ops->free_hctx)
				1368	return ERR_PTR(-EINVAL);
				1369
				1370	if (!reg->queue_depth)
				1371	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1372	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
				1373	pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
				1374	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1375	}
				1376
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1377	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
				1378	return ERR_PTR(-EINVAL);
				1379
				1380	ctx = alloc_percpu(struct blk_mq_ctx);
				1381	if (!ctx)
				1382	return ERR_PTR(-ENOMEM);
				1383
				/*
				 * The array is not zeroed.  The err_hctxs unwind below stops at
				 * the first NULL slot, which works because a failed alloc_hctx()
				 * stores NULL in the slot it was filling before jumping there.
				 */
				1384	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1385	reg->numa_node);
				1386
				1387	if (!hctxs)
				1388	goto err_percpu;
				1389
				1390	for (i = 0; i < reg->nr_hw_queues; i++) {
				1391	hctxs[i] = reg->ops->alloc_hctx(reg, i);
				1392	if (!hctxs[i])
				1393	goto err_hctxs;
				1394
				1395	hctxs[i]->numa_node = NUMA_NO_NODE;
				1396	hctxs[i]->queue_num = i;
				1397	}
				1398
				1399	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
				1400	if (!q)
				1401	goto err_hctxs;
				1402
				1403	q->mq_map = blk_mq_make_queue_map(reg);
				1404	if (!q->mq_map)
				1405	goto err_map;
				1406
				/* Default request timeout; replaced below when reg->timeout is set. */
				1407	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1408	blk_queue_rq_timeout(q, 30000);
				1409
				1410	q->nr_queues = nr_cpu_ids;
				1411	q->nr_hw_queues = reg->nr_hw_queues;
				1412
				1413	q->queue_ctx = ctx;
				1414	q->queue_hw_ctx = hctxs;
				1415
				1416	q->mq_ops = reg->ops;
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	1417	q->queue_flags \|= QUEUE_FLAG_MQ_DEFAULT;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1418
Christoph Hellwig	1be036e	2014-02-07 10:22:39 -0800	[diff] [blame]	1419	q->sg_reserved_size = INT_MAX;
				1420
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1421	blk_queue_make_request(q, blk_mq_make_request);
				1422	blk_queue_rq_timed_out(q, reg->ops->timeout);
				1423	if (reg->timeout)
				1424	blk_queue_rq_timeout(q, reg->timeout);
				1425
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	1426	if (reg->ops->complete)
				1427	blk_queue_softirq_done(q, reg->ops->complete);
				1428
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1429	blk_mq_init_flush(q);
				1430	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
				1431
				/* The flush request is sized like a regular request plus payload. */
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1432	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
				1433	cache_line_size()), GFP_KERNEL);
				1434	if (!q->flush_rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1435	goto err_hw;
				1436
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1437	if (blk_mq_init_hw_queues(q, reg, driver_data))
				1438	goto err_flush_rq;
				1439
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1440	blk_mq_map_swqueue(q);
				1441
				1442	mutex_lock(&all_q_mutex);
				1443	list_add_tail(&q->all_q_node, &all_q_list);
				1444	mutex_unlock(&all_q_mutex);
				1445
				1446	return q;
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1447
				/*
				 * Error unwind: each label releases what was set up before the
				 * corresponding failure point and falls through to the next.
				 * NOTE(review): q->queue_ctx, q->queue_hw_ctx and q->mq_ops are
				 * still set when blk_cleanup_queue() runs -- confirm that it does
				 * not release the same objects a second time.
				 */
				1448	err_flush_rq:
				1449	kfree(q->flush_rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1450	err_hw:
				1451	kfree(q->mq_map);
				1452	err_map:
				1453	blk_cleanup_queue(q);
				1454	err_hctxs:
				1455	for (i = 0; i < reg->nr_hw_queues; i++) {
				1456	if (!hctxs[i])
				1457	break;
				1458	reg->ops->free_hctx(hctxs[i], i);
				1459	}
				1460	kfree(hctxs);
				1461	err_percpu:
				1462	free_percpu(ctx);
				1463	return ERR_PTR(-ENOMEM);
				1464	}
				1465	EXPORT_SYMBOL(blk_mq_init_queue);
				1466
				/*
				 * Release the blk-mq state attached to 'q'.
				 *
				 * For every hardware context: free the pending bitmap, the ctx array and
				 * the request map, unregister the CPU notifier, call the driver's optional
				 * exit_hctx() hook and finally free_hctx().  Then free the per-CPU
				 * software contexts, the hctx pointer array and the CPU map, clear the
				 * corresponding pointers in 'q', and unlink 'q' from all_q_list under
				 * all_q_mutex.
				 *
				 * NOTE(review): the queue stays on all_q_list until the very end, after
				 * its maps have been freed and set to NULL, and the CPU notifier is
				 * unregistered only after ctx_map has been freed -- confirm no hotplug
				 * callback can run against the queue during this window.
				 */
				1467	void blk_mq_free_queue(struct request_queue *q)
				1468	{
				1469	struct blk_mq_hw_ctx *hctx;
				1470	int i;
				1471
				1472	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1473	kfree(hctx->ctx_map);
				1474	kfree(hctx->ctxs);
				1475	blk_mq_free_rq_map(hctx);
				1476	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1477	if (q->mq_ops->exit_hctx)
				1478	q->mq_ops->exit_hctx(hctx, i);
				1479	q->mq_ops->free_hctx(hctx, i);
				1480	}
				1481
				1482	free_percpu(q->queue_ctx);
				1483	kfree(q->queue_hw_ctx);
				1484	kfree(q->mq_map);
				1485
				1486	q->queue_ctx = NULL;
				1487	q->queue_hw_ctx = NULL;
				1488	q->mq_map = NULL;
				1489
				1490	mutex_lock(&all_q_mutex);
				1491	list_del_init(&q->all_q_node);
				1492	mutex_unlock(&all_q_mutex);
				1493	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1494
				1495	/* Basically redo blk_mq_init_queue with queue frozen */
/*
 * Recompute the CPU-to-hardware-queue map of 'q' and rebuild the software to
 * hardware context mapping from it.  Both steps run between
 * blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
 */
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1496	static void blk_mq_queue_reinit(struct request_queue *q)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1497	{
				1498	blk_mq_freeze_queue(q);
				1499
				1500	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1501
				1502	/*
				1503	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1504	* we should change hctx numa_node according to new topology (this
				1505	* involves free and re-allocate memory, worthy doing?)
				1506	*/
				1507
				1508	blk_mq_map_swqueue(q);
				1509
				1510	blk_mq_unfreeze_queue(q);
				1511	}
				1512
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1513	static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1514	unsigned long action, void *hcpu)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1515	{
				1516	struct request_queue *q;
				1517
				1518	/*
				1519	* Before new mapping is established, hotadded cpu might already start
				1520	* handling requests. This doesn't break anything as we map offline
				1521	* CPUs to first hardware queue. We will re-init queue below to get
				1522	* optimal settings.
				1523	*/
				1524	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1525	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1526	return NOTIFY_OK;
				1527
				1528	mutex_lock(&all_q_mutex);
				1529	list_for_each_entry(q, &all_q_list, all_q_node)
				1530	blk_mq_queue_reinit(q);
				1531	mutex_unlock(&all_q_mutex);
				1532	return NOTIFY_OK;
				1533	}
				1534
/*
 * Take all_q_mutex and return with it held.  blk_mq_queue_reinit_notify()
 * takes the same mutex, so it cannot re-initialise any queue until
 * blk_mq_enable_hotplug() is called.  The caller must pair the two calls.
 */
Jens Axboe	676141e	2014-03-20 13:29:18 -0600	[diff] [blame^]	1535	void blk_mq_disable_hotplug(void)
				1536	{
				1537	mutex_lock(&all_q_mutex);
				1538	}
				1539
				/*
				 * Release all_q_mutex, which blk_mq_disable_hotplug() left held.
				 */
				1540	void blk_mq_enable_hotplug(void)
				1541	{
				1542	mutex_unlock(&all_q_mutex);
				1543	}
				1544
/*
 * Subsystem initcall: runs blk_mq_cpu_init() and registers
 * blk_mq_queue_reinit_notify() as a CPU hotplug notifier with priority -10.
 * Always returns 0.
 */
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1545	static int __init blk_mq_init(void)
				1546	{
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1547	blk_mq_cpu_init();
				1548
				1549	/* Must be called after percpu_counter_hotcpu_callback() */
				1550	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1551
				1552	return 0;
				1553	}
				1554	subsys_initcall(blk_mq_init);