Blame - block/blk-mq.c - SHIFTPHONES/mainline/linux

blob: 86d66e0e900c37cf0c7f270d27db5986596b932f [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);
				26	static LIST_HEAD(all_q_list);
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
				29
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	30	static struct blk_mq_ctx __blk_mq_get_ctx(struct request_queue q,
				31	unsigned int cpu)
				32	{
				33	return per_cpu_ptr(q->queue_ctx, cpu);
				34	}
				35
				36	/*
				37	* This assumes per-cpu software queueing queues. They could be per-node
				38	* as well, for instance. For now this is hardcoded as-is. Note that we don't
				39	* care about preemption, since we know the ctx's are persistent. This does
				40	* mean that we can't rely on ctx always matching the currently running CPU.
				41	*/
				42	static struct blk_mq_ctx blk_mq_get_ctx(struct request_queue q)
				43	{
				44	return __blk_mq_get_ctx(q, get_cpu());
				45	}
				46
				47	static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				48	{
				49	put_cpu();
				50	}
				51
				52	/*
				53	* Check if any of the ctx's have pending work in this hardware queue
				54	*/
				55	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				56	{
				57	unsigned int i;
				58
				59	for (i = 0; i < hctx->nr_ctx_map; i++)
				60	if (hctx->ctx_map[i])
				61	return true;
				62
				63	return false;
				64	}
				65
				66	/*
				67	* Mark this ctx as having pending work in this hardware queue
				68	*/
				69	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				70	struct blk_mq_ctx *ctx)
				71	{
				72	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				73	set_bit(ctx->index_hw, hctx->ctx_map);
				74	}
				75
Christoph Hellwig	081241e	2014-02-20 15:32:36 -0800	[diff] [blame]	76	static struct request __blk_mq_alloc_request(struct blk_mq_hw_ctx hctx,
				77	gfp_t gfp, bool reserved)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	78	{
				79	struct request *rq;
				80	unsigned int tag;
				81
				82	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				83	if (tag != BLK_MQ_TAG_FAIL) {
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	84	rq = hctx->tags->rqs[tag];
Christoph Hellwig	ed44832	2014-04-14 10:30:10 +0200	[diff] [blame]	85	blk_rq_init(hctx->queue, rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	86	rq->tag = tag;
				87
				88	return rq;
				89	}
				90
				91	return NULL;
				92	}
				93
				94	static int blk_mq_queue_enter(struct request_queue *q)
				95	{
				96	int ret;
				97
				98	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				99	smp_wmb();
				100	/* we have problems to freeze the queue if it's initializing */
				101	if (!blk_queue_bypass(q) \|\| !blk_queue_init_done(q))
				102	return 0;
				103
				104	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				105
				106	spin_lock_irq(q->queue_lock);
				107	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	108	!blk_queue_bypass(q) \|\| blk_queue_dying(q),
				109	*q->queue_lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	110	/* inc usage with lock hold to avoid freeze_queue runs here */
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	111	if (!ret && !blk_queue_dying(q))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	112	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	113	else if (blk_queue_dying(q))
				114	ret = -ENODEV;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	115	spin_unlock_irq(q->queue_lock);
				116
				117	return ret;
				118	}
				119
				120	static void blk_mq_queue_exit(struct request_queue *q)
				121	{
				122	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				123	}
				124
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	125	static void __blk_mq_drain_queue(struct request_queue *q)
				126	{
				127	while (true) {
				128	s64 count;
				129
				130	spin_lock_irq(q->queue_lock);
				131	count = percpu_counter_sum(&q->mq_usage_counter);
				132	spin_unlock_irq(q->queue_lock);
				133
				134	if (count == 0)
				135	break;
				136	blk_mq_run_queues(q, false);
				137	msleep(10);
				138	}
				139	}
				140
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	141	/*
				142	* Guarantee no request is in use, so we can change any data structure of
				143	* the queue afterward.
				144	*/
				145	static void blk_mq_freeze_queue(struct request_queue *q)
				146	{
				147	bool drain;
				148
				149	spin_lock_irq(q->queue_lock);
				150	drain = !q->bypass_depth++;
				151	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				152	spin_unlock_irq(q->queue_lock);
				153
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	154	if (drain)
				155	__blk_mq_drain_queue(q);
				156	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	157
/*
 * Non-static entry point that waits for the usage counter of @q to drop
 * to zero; a plain wrapper around __blk_mq_drain_queue().
 */
void blk_mq_drain_queue(struct request_queue *q)
{
	__blk_mq_drain_queue(q);
}
				162
				163	static void blk_mq_unfreeze_queue(struct request_queue *q)
				164	{
				165	bool wake = false;
				166
				167	spin_lock_irq(q->queue_lock);
				168	if (!--q->bypass_depth) {
				169	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				170	wake = true;
				171	}
				172	WARN_ON_ONCE(q->bypass_depth < 0);
				173	spin_unlock_irq(q->queue_lock);
				174	if (wake)
				175	wake_up_all(&q->mq_freeze_wq);
				176	}
				177
				178	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				179	{
				180	return blk_mq_has_free_tags(hctx->tags);
				181	}
				182	EXPORT_SYMBOL(blk_mq_can_queue);
				183
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	184	static void blk_mq_rq_ctx_init(struct request_queue q, struct blk_mq_ctx ctx,
				185	struct request *rq, unsigned int rw_flags)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	186	{
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	187	if (blk_queue_io_stat(q))
				188	rw_flags \|= REQ_IO_STAT;
				189
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	190	rq->mq_ctx = ctx;
				191	rq->cmd_flags = rw_flags;
Ming Lei	0fec08b	2014-01-03 10:00:08 -0700	[diff] [blame]	192	rq->start_time = jiffies;
				193	set_start_time_ns(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	194	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				195	}
				196
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	197	static struct request blk_mq_alloc_request_pinned(struct request_queue q,
				198	int rw, gfp_t gfp,
				199	bool reserved)
				200	{
				201	struct request *rq;
				202
				203	do {
				204	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				205	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				206
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	207	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	208	if (rq) {
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	209	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	210	break;
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	211	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	212
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	213	if (gfp & __GFP_WAIT) {
				214	__blk_mq_run_hw_queue(hctx);
				215	blk_mq_put_ctx(ctx);
				216	} else {
				217	blk_mq_put_ctx(ctx);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	218	break;
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	219	}
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	220
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	221	blk_mq_wait_for_tags(hctx->tags);
				222	} while (1);
				223
				224	return rq;
				225	}
				226
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	227	struct request blk_mq_alloc_request(struct request_queue q, int rw, gfp_t gfp)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	228	{
				229	struct request *rq;
				230
				231	if (blk_mq_queue_enter(q))
				232	return NULL;
				233
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	234	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	235	if (rq)
				236	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	237	return rq;
				238	}
				239
				240	struct request blk_mq_alloc_reserved_request(struct request_queue q, int rw,
				241	gfp_t gfp)
				242	{
				243	struct request *rq;
				244
				245	if (blk_mq_queue_enter(q))
				246	return NULL;
				247
				248	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	249	if (rq)
				250	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	251	return rq;
				252	}
				253	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				254
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	255	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				256	struct blk_mq_ctx ctx, struct request rq)
				257	{
				258	const int tag = rq->tag;
				259	struct request_queue *q = rq->q;
				260
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	261	blk_mq_put_tag(hctx->tags, tag);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	262	blk_mq_queue_exit(q);
				263	}
				264
				265	void blk_mq_free_request(struct request *rq)
				266	{
				267	struct blk_mq_ctx *ctx = rq->mq_ctx;
				268	struct blk_mq_hw_ctx *hctx;
				269	struct request_queue *q = rq->q;
				270
				271	ctx->rq_completed[rq_is_sync(rq)]++;
				272
				273	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				274	__blk_mq_free_request(hctx, ctx, rq);
				275	}
				276
Christoph Hellwig	8727af4	2014-04-14 10:30:08 +0200	[diff] [blame]	277	/*
				278	* Clone all relevant state from a request that has been put on hold in
				279	* the flush state machine into the preallocated flush request that hangs
				280	* off the request queue.
				281	*
				282	* For a driver the flush request should be invisible, that's why we are
				283	* impersonating the original request here.
				284	*/
				285	void blk_mq_clone_flush_request(struct request *flush_rq,
				286	struct request *orig_rq)
				287	{
				288	struct blk_mq_hw_ctx *hctx =
				289	orig_rq->q->mq_ops->map_queue(orig_rq->q, orig_rq->mq_ctx->cpu);
				290
				291	flush_rq->mq_ctx = orig_rq->mq_ctx;
				292	flush_rq->tag = orig_rq->tag;
				293	memcpy(blk_mq_rq_to_pdu(flush_rq), blk_mq_rq_to_pdu(orig_rq),
				294	hctx->cmd_size);
				295	}
				296
Christoph Hellwig	63151a4	2014-04-16 09:44:52 +0200	[diff] [blame^]	297	inline void __blk_mq_end_io(struct request *rq, int error)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	298	{
Ming Lei	0d11e6a	2013-12-05 10:50:39 -0700	[diff] [blame]	299	blk_account_io_done(rq);
				300
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	301	if (rq->end_io)
				302	rq->end_io(rq, error);
				303	else
				304	blk_mq_free_request(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	305	}
Christoph Hellwig	63151a4	2014-04-16 09:44:52 +0200	[diff] [blame^]	306	EXPORT_SYMBOL(__blk_mq_end_io);
				307
				308	void blk_mq_end_io(struct request *rq, int error)
				309	{
				310	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
				311	BUG();
				312	__blk_mq_end_io(rq, error);
				313	}
				314	EXPORT_SYMBOL(blk_mq_end_io);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	315
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	316	static void __blk_mq_complete_request_remote(void *data)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	317	{
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	318	struct request *rq = data;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	319
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	320	rq->q->softirq_done_fn(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	321	}
				322
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	323	void __blk_mq_complete_request(struct request *rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	324	{
				325	struct blk_mq_ctx *ctx = rq->mq_ctx;
				326	int cpu;
				327
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	328	if (!ctx->ipi_redirect) {
				329	rq->q->softirq_done_fn(rq);
				330	return;
				331	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	332
				333	cpu = get_cpu();
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	334	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	335	rq->csd.func = __blk_mq_complete_request_remote;
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	336	rq->csd.info = rq;
				337	rq->csd.flags = 0;
Frederic Weisbecker	c46fff2	2014-02-24 16:40:02 +0100	[diff] [blame]	338	smp_call_function_single_async(ctx->cpu, &rq->csd);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	339	} else {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	340	rq->q->softirq_done_fn(rq);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	341	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	342	put_cpu();
				343	}
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	344
				345	/**
				346	* blk_mq_complete_request - end I/O on a request
				347	* @rq: the request being processed
				348	*
				349	* Description:
				350	* Ends all I/O on a request. It does not handle partial completions.
				351	* The actual completion happens out-of-order, through a IPI handler.
				352	**/
				353	void blk_mq_complete_request(struct request *rq)
				354	{
				355	if (unlikely(blk_should_fake_timeout(rq->q)))
				356	return;
				357	if (!blk_mark_rq_complete(rq))
				358	__blk_mq_complete_request(rq);
				359	}
				360	EXPORT_SYMBOL(blk_mq_complete_request);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	361
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	362	static void blk_mq_start_request(struct request *rq, bool last)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	363	{
				364	struct request_queue *q = rq->q;
				365
				366	trace_block_rq_issue(q, rq);
				367
Christoph Hellwig	742ee69	2014-04-14 10:30:06 +0200	[diff] [blame]	368	rq->resid_len = blk_rq_bytes(rq);
				369
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	370	/*
				371	* Just mark start time and set the started bit. Due to memory
				372	* ordering, we know we'll see the correct deadline as long as
				373	* REQ_ATOMIC_STARTED is seen.
				374	*/
				375	rq->deadline = jiffies + q->rq_timeout;
				376	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	377
				378	if (q->dma_drain_size && blk_rq_bytes(rq)) {
				379	/*
				380	* Make sure space for the drain appears. We know we can do
				381	* this because max_hw_segments has been adjusted to be one
				382	* fewer than the device can handle.
				383	*/
				384	rq->nr_phys_segments++;
				385	}
				386
				387	/*
				388	* Flag the last request in the series so that drivers know when IO
				389	* should be kicked off, if they don't do it on a per-request basis.
				390	*
				391	* Note: the flag isn't the only condition drivers should do kick off.
				392	* If drive is busy, the last request might not have the bit set.
				393	*/
				394	if (last)
				395	rq->cmd_flags \|= REQ_END;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	396	}
				397
				398	static void blk_mq_requeue_request(struct request *rq)
				399	{
				400	struct request_queue *q = rq->q;
				401
				402	trace_block_rq_requeue(q, rq);
				403	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	404
				405	rq->cmd_flags &= ~REQ_END;
				406
				407	if (q->dma_drain_size && blk_rq_bytes(rq))
				408	rq->nr_phys_segments--;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	409	}
				410
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	411	struct request blk_mq_tag_to_rq(struct blk_mq_tags tags, unsigned int tag)
				412	{
				413	return tags->rqs[tag];
				414	}
				415	EXPORT_SYMBOL(blk_mq_tag_to_rq);
				416
/*
 * Cookie handed to blk_mq_timeout_check() through the busy-tag iterator.
 */
struct blk_mq_timeout_data {
	/* hardware queue whose requests are being scanned */
	struct blk_mq_hw_ctx *hctx;
	/* forwarded to blk_rq_check_expired() as its "next" argument */
	unsigned long *next;
	/* forwarded to blk_rq_check_expired() as its "next_set" argument */
	unsigned int *next_set;
};
				422
				423	static void blk_mq_timeout_check(void __data, unsigned long free_tags)
				424	{
				425	struct blk_mq_timeout_data *data = __data;
				426	struct blk_mq_hw_ctx *hctx = data->hctx;
				427	unsigned int tag;
				428
				429	/* It may not be in flight yet (this is where
				430	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				431	* statically allocated, so we know it's always safe to access the
				432	* memory associated with a bit offset into ->rqs[].
				433	*/
				434	tag = 0;
				435	do {
				436	struct request *rq;
				437
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	438	tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
				439	if (tag >= hctx->tags->nr_tags)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	440	break;
				441
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	442	rq = blk_mq_tag_to_rq(hctx->tags, tag++);
				443	if (rq->q != hctx->queue)
				444	continue;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	445	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				446	continue;
				447
				448	blk_rq_check_expired(rq, data->next, data->next_set);
				449	} while (1);
				450	}
				451
				452	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				453	unsigned long *next,
				454	unsigned int *next_set)
				455	{
				456	struct blk_mq_timeout_data data = {
				457	.hctx = hctx,
				458	.next = next,
				459	.next_set = next_set,
				460	};
				461
				462	/*
				463	* Ask the tagging code to iterate busy requests, so we can
				464	* check them for timeout.
				465	*/
				466	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				467	}
				468
				469	static void blk_mq_rq_timer(unsigned long data)
				470	{
				471	struct request_queue q = (struct request_queue ) data;
				472	struct blk_mq_hw_ctx *hctx;
				473	unsigned long next = 0;
				474	int i, next_set = 0;
				475
				476	queue_for_each_hw_ctx(q, hctx, i)
				477	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				478
				479	if (next_set)
				480	mod_timer(&q->timeout, round_jiffies_up(next));
				481	}
				482
				483	/*
				484	* Reverse check our software queue for entries that we could potentially
				485	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				486	* too much time checking for merges.
				487	*/
				488	static bool blk_mq_attempt_merge(struct request_queue *q,
				489	struct blk_mq_ctx ctx, struct bio bio)
				490	{
				491	struct request *rq;
				492	int checked = 8;
				493
				494	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				495	int el_ret;
				496
				497	if (!checked--)
				498	break;
				499
				500	if (!blk_rq_merge_ok(rq, bio))
				501	continue;
				502
				503	el_ret = blk_try_merge(rq, bio);
				504	if (el_ret == ELEVATOR_BACK_MERGE) {
				505	if (bio_attempt_back_merge(q, rq, bio)) {
				506	ctx->rq_merged++;
				507	return true;
				508	}
				509	break;
				510	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				511	if (bio_attempt_front_merge(q, rq, bio)) {
				512	ctx->rq_merged++;
				513	return true;
				514	}
				515	break;
				516	}
				517	}
				518
				519	return false;
				520	}
				521
				522	void blk_mq_add_timer(struct request *rq)
				523	{
				524	__blk_add_timer(rq, NULL);
				525	}
				526
				527	/*
				528	* Run this hardware queue, pulling any software queues mapped to it in.
				529	* Note that this function currently has various problems around ordering
				530	* of IO. In particular, we'd like FIFO behaviour on handling existing
				531	* items on the hctx->dispatch list. Ignore that for now.
				532	*/
				533	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				534	{
				535	struct request_queue *q = hctx->queue;
				536	struct blk_mq_ctx *ctx;
				537	struct request *rq;
				538	LIST_HEAD(rq_list);
				539	int bit, queued;
				540
Jens Axboe	fd1270d	2014-04-16 09:23:48 -0600	[diff] [blame]	541	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	542
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	543	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	544	return;
				545
				546	hctx->run++;
				547
				548	/*
				549	* Touch any software queue that has pending entries.
				550	*/
				551	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				552	clear_bit(bit, hctx->ctx_map);
				553	ctx = hctx->ctxs[bit];
				554	BUG_ON(bit != ctx->index_hw);
				555
				556	spin_lock(&ctx->lock);
				557	list_splice_tail_init(&ctx->rq_list, &rq_list);
				558	spin_unlock(&ctx->lock);
				559	}
				560
				561	/*
				562	* If we have previous entries on our dispatch list, grab them
				563	* and stuff them at the front for more fair dispatch.
				564	*/
				565	if (!list_empty_careful(&hctx->dispatch)) {
				566	spin_lock(&hctx->lock);
				567	if (!list_empty(&hctx->dispatch))
				568	list_splice_init(&hctx->dispatch, &rq_list);
				569	spin_unlock(&hctx->lock);
				570	}
				571
				572	/*
				573	* Delete and return all entries from our dispatch list
				574	*/
				575	queued = 0;
				576
				577	/*
				578	* Now process all the entries, sending them to the driver.
				579	*/
				580	while (!list_empty(&rq_list)) {
				581	int ret;
				582
				583	rq = list_first_entry(&rq_list, struct request, queuelist);
				584	list_del_init(&rq->queuelist);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	585
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	586	blk_mq_start_request(rq, list_empty(&rq_list));
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	587
				588	ret = q->mq_ops->queue_rq(hctx, rq);
				589	switch (ret) {
				590	case BLK_MQ_RQ_QUEUE_OK:
				591	queued++;
				592	continue;
				593	case BLK_MQ_RQ_QUEUE_BUSY:
				594	/*
				595	* FIXME: we should have a mechanism to stop the queue
				596	* like blk_stop_queue, otherwise we will waste cpu
				597	* time
				598	*/
				599	list_add(&rq->queuelist, &rq_list);
				600	blk_mq_requeue_request(rq);
				601	break;
				602	default:
				603	pr_err("blk-mq: bad return on queue: %d\n", ret);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	604	case BLK_MQ_RQ_QUEUE_ERROR:
Christoph Hellwig	1e93b8c	2014-02-11 08:27:13 -0800	[diff] [blame]	605	rq->errors = -EIO;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	606	blk_mq_end_io(rq, rq->errors);
				607	break;
				608	}
				609
				610	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				611	break;
				612	}
				613
				614	if (!queued)
				615	hctx->dispatched[0]++;
				616	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				617	hctx->dispatched[ilog2(queued) + 1]++;
				618
				619	/*
				620	* Any items that need requeuing? Stuff them into hctx->dispatch,
				621	* that is where we will continue on next queue run.
				622	*/
				623	if (!list_empty(&rq_list)) {
				624	spin_lock(&hctx->lock);
				625	list_splice(&rq_list, &hctx->dispatch);
				626	spin_unlock(&hctx->lock);
				627	}
				628	}
				629
				630	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				631	{
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	632	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	633	return;
				634
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	635	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	636	__blk_mq_run_hw_queue(hctx);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	637	else if (hctx->queue->nr_hw_queues == 1)
Jens Axboe	59c3d45	2014-04-08 09:15:35 -0600	[diff] [blame]	638	kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	639	else {
				640	unsigned int cpu;
				641
				642	/*
				643	* It'd be great if the workqueue API had a way to pass
				644	* in a mask and had some smarts for more clever placement
				645	* than the first CPU. Or we could round-robin here. For now,
				646	* just queue on the first CPU.
				647	*/
				648	cpu = cpumask_first(hctx->cpumask);
				649	kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
				650	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	651	}
				652
				653	void blk_mq_run_queues(struct request_queue *q, bool async)
				654	{
				655	struct blk_mq_hw_ctx *hctx;
				656	int i;
				657
				658	queue_for_each_hw_ctx(q, hctx, i) {
				659	if ((!blk_mq_hctx_has_pending(hctx) &&
				660	list_empty_careful(&hctx->dispatch)) \|\|
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	661	test_bit(BLK_MQ_S_STOPPED, &hctx->state))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	662	continue;
				663
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	664	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	665	blk_mq_run_hw_queue(hctx, async);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	666	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	667	}
				668	}
				669	EXPORT_SYMBOL(blk_mq_run_queues);
				670
				671	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				672	{
				673	cancel_delayed_work(&hctx->delayed_work);
				674	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				675	}
				676	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				677
Christoph Hellwig	280d45f	2013-10-25 14:45:58 +0100	[diff] [blame]	678	void blk_mq_stop_hw_queues(struct request_queue *q)
				679	{
				680	struct blk_mq_hw_ctx *hctx;
				681	int i;
				682
				683	queue_for_each_hw_ctx(q, hctx, i)
				684	blk_mq_stop_hw_queue(hctx);
				685	}
				686	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				687
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	688	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				689	{
				690	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	691
				692	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	693	__blk_mq_run_hw_queue(hctx);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	694	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	695	}
				696	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				697
				698	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				699	{
				700	struct blk_mq_hw_ctx *hctx;
				701	int i;
				702
				703	queue_for_each_hw_ctx(q, hctx, i) {
				704	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				705	continue;
				706
				707	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	708	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	709	blk_mq_run_hw_queue(hctx, true);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	710	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	711	}
				712	}
				713	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				714
				715	static void blk_mq_work_fn(struct work_struct *work)
				716	{
				717	struct blk_mq_hw_ctx *hctx;
				718
				719	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	720
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	721	__blk_mq_run_hw_queue(hctx);
				722	}
				723
				724	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	725	struct request *rq, bool at_head)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	726	{
				727	struct blk_mq_ctx *ctx = rq->mq_ctx;
				728
Jens Axboe	01b983c	2013-11-19 18:59:10 -0700	[diff] [blame]	729	trace_block_rq_insert(hctx->queue, rq);
				730
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	731	if (at_head)
				732	list_add(&rq->queuelist, &ctx->rq_list);
				733	else
				734	list_add_tail(&rq->queuelist, &ctx->rq_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	735	blk_mq_hctx_mark_pending(hctx, ctx);
				736
				737	/*
				738	* We do this early, to ensure we are on the right CPU.
				739	*/
				740	blk_mq_add_timer(rq);
				741	}
				742
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	743	void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
				744	bool async)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	745	{
				746	struct request_queue *q = rq->q;
				747	struct blk_mq_hw_ctx *hctx;
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	748	struct blk_mq_ctx ctx = rq->mq_ctx, current_ctx;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	749
				750	current_ctx = blk_mq_get_ctx(q);
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	751	if (!cpu_online(ctx->cpu))
				752	rq->mq_ctx = ctx = current_ctx;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	753
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	754	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				755
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	756	if (rq->cmd_flags & (REQ_FLUSH \| REQ_FUA) &&
				757	!(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
				758	blk_insert_flush(rq);
				759	} else {
				760	spin_lock(&ctx->lock);
				761	__blk_mq_insert_request(hctx, rq, at_head);
				762	spin_unlock(&ctx->lock);
				763	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	764
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	765	if (run_queue)
				766	blk_mq_run_hw_queue(hctx, async);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	767
				768	blk_mq_put_ctx(current_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	769	}
				770
				771	static void blk_mq_insert_requests(struct request_queue *q,
				772	struct blk_mq_ctx *ctx,
				773	struct list_head *list,
				774	int depth,
				775	bool from_schedule)
				776
				777	{
				778	struct blk_mq_hw_ctx *hctx;
				779	struct blk_mq_ctx *current_ctx;
				780
				781	trace_block_unplug(q, depth, !from_schedule);
				782
				783	current_ctx = blk_mq_get_ctx(q);
				784
				785	if (!cpu_online(ctx->cpu))
				786	ctx = current_ctx;
				787	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				788
				789	/*
				790	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				791	* offline now
				792	*/
				793	spin_lock(&ctx->lock);
				794	while (!list_empty(list)) {
				795	struct request *rq;
				796
				797	rq = list_first_entry(list, struct request, queuelist);
				798	list_del_init(&rq->queuelist);
				799	rq->mq_ctx = ctx;
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	800	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	801	}
				802	spin_unlock(&ctx->lock);
				803
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	804	blk_mq_run_hw_queue(hctx, from_schedule);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	805	blk_mq_put_ctx(current_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	806	}
				807
				808	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				809	{
				810	struct request *rqa = container_of(a, struct request, queuelist);
				811	struct request *rqb = container_of(b, struct request, queuelist);
				812
				813	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				814	(rqa->mq_ctx == rqb->mq_ctx &&
				815	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				816	}
				817
				818	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				819	{
				820	struct blk_mq_ctx *this_ctx;
				821	struct request_queue *this_q;
				822	struct request *rq;
				823	LIST_HEAD(list);
				824	LIST_HEAD(ctx_list);
				825	unsigned int depth;
				826
				827	list_splice_init(&plug->mq_list, &list);
				828
				829	list_sort(NULL, &list, plug_ctx_cmp);
				830
				831	this_q = NULL;
				832	this_ctx = NULL;
				833	depth = 0;
				834
				835	while (!list_empty(&list)) {
				836	rq = list_entry_rq(list.next);
				837	list_del_init(&rq->queuelist);
				838	BUG_ON(!rq->q);
				839	if (rq->mq_ctx != this_ctx) {
				840	if (this_ctx) {
				841	blk_mq_insert_requests(this_q, this_ctx,
				842	&ctx_list, depth,
				843	from_schedule);
				844	}
				845
				846	this_ctx = rq->mq_ctx;
				847	this_q = rq->q;
				848	depth = 0;
				849	}
				850
				851	depth++;
				852	list_add_tail(&rq->queuelist, &ctx_list);
				853	}
				854
				855	/*
				856	* If 'this_ctx' is set, we know we have entries to complete
				857	* on 'ctx_list'. Do those.
				858	*/
				859	if (this_ctx) {
				860	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				861	from_schedule);
				862	}
				863	}
				864
				865	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				866	{
				867	init_request_from_bio(rq, bio);
				868	blk_account_io_start(rq, 1);
				869	}
				870
				871	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				872	{
				873	struct blk_mq_hw_ctx *hctx;
				874	struct blk_mq_ctx *ctx;
				875	const int is_sync = rw_is_sync(bio->bi_rw);
				876	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				877	int rw = bio_data_dir(bio);
				878	struct request *rq;
				879	unsigned int use_plug, request_count = 0;
				880
				881	/*
				882	* If we have multiple hardware queues, just go directly to
				883	* one of those for sync IO.
				884	*/
				885	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				886
				887	blk_queue_bounce(q, &bio);
				888
Nicholas Bellinger	14ec77f	2014-02-07 13:45:39 -0700	[diff] [blame]	889	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
				890	bio_endio(bio, -EIO);
				891	return;
				892	}
				893
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	894	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				895	return;
				896
				897	if (blk_mq_queue_enter(q)) {
				898	bio_endio(bio, -EIO);
				899	return;
				900	}
				901
				902	ctx = blk_mq_get_ctx(q);
				903	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				904
Shaohua Li	27fbf4e8	2014-02-19 20:20:21 +0800	[diff] [blame]	905	if (is_sync)
				906	rw \|= REQ_SYNC;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	907	trace_block_getrq(q, bio, rw);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	908	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	909	if (likely(rq))
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	910	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	911	else {
				912	blk_mq_put_ctx(ctx);
				913	trace_block_sleeprq(q, bio, rw);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	914	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				915	false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	916	ctx = rq->mq_ctx;
				917	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				918	}
				919
				920	hctx->queued++;
				921
				922	if (unlikely(is_flush_fua)) {
				923	blk_mq_bio_to_request(rq, bio);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	924	blk_insert_flush(rq);
				925	goto run_queue;
				926	}
				927
				928	/*
				929	* A task plug currently exists. Since this is completely lockless,
				930	* utilize that to temporarily store requests until the task is
				931	* either done or scheduled away.
				932	*/
				933	if (use_plug) {
				934	struct blk_plug *plug = current->plug;
				935
				936	if (plug) {
				937	blk_mq_bio_to_request(rq, bio);
Shaohua Li	92f399c	2013-10-29 12:01:03 -0600	[diff] [blame]	938	if (list_empty(&plug->mq_list))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	939	trace_block_plug(q);
				940	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				941	blk_flush_plug_list(plug, false);
				942	trace_block_plug(q);
				943	}
				944	list_add_tail(&rq->queuelist, &plug->mq_list);
				945	blk_mq_put_ctx(ctx);
				946	return;
				947	}
				948	}
				949
				950	spin_lock(&ctx->lock);
				951
				952	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				953	blk_mq_attempt_merge(q, ctx, bio))
				954	__blk_mq_free_request(hctx, ctx, rq);
				955	else {
				956	blk_mq_bio_to_request(rq, bio);
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	957	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	958	}
				959
				960	spin_unlock(&ctx->lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	961
				962	/*
				963	* For a SYNC request, send it to the hardware immediately. For an
				964	* ASYNC request, just ensure that we run it later on. The latter
				965	* allows for merging opportunities and more efficient dispatching.
				966	*/
				967	run_queue:
				968	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	969	blk_mq_put_ctx(ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	970	}
				971
				972	/*
				973	* Default mapping to a software queue, since we use one per CPU.
				974	*/
				975	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				976	{
				977	return q->queue_hw_ctx[q->mq_map[cpu]];
				978	}
				979	EXPORT_SYMBOL(blk_mq_map_queue);
				980
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	981	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set set,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	982	unsigned int hctx_index)
				983	{
				984	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	985	GFP_KERNEL \| __GFP_ZERO, set->numa_node);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	986	}
				987	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				988
				989	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				990	unsigned int hctx_index)
				991	{
				992	kfree(hctx);
				993	}
				994	EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				995
				996	static void blk_mq_hctx_notify(void *data, unsigned long action,
				997	unsigned int cpu)
				998	{
				999	struct blk_mq_hw_ctx *hctx = data;
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1000	struct request_queue *q = hctx->queue;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1001	struct blk_mq_ctx *ctx;
				1002	LIST_HEAD(tmp);
				1003
				1004	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				1005	return;
				1006
				1007	/*
				1008	* Move ctx entries to new CPU, if this one is going away.
				1009	*/
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1010	ctx = __blk_mq_get_ctx(q, cpu);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1011
				1012	spin_lock(&ctx->lock);
				1013	if (!list_empty(&ctx->rq_list)) {
				1014	list_splice_init(&ctx->rq_list, &tmp);
				1015	clear_bit(ctx->index_hw, hctx->ctx_map);
				1016	}
				1017	spin_unlock(&ctx->lock);
				1018
				1019	if (list_empty(&tmp))
				1020	return;
				1021
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1022	ctx = blk_mq_get_ctx(q);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1023	spin_lock(&ctx->lock);
				1024
				1025	while (!list_empty(&tmp)) {
				1026	struct request *rq;
				1027
				1028	rq = list_first_entry(&tmp, struct request, queuelist);
				1029	rq->mq_ctx = ctx;
				1030	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1031	}
				1032
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1033	hctx = q->mq_ops->map_queue(q, ctx->cpu);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1034	blk_mq_hctx_mark_pending(hctx, ctx);
				1035
				1036	spin_unlock(&ctx->lock);
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1037
				1038	blk_mq_run_hw_queue(hctx, true);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1039	blk_mq_put_ctx(ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1040	}
				1041
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1042	static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
				1043	struct blk_mq_tags *tags, unsigned int hctx_idx)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1044	{
				1045	struct page *page;
				1046
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1047	if (tags->rqs && set->ops->exit_request) {
Christoph Hellwig	e9b267d	2014-04-15 13:59:10 -0600	[diff] [blame]	1048	int i;
				1049
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1050	for (i = 0; i < tags->nr_tags; i++) {
				1051	if (!tags->rqs[i])
Christoph Hellwig	e9b267d	2014-04-15 13:59:10 -0600	[diff] [blame]	1052	continue;
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1053	set->ops->exit_request(set->driver_data, tags->rqs[i],
				1054	hctx_idx, i);
Christoph Hellwig	e9b267d	2014-04-15 13:59:10 -0600	[diff] [blame]	1055	}
				1056	}
				1057
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1058	while (!list_empty(&tags->page_list)) {
				1059	page = list_first_entry(&tags->page_list, struct page, lru);
Dave Hansen	6753471	2014-01-08 20:17:46 -0700	[diff] [blame]	1060	list_del_init(&page->lru);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1061	__free_pages(page, page->private);
				1062	}
				1063
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1064	kfree(tags->rqs);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1065
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1066	blk_mq_free_tags(tags);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1067	}
				1068
				1069	static size_t order_to_size(unsigned int order)
				1070	{
				1071	size_t ret = PAGE_SIZE;
				1072
				1073	while (order--)
				1074	ret *= 2;
				1075
				1076	return ret;
				1077	}
				1078
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1079	static struct blk_mq_tags blk_mq_init_rq_map(struct blk_mq_tag_set set,
				1080	unsigned int hctx_idx)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1081	{
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1082	struct blk_mq_tags *tags;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1083	unsigned int i, j, entries_per_page, max_order = 4;
				1084	size_t rq_size, left;
				1085
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1086	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
				1087	set->numa_node);
				1088	if (!tags)
				1089	return NULL;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1090
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1091	INIT_LIST_HEAD(&tags->page_list);
				1092
				1093	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
				1094	GFP_KERNEL, set->numa_node);
				1095	if (!tags->rqs) {
				1096	blk_mq_free_tags(tags);
				1097	return NULL;
				1098	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1099
				1100	/*
				1101	* rq_size is the size of the request plus driver payload, rounded
				1102	* to the cacheline size
				1103	*/
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1104	rq_size = round_up(sizeof(struct request) + set->cmd_size,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1105	cache_line_size());
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1106	left = rq_size * set->queue_depth;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1107
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1108	for (i = 0; i < set->queue_depth; ) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1109	int this_order = max_order;
				1110	struct page *page;
				1111	int to_do;
				1112	void *p;
				1113
				1114	while (left < order_to_size(this_order - 1) && this_order)
				1115	this_order--;
				1116
				1117	do {
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1118	page = alloc_pages_node(set->numa_node, GFP_KERNEL,
				1119	this_order);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1120	if (page)
				1121	break;
				1122	if (!this_order--)
				1123	break;
				1124	if (order_to_size(this_order) < rq_size)
				1125	break;
				1126	} while (1);
				1127
				1128	if (!page)
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1129	goto fail;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1130
				1131	page->private = this_order;
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1132	list_add_tail(&page->lru, &tags->page_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1133
				1134	p = page_address(page);
				1135	entries_per_page = order_to_size(this_order) / rq_size;
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1136	to_do = min(entries_per_page, set->queue_depth - i);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1137	left -= to_do * rq_size;
				1138	for (j = 0; j < to_do; j++) {
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1139	tags->rqs[i] = p;
				1140	if (set->ops->init_request) {
				1141	if (set->ops->init_request(set->driver_data,
				1142	tags->rqs[i], hctx_idx, i,
				1143	set->numa_node))
				1144	goto fail;
Christoph Hellwig	e9b267d	2014-04-15 13:59:10 -0600	[diff] [blame]	1145	}
				1146
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1147	p += rq_size;
				1148	i++;
				1149	}
				1150	}
				1151
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1152	return tags;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1153
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1154	fail:
				1155	pr_warn("%s: failed to allocate requests\n", __func__);
				1156	blk_mq_free_rq_map(set, tags, hctx_idx);
				1157	return NULL;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1158	}
				1159
				1160	static int blk_mq_init_hw_queues(struct request_queue *q,
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1161	struct blk_mq_tag_set *set)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1162	{
				1163	struct blk_mq_hw_ctx *hctx;
				1164	unsigned int i, j;
				1165
				1166	/*
				1167	* Initialize hardware queues
				1168	*/
				1169	queue_for_each_hw_ctx(q, hctx, i) {
				1170	unsigned int num_maps;
				1171	int node;
				1172
				1173	node = hctx->numa_node;
				1174	if (node == NUMA_NO_NODE)
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1175	node = hctx->numa_node = set->numa_node;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1176
				1177	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1178	spin_lock_init(&hctx->lock);
				1179	INIT_LIST_HEAD(&hctx->dispatch);
				1180	hctx->queue = q;
				1181	hctx->queue_num = i;
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1182	hctx->flags = set->flags;
				1183	hctx->cmd_size = set->cmd_size;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1184
				1185	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1186	blk_mq_hctx_notify, hctx);
				1187	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1188
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1189	hctx->tags = set->tags[i];
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1190
				1191	/*
				1192	* Allocate space for all possible cpus to avoid allocation in
				1193	* runtime
				1194	*/
				1195	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1196	GFP_KERNEL, node);
				1197	if (!hctx->ctxs)
				1198	break;
				1199
				1200	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1201	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1202	GFP_KERNEL, node);
				1203	if (!hctx->ctx_map)
				1204	break;
				1205
				1206	hctx->nr_ctx_map = num_maps;
				1207	hctx->nr_ctx = 0;
				1208
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1209	if (set->ops->init_hctx &&
				1210	set->ops->init_hctx(hctx, set->driver_data, i))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1211	break;
				1212	}
				1213
				1214	if (i == q->nr_hw_queues)
				1215	return 0;
				1216
				1217	/*
				1218	* Init failed
				1219	*/
				1220	queue_for_each_hw_ctx(q, hctx, j) {
				1221	if (i == j)
				1222	break;
				1223
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1224	if (set->ops->exit_hctx)
				1225	set->ops->exit_hctx(hctx, j);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1226
				1227	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1228	kfree(hctx->ctxs);
				1229	}
				1230
				1231	return 1;
				1232	}
				1233
				1234	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1235	unsigned int nr_hw_queues)
				1236	{
				1237	unsigned int i;
				1238
				1239	for_each_possible_cpu(i) {
				1240	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1241	struct blk_mq_hw_ctx *hctx;
				1242
				1243	memset(__ctx, 0, sizeof(*__ctx));
				1244	__ctx->cpu = i;
				1245	spin_lock_init(&__ctx->lock);
				1246	INIT_LIST_HEAD(&__ctx->rq_list);
				1247	__ctx->queue = q;
				1248
				1249	/* If the cpu isn't online, the cpu is mapped to first hctx */
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1250	if (!cpu_online(i))
				1251	continue;
				1252
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1253	hctx = q->mq_ops->map_queue(q, i);
				1254	cpumask_set_cpu(i, hctx->cpumask);
				1255	hctx->nr_ctx++;
				1256
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1257	/*
				1258	* Set local node, IFF we have more than one hw queue. If
				1259	* not, we remain on the home node of the device
				1260	*/
				1261	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1262	hctx->numa_node = cpu_to_node(i);
				1263	}
				1264	}
				1265
				1266	static void blk_mq_map_swqueue(struct request_queue *q)
				1267	{
				1268	unsigned int i;
				1269	struct blk_mq_hw_ctx *hctx;
				1270	struct blk_mq_ctx *ctx;
				1271
				1272	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1273	cpumask_clear(hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1274	hctx->nr_ctx = 0;
				1275	}
				1276
				1277	/*
				1278	* Map software to hardware queues
				1279	*/
				1280	queue_for_each_ctx(q, ctx, i) {
				1281	/* If the cpu isn't online, the cpu is mapped to first hctx */
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1282	if (!cpu_online(i))
				1283	continue;
				1284
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1285	hctx = q->mq_ops->map_queue(q, i);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1286	cpumask_set_cpu(i, hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1287	ctx->index_hw = hctx->nr_ctx;
				1288	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1289	}
				1290	}
				1291
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1292	struct request_queue blk_mq_init_queue(struct blk_mq_tag_set set)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1293	{
				1294	struct blk_mq_hw_ctx **hctxs;
				1295	struct blk_mq_ctx *ctx;
				1296	struct request_queue *q;
				1297	int i;
				1298
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1299	ctx = alloc_percpu(struct blk_mq_ctx);
				1300	if (!ctx)
				1301	return ERR_PTR(-ENOMEM);
				1302
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1303	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1304	set->numa_node);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1305
				1306	if (!hctxs)
				1307	goto err_percpu;
				1308
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1309	for (i = 0; i < set->nr_hw_queues; i++) {
				1310	hctxs[i] = set->ops->alloc_hctx(set, i);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1311	if (!hctxs[i])
				1312	goto err_hctxs;
				1313
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1314	if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
				1315	goto err_hctxs;
				1316
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1317	hctxs[i]->numa_node = NUMA_NO_NODE;
				1318	hctxs[i]->queue_num = i;
				1319	}
				1320
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1321	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1322	if (!q)
				1323	goto err_hctxs;
				1324
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1325	q->mq_map = blk_mq_make_queue_map(set);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1326	if (!q->mq_map)
				1327	goto err_map;
				1328
				1329	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1330	blk_queue_rq_timeout(q, 30000);
				1331
				1332	q->nr_queues = nr_cpu_ids;
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1333	q->nr_hw_queues = set->nr_hw_queues;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1334
				1335	q->queue_ctx = ctx;
				1336	q->queue_hw_ctx = hctxs;
				1337
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1338	q->mq_ops = set->ops;
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	1339	q->queue_flags \|= QUEUE_FLAG_MQ_DEFAULT;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1340
Christoph Hellwig	1be036e	2014-02-07 10:22:39 -0800	[diff] [blame]	1341	q->sg_reserved_size = INT_MAX;
				1342
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1343	blk_queue_make_request(q, blk_mq_make_request);
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1344	blk_queue_rq_timed_out(q, set->ops->timeout);
				1345	if (set->timeout)
				1346	blk_queue_rq_timeout(q, set->timeout);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1347
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1348	if (set->ops->complete)
				1349	blk_queue_softirq_done(q, set->ops->complete);
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	1350
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1351	blk_mq_init_flush(q);
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1352	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1353
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1354	q->flush_rq = kzalloc(round_up(sizeof(struct request) +
				1355	set->cmd_size, cache_line_size()),
				1356	GFP_KERNEL);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1357	if (!q->flush_rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1358	goto err_hw;
				1359
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1360	if (blk_mq_init_hw_queues(q, set))
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1361	goto err_flush_rq;
				1362
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1363	blk_mq_map_swqueue(q);
				1364
				1365	mutex_lock(&all_q_mutex);
				1366	list_add_tail(&q->all_q_node, &all_q_list);
				1367	mutex_unlock(&all_q_mutex);
				1368
				1369	return q;
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1370
				1371	err_flush_rq:
				1372	kfree(q->flush_rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1373	err_hw:
				1374	kfree(q->mq_map);
				1375	err_map:
				1376	blk_cleanup_queue(q);
				1377	err_hctxs:
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1378	for (i = 0; i < set->nr_hw_queues; i++) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1379	if (!hctxs[i])
				1380	break;
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1381	free_cpumask_var(hctxs[i]->cpumask);
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1382	set->ops->free_hctx(hctxs[i], i);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1383	}
				1384	kfree(hctxs);
				1385	err_percpu:
				1386	free_percpu(ctx);
				1387	return ERR_PTR(-ENOMEM);
				1388	}
				1389	EXPORT_SYMBOL(blk_mq_init_queue);
				1390
				1391	void blk_mq_free_queue(struct request_queue *q)
				1392	{
				1393	struct blk_mq_hw_ctx *hctx;
				1394	int i;
				1395
				1396	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1397	kfree(hctx->ctx_map);
				1398	kfree(hctx->ctxs);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1399	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1400	if (q->mq_ops->exit_hctx)
				1401	q->mq_ops->exit_hctx(hctx, i);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1402	free_cpumask_var(hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1403	q->mq_ops->free_hctx(hctx, i);
				1404	}
				1405
				1406	free_percpu(q->queue_ctx);
				1407	kfree(q->queue_hw_ctx);
				1408	kfree(q->mq_map);
				1409
				1410	q->queue_ctx = NULL;
				1411	q->queue_hw_ctx = NULL;
				1412	q->mq_map = NULL;
				1413
				1414	mutex_lock(&all_q_mutex);
				1415	list_del_init(&q->all_q_node);
				1416	mutex_unlock(&all_q_mutex);
				1417	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1418
				1419	/* Basically redo blk_mq_init_queue with queue frozen */
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1420	static void blk_mq_queue_reinit(struct request_queue *q)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1421	{
				1422	blk_mq_freeze_queue(q);
				1423
				1424	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1425
				1426	/*
				1427	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1428	* we should change hctx numa_node according to new topology (this
				1429	* involves free and re-allocate memory, worthy doing?)
				1430	*/
				1431
				1432	blk_mq_map_swqueue(q);
				1433
				1434	blk_mq_unfreeze_queue(q);
				1435	}
				1436
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1437	static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1438	unsigned long action, void *hcpu)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1439	{
				1440	struct request_queue *q;
				1441
				1442	/*
				1443	* Before new mapping is established, hotadded cpu might already start
				1444	* handling requests. This doesn't break anything as we map offline
				1445	* CPUs to first hardware queue. We will re-init queue below to get
				1446	* optimal settings.
				1447	*/
				1448	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1449	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1450	return NOTIFY_OK;
				1451
				1452	mutex_lock(&all_q_mutex);
				1453	list_for_each_entry(q, &all_q_list, all_q_node)
				1454	blk_mq_queue_reinit(q);
				1455	mutex_unlock(&all_q_mutex);
				1456	return NOTIFY_OK;
				1457	}
				1458
Christoph Hellwig	24d2f90	2014-04-15 14:14:00 -0600	[diff] [blame]	1459	int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
				1460	{
				1461	int i;
				1462
				1463	if (!set->nr_hw_queues)
				1464	return -EINVAL;
				1465	if (!set->queue_depth \|\| set->queue_depth > BLK_MQ_MAX_DEPTH)
				1466	return -EINVAL;
				1467	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
				1468	return -EINVAL;
				1469
				1470	if (!set->nr_hw_queues \|\|
				1471	!set->ops->queue_rq \|\| !set->ops->map_queue \|\|
				1472	!set->ops->alloc_hctx \|\| !set->ops->free_hctx)
				1473	return -EINVAL;
				1474
				1475
				1476	set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags),
				1477	GFP_KERNEL, set->numa_node);
				1478	if (!set->tags)
				1479	goto out;
				1480
				1481	for (i = 0; i < set->nr_hw_queues; i++) {
				1482	set->tags[i] = blk_mq_init_rq_map(set, i);
				1483	if (!set->tags[i])
				1484	goto out_unwind;
				1485	}
				1486
				1487	return 0;
				1488
				1489	out_unwind:
				1490	while (--i >= 0)
				1491	blk_mq_free_rq_map(set, set->tags[i], i);
				1492	out:
				1493	return -ENOMEM;
				1494	}
				1495	EXPORT_SYMBOL(blk_mq_alloc_tag_set);
				1496
				1497	void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
				1498	{
				1499	int i;
				1500
				1501	for (i = 0; i < set->nr_hw_queues; i++)
				1502	blk_mq_free_rq_map(set, set->tags[i], i);
				1503	}
				1504	EXPORT_SYMBOL(blk_mq_free_tag_set);
				1505
Jens Axboe	676141e	2014-03-20 13:29:18 -0600	[diff] [blame]	1506	void blk_mq_disable_hotplug(void)
				1507	{
				1508	mutex_lock(&all_q_mutex);
				1509	}
				1510
				1511	void blk_mq_enable_hotplug(void)
				1512	{
				1513	mutex_unlock(&all_q_mutex);
				1514	}
				1515
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1516	static int __init blk_mq_init(void)
				1517	{
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1518	blk_mq_cpu_init();
				1519
				1520	/* Must be called after percpu_counter_hotcpu_callback() */
				1521	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1522
				1523	return 0;
				1524	}
				1525	subsys_initcall(blk_mq_init);