Blame - block/blk-mq.c - SHIFTPHONES/mainline/linux

blob: 658428a28fafb47e3644c100f320addaf1d62e26 [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);
				26	static LIST_HEAD(all_q_list);
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
				29
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	30	static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
				31	unsigned int cpu)
				32	{
				33	return per_cpu_ptr(q->queue_ctx, cpu);
				34	}
				35
				36	/*
				37	* This assumes per-cpu software queueing queues. They could be per-node
				38	* as well, for instance. For now this is hardcoded as-is. Note that we don't
				39	* care about preemption, since we know the ctx's are persistent. This does
				40	* mean that we can't rely on ctx always matching the currently running CPU.
				41	*/
				42	static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
				43	{
				44	return __blk_mq_get_ctx(q, get_cpu());
				45	}
				46
				47	static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				48	{
				49	put_cpu();
				50	}
				51
				52	/*
				53	* Check if any of the ctx's have pending work in this hardware queue
				54	*/
				55	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				56	{
				57	unsigned int i;
				58
				59	for (i = 0; i < hctx->nr_ctx_map; i++)
				60	if (hctx->ctx_map[i])
				61	return true;
				62
				63	return false;
				64	}
				65
				66	/*
				67	* Mark this ctx as having pending work in this hardware queue
				68	*/
				69	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				70	struct blk_mq_ctx *ctx)
				71	{
				72	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				73	set_bit(ctx->index_hw, hctx->ctx_map);
				74	}
				75
Christoph Hellwig	081241e	2014-02-20 15:32:36 -0800	[diff] [blame]	76	static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
				77	gfp_t gfp, bool reserved)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	78	{
				79	struct request *rq;
				80	unsigned int tag;
				81
				82	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				83	if (tag != BLK_MQ_TAG_FAIL) {
				84	rq = hctx->rqs[tag];
				85	rq->tag = tag;
				86
				87	return rq;
				88	}
				89
				90	return NULL;
				91	}
				92
				93	static int blk_mq_queue_enter(struct request_queue *q)
				94	{
				95	int ret;
				96
				97	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				98	smp_wmb();
				99	/* we have problems to freeze the queue if it's initializing */
				100	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
				101	return 0;
				102
				103	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				104
				105	spin_lock_irq(q->queue_lock);
				106	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	107	!blk_queue_bypass(q) || blk_queue_dying(q),
				108	*q->queue_lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	109	/* inc usage with lock hold to avoid freeze_queue runs here */
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	110	if (!ret && !blk_queue_dying(q))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	111	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	112	else if (blk_queue_dying(q))
				113	ret = -ENODEV;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	114	spin_unlock_irq(q->queue_lock);
				115
				116	return ret;
				117	}
				118
				119	static void blk_mq_queue_exit(struct request_queue *q)
				120	{
				121	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				122	}
				123
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	124	static void __blk_mq_drain_queue(struct request_queue *q)
				125	{
				126	while (true) {
				127	s64 count;
				128
				129	spin_lock_irq(q->queue_lock);
				130	count = percpu_counter_sum(&q->mq_usage_counter);
				131	spin_unlock_irq(q->queue_lock);
				132
				133	if (count == 0)
				134	break;
				135	blk_mq_run_queues(q, false);
				136	msleep(10);
				137	}
				138	}
				139
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	140	/*
				141	* Guarantee no request is in use, so we can change any data structure of
				142	* the queue afterward.
				143	*/
				144	static void blk_mq_freeze_queue(struct request_queue *q)
				145	{
				146	bool drain;
				147
				148	spin_lock_irq(q->queue_lock);
				149	drain = !q->bypass_depth++;
				150	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				151	spin_unlock_irq(q->queue_lock);
				152
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	153	if (drain)
				154	__blk_mq_drain_queue(q);
				155	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	156
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	157	void blk_mq_drain_queue(struct request_queue *q)
				158	{
				159	__blk_mq_drain_queue(q);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	160	}
				161
				162	static void blk_mq_unfreeze_queue(struct request_queue *q)
				163	{
				164	bool wake = false;
				165
				166	spin_lock_irq(q->queue_lock);
				167	if (!--q->bypass_depth) {
				168	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				169	wake = true;
				170	}
				171	WARN_ON_ONCE(q->bypass_depth < 0);
				172	spin_unlock_irq(q->queue_lock);
				173	if (wake)
				174	wake_up_all(&q->mq_freeze_wq);
				175	}
				176
				177	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				178	{
				179	return blk_mq_has_free_tags(hctx->tags);
				180	}
				181	EXPORT_SYMBOL(blk_mq_can_queue);
				182
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	183	static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
				184	struct request *rq, unsigned int rw_flags)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	185	{
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	186	if (blk_queue_io_stat(q))
				187	rw_flags |= REQ_IO_STAT;
				188
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	189	rq->mq_ctx = ctx;
				190	rq->cmd_flags = rw_flags;
Ming Lei	0fec08b	2014-01-03 10:00:08 -0700	[diff] [blame]	191	rq->start_time = jiffies;
				192	set_start_time_ns(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	193	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				194	}
				195
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	196	static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
				197	int rw, gfp_t gfp,
				198	bool reserved)
				199	{
				200	struct request *rq;
				201
				202	do {
				203	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				204	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				205
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	206	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	207	if (rq) {
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	208	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	209	break;
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	210	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	211
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	212	if (gfp & __GFP_WAIT) {
				213	__blk_mq_run_hw_queue(hctx);
				214	blk_mq_put_ctx(ctx);
				215	} else {
				216	blk_mq_put_ctx(ctx);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	217	break;
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	218	}
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	219
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	220	blk_mq_wait_for_tags(hctx->tags);
				221	} while (1);
				222
				223	return rq;
				224	}
				225
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	226	struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	227	{
				228	struct request *rq;
				229
				230	if (blk_mq_queue_enter(q))
				231	return NULL;
				232
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	233	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	234	if (rq)
				235	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	236	return rq;
				237	}
				238
				239	struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
				240	gfp_t gfp)
				241	{
				242	struct request *rq;
				243
				244	if (blk_mq_queue_enter(q))
				245	return NULL;
				246
				247	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	248	if (rq)
				249	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	250	return rq;
				251	}
				252	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				253
				254	/*
				255	* Re-init and set pdu, if we have it
				256	*/
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	257	void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	258	{
				259	blk_rq_init(hctx->queue, rq);
				260
				261	if (hctx->cmd_size)
				262	rq->special = blk_mq_rq_to_pdu(rq);
				263	}
				264
				265	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				266	struct blk_mq_ctx *ctx, struct request *rq)
				267	{
				268	const int tag = rq->tag;
				269	struct request_queue *q = rq->q;
				270
				271	blk_mq_rq_init(hctx, rq);
				272	blk_mq_put_tag(hctx->tags, tag);
				273
				274	blk_mq_queue_exit(q);
				275	}
				276
				277	void blk_mq_free_request(struct request *rq)
				278	{
				279	struct blk_mq_ctx *ctx = rq->mq_ctx;
				280	struct blk_mq_hw_ctx *hctx;
				281	struct request_queue *q = rq->q;
				282
				283	ctx->rq_completed[rq_is_sync(rq)]++;
				284
				285	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				286	__blk_mq_free_request(hctx, ctx, rq);
				287	}
				288
Christoph Hellwig	7237c74	2014-02-20 15:32:38 -0800	[diff] [blame]	289	bool blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	290	{
Christoph Hellwig	7237c74	2014-02-20 15:32:38 -0800	[diff] [blame]	291	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
				292	return true;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	293
Ming Lei	0d11e6a	2013-12-05 10:50:39 -0700	[diff] [blame]	294	blk_account_io_done(rq);
				295
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	296	if (rq->end_io)
				297	rq->end_io(rq, error);
				298	else
				299	blk_mq_free_request(rq);
Christoph Hellwig	7237c74	2014-02-20 15:32:38 -0800	[diff] [blame]	300	return false;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	301	}
Christoph Hellwig	7237c74	2014-02-20 15:32:38 -0800	[diff] [blame]	302	EXPORT_SYMBOL(blk_mq_end_io_partial);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	303
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	304	static void __blk_mq_complete_request_remote(void *data)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	305	{
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	306	struct request *rq = data;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	307
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	308	rq->q->softirq_done_fn(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	309	}
				310
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	311	void __blk_mq_complete_request(struct request *rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	312	{
				313	struct blk_mq_ctx *ctx = rq->mq_ctx;
				314	int cpu;
				315
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	316	if (!ctx->ipi_redirect) {
				317	rq->q->softirq_done_fn(rq);
				318	return;
				319	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	320
				321	cpu = get_cpu();
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	322	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	323	rq->csd.func = __blk_mq_complete_request_remote;
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	324	rq->csd.info = rq;
				325	rq->csd.flags = 0;
Frederic Weisbecker	c46fff2	2014-02-24 16:40:02 +0100	[diff] [blame]	326	smp_call_function_single_async(ctx->cpu, &rq->csd);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	327	} else {
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	328	rq->q->softirq_done_fn(rq);
Christoph Hellwig	3d6efbf	2014-01-08 09:33:37 -0800	[diff] [blame]	329	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	330	put_cpu();
				331	}
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	332
				333	/**
				334	* blk_mq_complete_request - end I/O on a request
				335	* @rq: the request being processed
				336	*
				337	* Description:
				338	* Ends all I/O on a request. It does not handle partial completions.
				339	* The actual completion happens out-of-order, through a IPI handler.
				340	**/
				341	void blk_mq_complete_request(struct request *rq)
				342	{
				343	if (unlikely(blk_should_fake_timeout(rq->q)))
				344	return;
				345	if (!blk_mark_rq_complete(rq))
				346	__blk_mq_complete_request(rq);
				347	}
				348	EXPORT_SYMBOL(blk_mq_complete_request);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	349
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	350	static void blk_mq_start_request(struct request *rq, bool last)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	351	{
				352	struct request_queue *q = rq->q;
				353
				354	trace_block_rq_issue(q, rq);
				355
Christoph Hellwig	742ee69	2014-04-14 10:30:06 +0200	[diff] [blame^]	356	rq->resid_len = blk_rq_bytes(rq);
				357
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	358	/*
				359	* Just mark start time and set the started bit. Due to memory
				360	* ordering, we know we'll see the correct deadline as long as
				361	* REQ_ATOMIC_STARTED is seen.
				362	*/
				363	rq->deadline = jiffies + q->rq_timeout;
				364	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	365
				366	if (q->dma_drain_size && blk_rq_bytes(rq)) {
				367	/*
				368	* Make sure space for the drain appears. We know we can do
				369	* this because max_hw_segments has been adjusted to be one
				370	* fewer than the device can handle.
				371	*/
				372	rq->nr_phys_segments++;
				373	}
				374
				375	/*
				376	* Flag the last request in the series so that drivers know when IO
				377	* should be kicked off, if they don't do it on a per-request basis.
				378	*
				379	* Note: the flag isn't the only condition drivers should do kick off.
				380	* If drive is busy, the last request might not have the bit set.
				381	*/
				382	if (last)
				383	rq->cmd_flags |= REQ_END;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	384	}
				385
				386	static void blk_mq_requeue_request(struct request *rq)
				387	{
				388	struct request_queue *q = rq->q;
				389
				390	trace_block_rq_requeue(q, rq);
				391	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	392
				393	rq->cmd_flags &= ~REQ_END;
				394
				395	if (q->dma_drain_size && blk_rq_bytes(rq))
				396	rq->nr_phys_segments--;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	397	}
				398
				399	struct blk_mq_timeout_data {
				400	struct blk_mq_hw_ctx *hctx;
				401	unsigned long *next;
				402	unsigned int *next_set;
				403	};
				404
				405	static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
				406	{
				407	struct blk_mq_timeout_data *data = __data;
				408	struct blk_mq_hw_ctx *hctx = data->hctx;
				409	unsigned int tag;
				410
				411	/* It may not be in flight yet (this is where
				412	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				413	* statically allocated, so we know it's always safe to access the
				414	* memory associated with a bit offset into ->rqs[].
				415	*/
				416	tag = 0;
				417	do {
				418	struct request *rq;
				419
				420	tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
				421	if (tag >= hctx->queue_depth)
				422	break;
				423
				424	rq = hctx->rqs[tag++];
				425
				426	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				427	continue;
				428
				429	blk_rq_check_expired(rq, data->next, data->next_set);
				430	} while (1);
				431	}
				432
				433	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				434	unsigned long *next,
				435	unsigned int *next_set)
				436	{
				437	struct blk_mq_timeout_data data = {
				438	.hctx = hctx,
				439	.next = next,
				440	.next_set = next_set,
				441	};
				442
				443	/*
				444	* Ask the tagging code to iterate busy requests, so we can
				445	* check them for timeout.
				446	*/
				447	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				448	}
				449
				450	static void blk_mq_rq_timer(unsigned long data)
				451	{
				452	struct request_queue *q = (struct request_queue *) data;
				453	struct blk_mq_hw_ctx *hctx;
				454	unsigned long next = 0;
				455	int i, next_set = 0;
				456
				457	queue_for_each_hw_ctx(q, hctx, i)
				458	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				459
				460	if (next_set)
				461	mod_timer(&q->timeout, round_jiffies_up(next));
				462	}
				463
				464	/*
				465	* Reverse check our software queue for entries that we could potentially
				466	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				467	* too much time checking for merges.
				468	*/
				469	static bool blk_mq_attempt_merge(struct request_queue *q,
				470	struct blk_mq_ctx *ctx, struct bio *bio)
				471	{
				472	struct request *rq;
				473	int checked = 8;
				474
				475	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				476	int el_ret;
				477
				478	if (!checked--)
				479	break;
				480
				481	if (!blk_rq_merge_ok(rq, bio))
				482	continue;
				483
				484	el_ret = blk_try_merge(rq, bio);
				485	if (el_ret == ELEVATOR_BACK_MERGE) {
				486	if (bio_attempt_back_merge(q, rq, bio)) {
				487	ctx->rq_merged++;
				488	return true;
				489	}
				490	break;
				491	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				492	if (bio_attempt_front_merge(q, rq, bio)) {
				493	ctx->rq_merged++;
				494	return true;
				495	}
				496	break;
				497	}
				498	}
				499
				500	return false;
				501	}
				502
				503	void blk_mq_add_timer(struct request *rq)
				504	{
				505	__blk_add_timer(rq, NULL);
				506	}
				507
				508	/*
				509	* Run this hardware queue, pulling any software queues mapped to it in.
				510	* Note that this function currently has various problems around ordering
				511	* of IO. In particular, we'd like FIFO behaviour on handling existing
				512	* items on the hctx->dispatch list. Ignore that for now.
				513	*/
				514	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				515	{
				516	struct request_queue *q = hctx->queue;
				517	struct blk_mq_ctx *ctx;
				518	struct request *rq;
				519	LIST_HEAD(rq_list);
				520	int bit, queued;
				521
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	522	WARN_ON(!preempt_count());
				523
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	524	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	525	return;
				526
				527	hctx->run++;
				528
				529	/*
				530	* Touch any software queue that has pending entries.
				531	*/
				532	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				533	clear_bit(bit, hctx->ctx_map);
				534	ctx = hctx->ctxs[bit];
				535	BUG_ON(bit != ctx->index_hw);
				536
				537	spin_lock(&ctx->lock);
				538	list_splice_tail_init(&ctx->rq_list, &rq_list);
				539	spin_unlock(&ctx->lock);
				540	}
				541
				542	/*
				543	* If we have previous entries on our dispatch list, grab them
				544	* and stuff them at the front for more fair dispatch.
				545	*/
				546	if (!list_empty_careful(&hctx->dispatch)) {
				547	spin_lock(&hctx->lock);
				548	if (!list_empty(&hctx->dispatch))
				549	list_splice_init(&hctx->dispatch, &rq_list);
				550	spin_unlock(&hctx->lock);
				551	}
				552
				553	/*
				554	* Delete and return all entries from our dispatch list
				555	*/
				556	queued = 0;
				557
				558	/*
				559	* Now process all the entries, sending them to the driver.
				560	*/
				561	while (!list_empty(&rq_list)) {
				562	int ret;
				563
				564	rq = list_first_entry(&rq_list, struct request, queuelist);
				565	list_del_init(&rq->queuelist);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	566
Christoph Hellwig	49f5baa	2014-02-11 08:27:14 -0800	[diff] [blame]	567	blk_mq_start_request(rq, list_empty(&rq_list));
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	568
				569	ret = q->mq_ops->queue_rq(hctx, rq);
				570	switch (ret) {
				571	case BLK_MQ_RQ_QUEUE_OK:
				572	queued++;
				573	continue;
				574	case BLK_MQ_RQ_QUEUE_BUSY:
				575	/*
				576	* FIXME: we should have a mechanism to stop the queue
				577	* like blk_stop_queue, otherwise we will waste cpu
				578	* time
				579	*/
				580	list_add(&rq->queuelist, &rq_list);
				581	blk_mq_requeue_request(rq);
				582	break;
				583	default:
				584	pr_err("blk-mq: bad return on queue: %d\n", ret);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	585	case BLK_MQ_RQ_QUEUE_ERROR:
Christoph Hellwig	1e93b8c	2014-02-11 08:27:13 -0800	[diff] [blame]	586	rq->errors = -EIO;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	587	blk_mq_end_io(rq, rq->errors);
				588	break;
				589	}
				590
				591	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				592	break;
				593	}
				594
				595	if (!queued)
				596	hctx->dispatched[0]++;
				597	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				598	hctx->dispatched[ilog2(queued) + 1]++;
				599
				600	/*
				601	* Any items that need requeuing? Stuff them into hctx->dispatch,
				602	* that is where we will continue on next queue run.
				603	*/
				604	if (!list_empty(&rq_list)) {
				605	spin_lock(&hctx->lock);
				606	list_splice(&rq_list, &hctx->dispatch);
				607	spin_unlock(&hctx->lock);
				608	}
				609	}
				610
				611	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				612	{
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	613	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	614	return;
				615
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	616	if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	617	__blk_mq_run_hw_queue(hctx);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	618	else if (hctx->queue->nr_hw_queues == 1)
Jens Axboe	59c3d45	2014-04-08 09:15:35 -0600	[diff] [blame]	619	kblockd_schedule_delayed_work(&hctx->delayed_work, 0);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	620	else {
				621	unsigned int cpu;
				622
				623	/*
				624	* It'd be great if the workqueue API had a way to pass
				625	* in a mask and had some smarts for more clever placement
				626	* than the first CPU. Or we could round-robin here. For now,
				627	* just queue on the first CPU.
				628	*/
				629	cpu = cpumask_first(hctx->cpumask);
				630	kblockd_schedule_delayed_work_on(cpu, &hctx->delayed_work, 0);
				631	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	632	}
				633
				634	void blk_mq_run_queues(struct request_queue *q, bool async)
				635	{
				636	struct blk_mq_hw_ctx *hctx;
				637	int i;
				638
				639	queue_for_each_hw_ctx(q, hctx, i) {
				640	if ((!blk_mq_hctx_has_pending(hctx) &&
				641	list_empty_careful(&hctx->dispatch)) ||
Jens Axboe	5d12f90	2014-03-19 15:25:02 -0600	[diff] [blame]	642	test_bit(BLK_MQ_S_STOPPED, &hctx->state))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	643	continue;
				644
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	645	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	646	blk_mq_run_hw_queue(hctx, async);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	647	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	648	}
				649	}
				650	EXPORT_SYMBOL(blk_mq_run_queues);
				651
				652	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				653	{
				654	cancel_delayed_work(&hctx->delayed_work);
				655	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				656	}
				657	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				658
Christoph Hellwig	280d45f	2013-10-25 14:45:58 +0100	[diff] [blame]	659	void blk_mq_stop_hw_queues(struct request_queue *q)
				660	{
				661	struct blk_mq_hw_ctx *hctx;
				662	int i;
				663
				664	queue_for_each_hw_ctx(q, hctx, i)
				665	blk_mq_stop_hw_queue(hctx);
				666	}
				667	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				668
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	669	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				670	{
				671	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	672
				673	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	674	__blk_mq_run_hw_queue(hctx);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	675	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	676	}
				677	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				678
				679	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				680	{
				681	struct blk_mq_hw_ctx *hctx;
				682	int i;
				683
				684	queue_for_each_hw_ctx(q, hctx, i) {
				685	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				686	continue;
				687
				688	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	689	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	690	blk_mq_run_hw_queue(hctx, true);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	691	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	692	}
				693	}
				694	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				695
				696	static void blk_mq_work_fn(struct work_struct *work)
				697	{
				698	struct blk_mq_hw_ctx *hctx;
				699
				700	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	701
				702	preempt_disable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	703	__blk_mq_run_hw_queue(hctx);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	704	preempt_enable();
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	705	}
				706
				707	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	708	struct request *rq, bool at_head)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	709	{
				710	struct blk_mq_ctx *ctx = rq->mq_ctx;
				711
Jens Axboe	01b983c	2013-11-19 18:59:10 -0700	[diff] [blame]	712	trace_block_rq_insert(hctx->queue, rq);
				713
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	714	if (at_head)
				715	list_add(&rq->queuelist, &ctx->rq_list);
				716	else
				717	list_add_tail(&rq->queuelist, &ctx->rq_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	718	blk_mq_hctx_mark_pending(hctx, ctx);
				719
				720	/*
				721	* We do this early, to ensure we are on the right CPU.
				722	*/
				723	blk_mq_add_timer(rq);
				724	}
				725
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	726	void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
				727	bool async)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	728	{
				729	struct request_queue *q = rq->q;
				730	struct blk_mq_hw_ctx *hctx;
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	731	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	732
				733	current_ctx = blk_mq_get_ctx(q);
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	734	if (!cpu_online(ctx->cpu))
				735	rq->mq_ctx = ctx = current_ctx;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	736
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	737	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				738
Christoph Hellwig	eeabc85	2014-03-21 08:57:37 -0600	[diff] [blame]	739	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
				740	!(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
				741	blk_insert_flush(rq);
				742	} else {
				743	spin_lock(&ctx->lock);
				744	__blk_mq_insert_request(hctx, rq, at_head);
				745	spin_unlock(&ctx->lock);
				746	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	747
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	748	if (run_queue)
				749	blk_mq_run_hw_queue(hctx, async);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	750
				751	blk_mq_put_ctx(current_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	752	}
				753
				754	static void blk_mq_insert_requests(struct request_queue *q,
				755	struct blk_mq_ctx *ctx,
				756	struct list_head *list,
				757	int depth,
				758	bool from_schedule)
				759
				760	{
				761	struct blk_mq_hw_ctx *hctx;
				762	struct blk_mq_ctx *current_ctx;
				763
				764	trace_block_unplug(q, depth, !from_schedule);
				765
				766	current_ctx = blk_mq_get_ctx(q);
				767
				768	if (!cpu_online(ctx->cpu))
				769	ctx = current_ctx;
				770	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				771
				772	/*
				773	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				774	* offline now
				775	*/
				776	spin_lock(&ctx->lock);
				777	while (!list_empty(list)) {
				778	struct request *rq;
				779
				780	rq = list_first_entry(list, struct request, queuelist);
				781	list_del_init(&rq->queuelist);
				782	rq->mq_ctx = ctx;
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	783	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	784	}
				785	spin_unlock(&ctx->lock);
				786
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	787	blk_mq_run_hw_queue(hctx, from_schedule);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	788	blk_mq_put_ctx(current_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	789	}
				790
				791	static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
				792	{
				793	struct request *rqa = container_of(a, struct request, queuelist);
				794	struct request *rqb = container_of(b, struct request, queuelist);
				795
				796	return !(rqa->mq_ctx < rqb->mq_ctx ||
				797	(rqa->mq_ctx == rqb->mq_ctx &&
				798	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				799	}
				800
				801	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				802	{
				803	struct blk_mq_ctx *this_ctx;
				804	struct request_queue *this_q;
				805	struct request *rq;
				806	LIST_HEAD(list);
				807	LIST_HEAD(ctx_list);
				808	unsigned int depth;
				809
				810	list_splice_init(&plug->mq_list, &list);
				811
				812	list_sort(NULL, &list, plug_ctx_cmp);
				813
				814	this_q = NULL;
				815	this_ctx = NULL;
				816	depth = 0;
				817
				818	while (!list_empty(&list)) {
				819	rq = list_entry_rq(list.next);
				820	list_del_init(&rq->queuelist);
				821	BUG_ON(!rq->q);
				822	if (rq->mq_ctx != this_ctx) {
				823	if (this_ctx) {
				824	blk_mq_insert_requests(this_q, this_ctx,
				825	&ctx_list, depth,
				826	from_schedule);
				827	}
				828
				829	this_ctx = rq->mq_ctx;
				830	this_q = rq->q;
				831	depth = 0;
				832	}
				833
				834	depth++;
				835	list_add_tail(&rq->queuelist, &ctx_list);
				836	}
				837
				838	/*
				839	* If 'this_ctx' is set, we know we have entries to complete
				840	* on 'ctx_list'. Do those.
				841	*/
				842	if (this_ctx) {
				843	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				844	from_schedule);
				845	}
				846	}
				847
				848	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				849	{
				850	init_request_from_bio(rq, bio);
				851	blk_account_io_start(rq, 1);
				852	}
				853
				854	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				855	{
				856	struct blk_mq_hw_ctx *hctx;
				857	struct blk_mq_ctx *ctx;
				858	const int is_sync = rw_is_sync(bio->bi_rw);
				859	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				860	int rw = bio_data_dir(bio);
				861	struct request *rq;
				862	unsigned int use_plug, request_count = 0;
				863
				864	/*
				865	* If we have multiple hardware queues, just go directly to
				866	* one of those for sync IO.
				867	*/
				868	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				869
				870	blk_queue_bounce(q, &bio);
				871
Nicholas Bellinger	14ec77f	2014-02-07 13:45:39 -0700	[diff] [blame]	872	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
				873	bio_endio(bio, -EIO);
				874	return;
				875	}
				876
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	877	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				878	return;
				879
				880	if (blk_mq_queue_enter(q)) {
				881	bio_endio(bio, -EIO);
				882	return;
				883	}
				884
				885	ctx = blk_mq_get_ctx(q);
				886	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				887
Shaohua Li	27fbf4e8	2014-02-19 20:20:21 +0800	[diff] [blame]	888	if (is_sync)
				889	rw \|= REQ_SYNC;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	890	trace_block_getrq(q, bio, rw);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	891	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	892	if (likely(rq))
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	893	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	894	else {
				895	blk_mq_put_ctx(ctx);
				896	trace_block_sleeprq(q, bio, rw);
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	897	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				898	false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	899	ctx = rq->mq_ctx;
				900	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				901	}
				902
				903	hctx->queued++;
				904
				905	if (unlikely(is_flush_fua)) {
				906	blk_mq_bio_to_request(rq, bio);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	907	blk_insert_flush(rq);
				908	goto run_queue;
				909	}
				910
				911	/*
				912	* A task plug currently exists. Since this is completely lockless,
				913	* utilize that to temporarily store requests until the task is
				914	* either done or scheduled away.
				915	*/
				916	if (use_plug) {
				917	struct blk_plug *plug = current->plug;
				918
				919	if (plug) {
				920	blk_mq_bio_to_request(rq, bio);
Shaohua Li	92f399c	2013-10-29 12:01:03 -0600	[diff] [blame]	921	if (list_empty(&plug->mq_list))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	922	trace_block_plug(q);
				923	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				924	blk_flush_plug_list(plug, false);
				925	trace_block_plug(q);
				926	}
				927	list_add_tail(&rq->queuelist, &plug->mq_list);
				928	blk_mq_put_ctx(ctx);
				929	return;
				930	}
				931	}
				932
				933	spin_lock(&ctx->lock);
				934
				935	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				936	blk_mq_attempt_merge(q, ctx, bio))
				937	__blk_mq_free_request(hctx, ctx, rq);
				938	else {
				939	blk_mq_bio_to_request(rq, bio);
Christoph Hellwig	72a0a36	2014-02-07 10:22:36 -0800	[diff] [blame]	940	__blk_mq_insert_request(hctx, rq, false);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	941	}
				942
				943	spin_unlock(&ctx->lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	944
				945	/*
				946	* For a SYNC request, send it to the hardware immediately. For an
				947	* ASYNC request, just ensure that we run it later on. The latter
				948	* allows for merging opportunities and more efficient dispatching.
				949	*/
				950	run_queue:
				951	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	952	blk_mq_put_ctx(ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	953	}
				954
				955	/*
				956	* Default mapping to a software queue, since we use one per CPU.
				957	*/
				958	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				959	{
				960	return q->queue_hw_ctx[q->mq_map[cpu]];
				961	}
				962	EXPORT_SYMBOL(blk_mq_map_queue);
				963
				964	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_reg reg,
				965	unsigned int hctx_index)
				966	{
				967	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				968	GFP_KERNEL \| __GFP_ZERO, reg->numa_node);
				969	}
				970	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				971
				972	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				973	unsigned int hctx_index)
				974	{
				975	kfree(hctx);
				976	}
				977	EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				978
				979	static void blk_mq_hctx_notify(void *data, unsigned long action,
				980	unsigned int cpu)
				981	{
				982	struct blk_mq_hw_ctx *hctx = data;
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	983	struct request_queue *q = hctx->queue;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	984	struct blk_mq_ctx *ctx;
				985	LIST_HEAD(tmp);
				986
				987	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				988	return;
				989
				990	/*
				991	* Move ctx entries to new CPU, if this one is going away.
				992	*/
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	993	ctx = __blk_mq_get_ctx(q, cpu);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	994
				995	spin_lock(&ctx->lock);
				996	if (!list_empty(&ctx->rq_list)) {
				997	list_splice_init(&ctx->rq_list, &tmp);
				998	clear_bit(ctx->index_hw, hctx->ctx_map);
				999	}
				1000	spin_unlock(&ctx->lock);
				1001
				1002	if (list_empty(&tmp))
				1003	return;
				1004
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1005	ctx = blk_mq_get_ctx(q);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1006	spin_lock(&ctx->lock);
				1007
				1008	while (!list_empty(&tmp)) {
				1009	struct request *rq;
				1010
				1011	rq = list_first_entry(&tmp, struct request, queuelist);
				1012	rq->mq_ctx = ctx;
				1013	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1014	}
				1015
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1016	hctx = q->mq_ops->map_queue(q, ctx->cpu);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1017	blk_mq_hctx_mark_pending(hctx, ctx);
				1018
				1019	spin_unlock(&ctx->lock);
Jens Axboe	bccb5f7	2014-04-04 21:34:48 -0600	[diff] [blame]	1020
				1021	blk_mq_run_hw_queue(hctx, true);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1022	blk_mq_put_ctx(ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1023	}
				1024
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1025	static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				1026	int (init)(void , struct blk_mq_hw_ctx *,
				1027	struct request *, unsigned int),
				1028	void *data)
				1029	{
				1030	unsigned int i;
				1031	int ret = 0;
				1032
				1033	for (i = 0; i < hctx->queue_depth; i++) {
				1034	struct request *rq = hctx->rqs[i];
				1035
				1036	ret = init(data, hctx, rq, i);
				1037	if (ret)
				1038	break;
				1039	}
				1040
				1041	return ret;
				1042	}
				1043
				1044	int blk_mq_init_commands(struct request_queue *q,
				1045	int (init)(void , struct blk_mq_hw_ctx *,
				1046	struct request *, unsigned int),
				1047	void *data)
				1048	{
				1049	struct blk_mq_hw_ctx *hctx;
				1050	unsigned int i;
				1051	int ret = 0;
				1052
				1053	queue_for_each_hw_ctx(q, hctx, i) {
				1054	ret = blk_mq_init_hw_commands(hctx, init, data);
				1055	if (ret)
				1056	break;
				1057	}
				1058
				1059	return ret;
				1060	}
				1061	EXPORT_SYMBOL(blk_mq_init_commands);
				1062
				1063	static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
				1064	void (free)(void , struct blk_mq_hw_ctx *,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1065	struct request *, unsigned int),
				1066	void *data)
				1067	{
				1068	unsigned int i;
				1069
				1070	for (i = 0; i < hctx->queue_depth; i++) {
				1071	struct request *rq = hctx->rqs[i];
				1072
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1073	free(data, hctx, rq, i);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1074	}
				1075	}
				1076
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1077	void blk_mq_free_commands(struct request_queue *q,
				1078	void (free)(void , struct blk_mq_hw_ctx *,
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1079	struct request *, unsigned int),
				1080	void *data)
				1081	{
				1082	struct blk_mq_hw_ctx *hctx;
				1083	unsigned int i;
				1084
				1085	queue_for_each_hw_ctx(q, hctx, i)
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1086	blk_mq_free_hw_commands(hctx, free, data);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1087	}
Jens Axboe	95363ef	2014-03-14 10:43:15 -0600	[diff] [blame]	1088	EXPORT_SYMBOL(blk_mq_free_commands);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1089
				1090	static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
				1091	{
				1092	struct page *page;
				1093
				1094	while (!list_empty(&hctx->page_list)) {
Dave Hansen	6753471	2014-01-08 20:17:46 -0700	[diff] [blame]	1095	page = list_first_entry(&hctx->page_list, struct page, lru);
				1096	list_del_init(&page->lru);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1097	__free_pages(page, page->private);
				1098	}
				1099
				1100	kfree(hctx->rqs);
				1101
				1102	if (hctx->tags)
				1103	blk_mq_free_tags(hctx->tags);
				1104	}
				1105
				1106	static size_t order_to_size(unsigned int order)
				1107	{
				1108	size_t ret = PAGE_SIZE;
				1109
				1110	while (order--)
				1111	ret *= 2;
				1112
				1113	return ret;
				1114	}
				1115
				1116	static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
				1117	unsigned int reserved_tags, int node)
				1118	{
				1119	unsigned int i, j, entries_per_page, max_order = 4;
				1120	size_t rq_size, left;
				1121
				1122	INIT_LIST_HEAD(&hctx->page_list);
				1123
				1124	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				1125	GFP_KERNEL, node);
				1126	if (!hctx->rqs)
				1127	return -ENOMEM;
				1128
				1129	/*
				1130	* rq_size is the size of the request plus driver payload, rounded
				1131	* to the cacheline size
				1132	*/
				1133	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				1134	cache_line_size());
				1135	left = rq_size * hctx->queue_depth;
				1136
				1137	for (i = 0; i < hctx->queue_depth;) {
				1138	int this_order = max_order;
				1139	struct page *page;
				1140	int to_do;
				1141	void *p;
				1142
				1143	while (left < order_to_size(this_order - 1) && this_order)
				1144	this_order--;
				1145
				1146	do {
				1147	page = alloc_pages_node(node, GFP_KERNEL, this_order);
				1148	if (page)
				1149	break;
				1150	if (!this_order--)
				1151	break;
				1152	if (order_to_size(this_order) < rq_size)
				1153	break;
				1154	} while (1);
				1155
				1156	if (!page)
				1157	break;
				1158
				1159	page->private = this_order;
Dave Hansen	6753471	2014-01-08 20:17:46 -0700	[diff] [blame]	1160	list_add_tail(&page->lru, &hctx->page_list);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1161
				1162	p = page_address(page);
				1163	entries_per_page = order_to_size(this_order) / rq_size;
				1164	to_do = min(entries_per_page, hctx->queue_depth - i);
				1165	left -= to_do * rq_size;
				1166	for (j = 0; j < to_do; j++) {
				1167	hctx->rqs[i] = p;
				1168	blk_mq_rq_init(hctx, hctx->rqs[i]);
				1169	p += rq_size;
				1170	i++;
				1171	}
				1172	}
				1173
				1174	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
				1175	goto err_rq_map;
				1176	else if (i != hctx->queue_depth) {
				1177	hctx->queue_depth = i;
				1178	pr_warn("%s: queue depth set to %u because of low memory\n",
				1179	__func__, i);
				1180	}
				1181
				1182	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
				1183	if (!hctx->tags) {
				1184	err_rq_map:
				1185	blk_mq_free_rq_map(hctx);
				1186	return -ENOMEM;
				1187	}
				1188
				1189	return 0;
				1190	}
				1191
				1192	static int blk_mq_init_hw_queues(struct request_queue *q,
				1193	struct blk_mq_reg reg, void driver_data)
				1194	{
				1195	struct blk_mq_hw_ctx *hctx;
				1196	unsigned int i, j;
				1197
				1198	/*
				1199	* Initialize hardware queues
				1200	*/
				1201	queue_for_each_hw_ctx(q, hctx, i) {
				1202	unsigned int num_maps;
				1203	int node;
				1204
				1205	node = hctx->numa_node;
				1206	if (node == NUMA_NO_NODE)
				1207	node = hctx->numa_node = reg->numa_node;
				1208
				1209	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1210	spin_lock_init(&hctx->lock);
				1211	INIT_LIST_HEAD(&hctx->dispatch);
				1212	hctx->queue = q;
				1213	hctx->queue_num = i;
				1214	hctx->flags = reg->flags;
				1215	hctx->queue_depth = reg->queue_depth;
				1216	hctx->cmd_size = reg->cmd_size;
				1217
				1218	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1219	blk_mq_hctx_notify, hctx);
				1220	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1221
				1222	if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
				1223	break;
				1224
				1225	/*
				1226	* Allocate space for all possible cpus to avoid allocation in
				1227	* runtime
				1228	*/
				1229	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1230	GFP_KERNEL, node);
				1231	if (!hctx->ctxs)
				1232	break;
				1233
				1234	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1235	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1236	GFP_KERNEL, node);
				1237	if (!hctx->ctx_map)
				1238	break;
				1239
				1240	hctx->nr_ctx_map = num_maps;
				1241	hctx->nr_ctx = 0;
				1242
				1243	if (reg->ops->init_hctx &&
				1244	reg->ops->init_hctx(hctx, driver_data, i))
				1245	break;
				1246	}
				1247
				1248	if (i == q->nr_hw_queues)
				1249	return 0;
				1250
				1251	/*
				1252	* Init failed
				1253	*/
				1254	queue_for_each_hw_ctx(q, hctx, j) {
				1255	if (i == j)
				1256	break;
				1257
				1258	if (reg->ops->exit_hctx)
				1259	reg->ops->exit_hctx(hctx, j);
				1260
				1261	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1262	blk_mq_free_rq_map(hctx);
				1263	kfree(hctx->ctxs);
				1264	}
				1265
				1266	return 1;
				1267	}
				1268
				1269	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1270	unsigned int nr_hw_queues)
				1271	{
				1272	unsigned int i;
				1273
				1274	for_each_possible_cpu(i) {
				1275	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1276	struct blk_mq_hw_ctx *hctx;
				1277
				1278	memset(__ctx, 0, sizeof(*__ctx));
				1279	__ctx->cpu = i;
				1280	spin_lock_init(&__ctx->lock);
				1281	INIT_LIST_HEAD(&__ctx->rq_list);
				1282	__ctx->queue = q;
				1283
				1284	/* If the cpu isn't online, the cpu is mapped to first hctx */
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1285	if (!cpu_online(i))
				1286	continue;
				1287
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1288	hctx = q->mq_ops->map_queue(q, i);
				1289	cpumask_set_cpu(i, hctx->cpumask);
				1290	hctx->nr_ctx++;
				1291
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1292	/*
				1293	* Set local node, IFF we have more than one hw queue. If
				1294	* not, we remain on the home node of the device
				1295	*/
				1296	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1297	hctx->numa_node = cpu_to_node(i);
				1298	}
				1299	}
				1300
				1301	static void blk_mq_map_swqueue(struct request_queue *q)
				1302	{
				1303	unsigned int i;
				1304	struct blk_mq_hw_ctx *hctx;
				1305	struct blk_mq_ctx *ctx;
				1306
				1307	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1308	cpumask_clear(hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1309	hctx->nr_ctx = 0;
				1310	}
				1311
				1312	/*
				1313	* Map software to hardware queues
				1314	*/
				1315	queue_for_each_ctx(q, ctx, i) {
				1316	/* If the cpu isn't online, the cpu is mapped to first hctx */
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1317	if (!cpu_online(i))
				1318	continue;
				1319
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1320	hctx = q->mq_ops->map_queue(q, i);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1321	cpumask_set_cpu(i, hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1322	ctx->index_hw = hctx->nr_ctx;
				1323	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1324	}
				1325	}
				1326
				1327	struct request_queue blk_mq_init_queue(struct blk_mq_reg reg,
				1328	void *driver_data)
				1329	{
				1330	struct blk_mq_hw_ctx **hctxs;
				1331	struct blk_mq_ctx *ctx;
				1332	struct request_queue *q;
				1333	int i;
				1334
				1335	if (!reg->nr_hw_queues \|\|
				1336	!reg->ops->queue_rq \|\| !reg->ops->map_queue \|\|
				1337	!reg->ops->alloc_hctx \|\| !reg->ops->free_hctx)
				1338	return ERR_PTR(-EINVAL);
				1339
				1340	if (!reg->queue_depth)
				1341	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1342	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
				1343	pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
				1344	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1345	}
				1346
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1347	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
				1348	return ERR_PTR(-EINVAL);
				1349
				1350	ctx = alloc_percpu(struct blk_mq_ctx);
				1351	if (!ctx)
				1352	return ERR_PTR(-ENOMEM);
				1353
				1354	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1355	reg->numa_node);
				1356
				1357	if (!hctxs)
				1358	goto err_percpu;
				1359
				1360	for (i = 0; i < reg->nr_hw_queues; i++) {
				1361	hctxs[i] = reg->ops->alloc_hctx(reg, i);
				1362	if (!hctxs[i])
				1363	goto err_hctxs;
				1364
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1365	if (!zalloc_cpumask_var(&hctxs[i]->cpumask, GFP_KERNEL))
				1366	goto err_hctxs;
				1367
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1368	hctxs[i]->numa_node = NUMA_NO_NODE;
				1369	hctxs[i]->queue_num = i;
				1370	}
				1371
				1372	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
				1373	if (!q)
				1374	goto err_hctxs;
				1375
				1376	q->mq_map = blk_mq_make_queue_map(reg);
				1377	if (!q->mq_map)
				1378	goto err_map;
				1379
				1380	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1381	blk_queue_rq_timeout(q, 30000);
				1382
				1383	q->nr_queues = nr_cpu_ids;
				1384	q->nr_hw_queues = reg->nr_hw_queues;
				1385
				1386	q->queue_ctx = ctx;
				1387	q->queue_hw_ctx = hctxs;
				1388
				1389	q->mq_ops = reg->ops;
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	1390	q->queue_flags \|= QUEUE_FLAG_MQ_DEFAULT;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1391
Christoph Hellwig	1be036e	2014-02-07 10:22:39 -0800	[diff] [blame]	1392	q->sg_reserved_size = INT_MAX;
				1393
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1394	blk_queue_make_request(q, blk_mq_make_request);
				1395	blk_queue_rq_timed_out(q, reg->ops->timeout);
				1396	if (reg->timeout)
				1397	blk_queue_rq_timeout(q, reg->timeout);
				1398
Christoph Hellwig	30a91cb	2014-02-10 03:24:38 -0800	[diff] [blame]	1399	if (reg->ops->complete)
				1400	blk_queue_softirq_done(q, reg->ops->complete);
				1401
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1402	blk_mq_init_flush(q);
				1403	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
				1404
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1405	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
				1406	cache_line_size()), GFP_KERNEL);
				1407	if (!q->flush_rq)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1408	goto err_hw;
				1409
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1410	if (blk_mq_init_hw_queues(q, reg, driver_data))
				1411	goto err_flush_rq;
				1412
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1413	blk_mq_map_swqueue(q);
				1414
				1415	mutex_lock(&all_q_mutex);
				1416	list_add_tail(&q->all_q_node, &all_q_list);
				1417	mutex_unlock(&all_q_mutex);
				1418
				1419	return q;
Christoph Hellwig	1874198	2014-02-10 09:29:00 -0700	[diff] [blame]	1420
				1421	err_flush_rq:
				1422	kfree(q->flush_rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1423	err_hw:
				1424	kfree(q->mq_map);
				1425	err_map:
				1426	blk_cleanup_queue(q);
				1427	err_hctxs:
				1428	for (i = 0; i < reg->nr_hw_queues; i++) {
				1429	if (!hctxs[i])
				1430	break;
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1431	free_cpumask_var(hctxs[i]->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1432	reg->ops->free_hctx(hctxs[i], i);
				1433	}
				1434	kfree(hctxs);
				1435	err_percpu:
				1436	free_percpu(ctx);
				1437	return ERR_PTR(-ENOMEM);
				1438	}
				1439	EXPORT_SYMBOL(blk_mq_init_queue);
				1440
				1441	void blk_mq_free_queue(struct request_queue *q)
				1442	{
				1443	struct blk_mq_hw_ctx *hctx;
				1444	int i;
				1445
				1446	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1447	kfree(hctx->ctx_map);
				1448	kfree(hctx->ctxs);
				1449	blk_mq_free_rq_map(hctx);
				1450	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1451	if (q->mq_ops->exit_hctx)
				1452	q->mq_ops->exit_hctx(hctx, i);
Jens Axboe	e4043dc	2014-04-09 10:18:23 -0600	[diff] [blame]	1453	free_cpumask_var(hctx->cpumask);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1454	q->mq_ops->free_hctx(hctx, i);
				1455	}
				1456
				1457	free_percpu(q->queue_ctx);
				1458	kfree(q->queue_hw_ctx);
				1459	kfree(q->mq_map);
				1460
				1461	q->queue_ctx = NULL;
				1462	q->queue_hw_ctx = NULL;
				1463	q->mq_map = NULL;
				1464
				1465	mutex_lock(&all_q_mutex);
				1466	list_del_init(&q->all_q_node);
				1467	mutex_unlock(&all_q_mutex);
				1468	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1469
				1470	/* Basically redo blk_mq_init_queue with queue frozen */
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1471	static void blk_mq_queue_reinit(struct request_queue *q)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1472	{
				1473	blk_mq_freeze_queue(q);
				1474
				1475	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1476
				1477	/*
				1478	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1479	* we should change hctx numa_node according to new topology (this
				1480	* involves free and re-allocate memory, worthy doing?)
				1481	*/
				1482
				1483	blk_mq_map_swqueue(q);
				1484
				1485	blk_mq_unfreeze_queue(q);
				1486	}
				1487
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1488	static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1489	unsigned long action, void *hcpu)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1490	{
				1491	struct request_queue *q;
				1492
				1493	/*
				1494	* Before new mapping is established, hotadded cpu might already start
				1495	* handling requests. This doesn't break anything as we map offline
				1496	* CPUs to first hardware queue. We will re-init queue below to get
				1497	* optimal settings.
				1498	*/
				1499	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1500	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1501	return NOTIFY_OK;
				1502
				1503	mutex_lock(&all_q_mutex);
				1504	list_for_each_entry(q, &all_q_list, all_q_node)
				1505	blk_mq_queue_reinit(q);
				1506	mutex_unlock(&all_q_mutex);
				1507	return NOTIFY_OK;
				1508	}
				1509
Jens Axboe	676141e	2014-03-20 13:29:18 -0600	[diff] [blame]	1510	void blk_mq_disable_hotplug(void)
				1511	{
				1512	mutex_lock(&all_q_mutex);
				1513	}
				1514
				1515	void blk_mq_enable_hotplug(void)
				1516	{
				1517	mutex_unlock(&all_q_mutex);
				1518	}
				1519
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1520	static int __init blk_mq_init(void)
				1521	{
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1522	blk_mq_cpu_init();
				1523
				1524	/* Must be called after percpu_counter_hotcpu_callback() */
				1525	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1526
				1527	return 0;
				1528	}
				1529	subsys_initcall(blk_mq_init);