Blame - block/blk-mq.c - SHIFTPHONES/kernel/shift/mainline

blob: 6914f9bd470ac2bf48de797f1ddd53ad27dd6604 [file] [log] [blame]

Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1	#include <linux/kernel.h>
				2	#include <linux/module.h>
				3	#include <linux/backing-dev.h>
				4	#include <linux/bio.h>
				5	#include <linux/blkdev.h>
				6	#include <linux/mm.h>
				7	#include <linux/init.h>
				8	#include <linux/slab.h>
				9	#include <linux/workqueue.h>
				10	#include <linux/smp.h>
				11	#include <linux/llist.h>
				12	#include <linux/list_sort.h>
				13	#include <linux/cpu.h>
				14	#include <linux/cache.h>
				15	#include <linux/sched/sysctl.h>
				16	#include <linux/delay.h>
				17
				18	#include <trace/events/block.h>
				19
				20	#include <linux/blk-mq.h>
				21	#include "blk.h"
				22	#include "blk-mq.h"
				23	#include "blk-mq-tag.h"
				24
				25	static DEFINE_MUTEX(all_q_mutex);
				26	static LIST_HEAD(all_q_list);
				27
				28	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
				29
				30	DEFINE_PER_CPU(struct llist_head, ipi_lists);
				31
				32	static struct blk_mq_ctx __blk_mq_get_ctx(struct request_queue q,
				33	unsigned int cpu)
				34	{
				35	return per_cpu_ptr(q->queue_ctx, cpu);
				36	}
				37
				38	/*
				39	* This assumes per-cpu software queueing queues. They could be per-node
				40	* as well, for instance. For now this is hardcoded as-is. Note that we don't
				41	* care about preemption, since we know the ctx's are persistent. This does
				42	* mean that we can't rely on ctx always matching the currently running CPU.
				43	*/
				44	static struct blk_mq_ctx blk_mq_get_ctx(struct request_queue q)
				45	{
				46	return __blk_mq_get_ctx(q, get_cpu());
				47	}
				48
				49	static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
				50	{
				51	put_cpu();
				52	}
				53
				54	/*
				55	* Check if any of the ctx's have pending work in this hardware queue
				56	*/
				57	static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
				58	{
				59	unsigned int i;
				60
				61	for (i = 0; i < hctx->nr_ctx_map; i++)
				62	if (hctx->ctx_map[i])
				63	return true;
				64
				65	return false;
				66	}
				67
				68	/*
				69	* Mark this ctx as having pending work in this hardware queue
				70	*/
				71	static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				72	struct blk_mq_ctx *ctx)
				73	{
				74	if (!test_bit(ctx->index_hw, hctx->ctx_map))
				75	set_bit(ctx->index_hw, hctx->ctx_map);
				76	}
				77
				78	static struct request blk_mq_alloc_rq(struct blk_mq_hw_ctx hctx, gfp_t gfp,
				79	bool reserved)
				80	{
				81	struct request *rq;
				82	unsigned int tag;
				83
				84	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
				85	if (tag != BLK_MQ_TAG_FAIL) {
				86	rq = hctx->rqs[tag];
				87	rq->tag = tag;
				88
				89	return rq;
				90	}
				91
				92	return NULL;
				93	}
				94
				95	static int blk_mq_queue_enter(struct request_queue *q)
				96	{
				97	int ret;
				98
				99	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
				100	smp_wmb();
				101	/* we have problems to freeze the queue if it's initializing */
				102	if (!blk_queue_bypass(q) \|\| !blk_queue_init_done(q))
				103	return 0;
				104
				105	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				106
				107	spin_lock_irq(q->queue_lock);
				108	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	109	!blk_queue_bypass(q) \|\| blk_queue_dying(q),
				110	*q->queue_lock);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	111	/* inc usage with lock hold to avoid freeze_queue runs here */
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	112	if (!ret && !blk_queue_dying(q))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	113	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	114	else if (blk_queue_dying(q))
				115	ret = -ENODEV;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	116	spin_unlock_irq(q->queue_lock);
				117
				118	return ret;
				119	}
				120
				121	static void blk_mq_queue_exit(struct request_queue *q)
				122	{
				123	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
				124	}
				125
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	126	static void __blk_mq_drain_queue(struct request_queue *q)
				127	{
				128	while (true) {
				129	s64 count;
				130
				131	spin_lock_irq(q->queue_lock);
				132	count = percpu_counter_sum(&q->mq_usage_counter);
				133	spin_unlock_irq(q->queue_lock);
				134
				135	if (count == 0)
				136	break;
				137	blk_mq_run_queues(q, false);
				138	msleep(10);
				139	}
				140	}
				141
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	142	/*
				143	* Guarantee no request is in use, so we can change any data structure of
				144	* the queue afterward.
				145	*/
				146	static void blk_mq_freeze_queue(struct request_queue *q)
				147	{
				148	bool drain;
				149
				150	spin_lock_irq(q->queue_lock);
				151	drain = !q->bypass_depth++;
				152	queue_flag_set(QUEUE_FLAG_BYPASS, q);
				153	spin_unlock_irq(q->queue_lock);
				154
Ming Lei	43a5e4e	2013-12-26 21:31:35 +0800	[diff] [blame]	155	if (drain)
				156	__blk_mq_drain_queue(q);
				157	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	158
/*
 * Non-static entry point: wait until the usage counter of @q reaches zero.
 */
void blk_mq_drain_queue(struct request_queue *q)
{
	__blk_mq_drain_queue(q);
}
				163
				164	static void blk_mq_unfreeze_queue(struct request_queue *q)
				165	{
				166	bool wake = false;
				167
				168	spin_lock_irq(q->queue_lock);
				169	if (!--q->bypass_depth) {
				170	queue_flag_clear(QUEUE_FLAG_BYPASS, q);
				171	wake = true;
				172	}
				173	WARN_ON_ONCE(q->bypass_depth < 0);
				174	spin_unlock_irq(q->queue_lock);
				175	if (wake)
				176	wake_up_all(&q->mq_freeze_wq);
				177	}
				178
				179	bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
				180	{
				181	return blk_mq_has_free_tags(hctx->tags);
				182	}
				183	EXPORT_SYMBOL(blk_mq_can_queue);
				184
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	185	static void blk_mq_rq_ctx_init(struct request_queue q, struct blk_mq_ctx ctx,
				186	struct request *rq, unsigned int rw_flags)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	187	{
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	188	if (blk_queue_io_stat(q))
				189	rw_flags \|= REQ_IO_STAT;
				190
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	191	rq->mq_ctx = ctx;
				192	rq->cmd_flags = rw_flags;
				193	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
				194	}
				195
				196	static struct request __blk_mq_alloc_request(struct blk_mq_hw_ctx hctx,
				197	gfp_t gfp, bool reserved)
				198	{
				199	return blk_mq_alloc_rq(hctx, gfp, reserved);
				200	}
				201
				202	static struct request blk_mq_alloc_request_pinned(struct request_queue q,
				203	int rw, gfp_t gfp,
				204	bool reserved)
				205	{
				206	struct request *rq;
				207
				208	do {
				209	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
				210	struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
				211
				212	rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
				213	if (rq) {
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	214	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	215	break;
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	216	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	217
				218	blk_mq_put_ctx(ctx);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	219	if (!(gfp & __GFP_WAIT))
				220	break;
				221
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	222	__blk_mq_run_hw_queue(hctx);
				223	blk_mq_wait_for_tags(hctx->tags);
				224	} while (1);
				225
				226	return rq;
				227	}
				228
Christoph Hellwig	3228f48	2013-10-28 13:33:58 -0600	[diff] [blame]	229	struct request blk_mq_alloc_request(struct request_queue q, int rw,
				230	gfp_t gfp, bool reserved)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	231	{
				232	struct request *rq;
				233
				234	if (blk_mq_queue_enter(q))
				235	return NULL;
				236
Christoph Hellwig	3228f48	2013-10-28 13:33:58 -0600	[diff] [blame]	237	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	238	if (rq)
				239	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	240	return rq;
				241	}
				242
				243	struct request blk_mq_alloc_reserved_request(struct request_queue q, int rw,
				244	gfp_t gfp)
				245	{
				246	struct request *rq;
				247
				248	if (blk_mq_queue_enter(q))
				249	return NULL;
				250
				251	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
Jeff Moyer	959a35f	2013-12-03 14:23:00 -0700	[diff] [blame]	252	if (rq)
				253	blk_mq_put_ctx(rq->mq_ctx);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	254	return rq;
				255	}
				256	EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
				257
				258	/*
				259	* Re-init and set pdu, if we have it
				260	*/
				261	static void blk_mq_rq_init(struct blk_mq_hw_ctx hctx, struct request rq)
				262	{
				263	blk_rq_init(hctx->queue, rq);
				264
				265	if (hctx->cmd_size)
				266	rq->special = blk_mq_rq_to_pdu(rq);
				267	}
				268
				269	static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				270	struct blk_mq_ctx ctx, struct request rq)
				271	{
				272	const int tag = rq->tag;
				273	struct request_queue *q = rq->q;
				274
				275	blk_mq_rq_init(hctx, rq);
				276	blk_mq_put_tag(hctx->tags, tag);
				277
				278	blk_mq_queue_exit(q);
				279	}
				280
				281	void blk_mq_free_request(struct request *rq)
				282	{
				283	struct blk_mq_ctx *ctx = rq->mq_ctx;
				284	struct blk_mq_hw_ctx *hctx;
				285	struct request_queue *q = rq->q;
				286
				287	ctx->rq_completed[rq_is_sync(rq)]++;
				288
				289	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				290	__blk_mq_free_request(hctx, ctx, rq);
				291	}
				292
				293	static void blk_mq_bio_endio(struct request rq, struct bio bio, int error)
				294	{
				295	if (error)
				296	clear_bit(BIO_UPTODATE, &bio->bi_flags);
				297	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
				298	error = -EIO;
				299
				300	if (unlikely(rq->cmd_flags & REQ_QUIET))
				301	set_bit(BIO_QUIET, &bio->bi_flags);
				302
				303	/* don't actually finish bio if it's part of flush sequence */
				304	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
				305	bio_endio(bio, error);
				306	}
				307
				308	void blk_mq_complete_request(struct request *rq, int error)
				309	{
				310	struct bio *bio = rq->bio;
				311	unsigned int bytes = 0;
				312
				313	trace_block_rq_complete(rq->q, rq);
				314
				315	while (bio) {
				316	struct bio *next = bio->bi_next;
				317
				318	bio->bi_next = NULL;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	319	bytes += bio->bi_iter.bi_size;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	320	blk_mq_bio_endio(rq, bio, error);
				321	bio = next;
				322	}
				323
				324	blk_account_io_completion(rq, bytes);
				325
Ming Lei	0d11e6a	2013-12-05 10:50:39 -0700	[diff] [blame]	326	blk_account_io_done(rq);
				327
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	328	if (rq->end_io)
				329	rq->end_io(rq, error);
				330	else
				331	blk_mq_free_request(rq);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	332	}
				333
				334	void __blk_mq_end_io(struct request *rq, int error)
				335	{
				336	if (!blk_mark_rq_complete(rq))
				337	blk_mq_complete_request(rq, error);
				338	}
				339
Christoph Hellwig	0a06ff0	2013-11-14 14:32:07 -0800	[diff] [blame]	340	#if defined(CONFIG_SMP)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	341
				342	/*
				343	* Called with interrupts disabled.
				344	*/
				345	static void ipi_end_io(void *data)
				346	{
				347	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
				348	struct llist_node entry, next;
				349	struct request *rq;
				350
				351	entry = llist_del_all(list);
				352
				353	while (entry) {
				354	next = entry->next;
				355	rq = llist_entry(entry, struct request, ll_list);
				356	__blk_mq_end_io(rq, rq->errors);
				357	entry = next;
				358	}
				359	}
				360
				361	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				362	struct request *rq, const int error)
				363	{
				364	struct call_single_data *data = &rq->csd;
				365
				366	rq->errors = error;
				367	rq->ll_list.next = NULL;
				368
				369	/*
				370	* If the list is non-empty, an existing IPI must already
				371	* be "in flight". If that is the case, we need not schedule
				372	* a new one.
				373	*/
				374	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
				375	data->func = ipi_end_io;
				376	data->flags = 0;
				377	__smp_call_function_single(ctx->cpu, data, 0);
				378	}
				379
				380	return true;
				381	}
Christoph Hellwig	0a06ff0	2013-11-14 14:32:07 -0800	[diff] [blame]	382	#else /* CONFIG_SMP */
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	383	static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
				384	struct request *rq, const int error)
				385	{
				386	return false;
				387	}
				388	#endif
				389
				390	/*
				391	* End IO on this request on a multiqueue enabled driver. We'll either do
				392	* it directly inline, or punt to a local IPI handler on the matching
				393	* remote CPU.
				394	*/
				395	void blk_mq_end_io(struct request *rq, int error)
				396	{
				397	struct blk_mq_ctx *ctx = rq->mq_ctx;
				398	int cpu;
				399
				400	if (!ctx->ipi_redirect)
				401	return __blk_mq_end_io(rq, error);
				402
				403	cpu = get_cpu();
				404
				405	if (cpu == ctx->cpu \|\| !cpu_online(ctx->cpu) \|\|
				406	!ipi_remote_cpu(ctx, cpu, rq, error))
				407	__blk_mq_end_io(rq, error);
				408
				409	put_cpu();
				410	}
				411	EXPORT_SYMBOL(blk_mq_end_io);
				412
				413	static void blk_mq_start_request(struct request *rq)
				414	{
				415	struct request_queue *q = rq->q;
				416
				417	trace_block_rq_issue(q, rq);
				418
				419	/*
				420	* Just mark start time and set the started bit. Due to memory
				421	* ordering, we know we'll see the correct deadline as long as
				422	* REQ_ATOMIC_STARTED is seen.
				423	*/
				424	rq->deadline = jiffies + q->rq_timeout;
				425	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				426	}
				427
				428	static void blk_mq_requeue_request(struct request *rq)
				429	{
				430	struct request_queue *q = rq->q;
				431
				432	trace_block_rq_requeue(q, rq);
				433	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
				434	}
				435
				436	struct blk_mq_timeout_data {
				437	struct blk_mq_hw_ctx *hctx;
				438	unsigned long *next;
				439	unsigned int *next_set;
				440	};
				441
				442	static void blk_mq_timeout_check(void __data, unsigned long free_tags)
				443	{
				444	struct blk_mq_timeout_data *data = __data;
				445	struct blk_mq_hw_ctx *hctx = data->hctx;
				446	unsigned int tag;
				447
				448	/* It may not be in flight yet (this is where
				449	* the REQ_ATOMIC_STARTED flag comes in). The requests are
				450	* statically allocated, so we know it's always safe to access the
				451	* memory associated with a bit offset into ->rqs[].
				452	*/
				453	tag = 0;
				454	do {
				455	struct request *rq;
				456
				457	tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
				458	if (tag >= hctx->queue_depth)
				459	break;
				460
				461	rq = hctx->rqs[tag++];
				462
				463	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
				464	continue;
				465
				466	blk_rq_check_expired(rq, data->next, data->next_set);
				467	} while (1);
				468	}
				469
				470	static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
				471	unsigned long *next,
				472	unsigned int *next_set)
				473	{
				474	struct blk_mq_timeout_data data = {
				475	.hctx = hctx,
				476	.next = next,
				477	.next_set = next_set,
				478	};
				479
				480	/*
				481	* Ask the tagging code to iterate busy requests, so we can
				482	* check them for timeout.
				483	*/
				484	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
				485	}
				486
				487	static void blk_mq_rq_timer(unsigned long data)
				488	{
				489	struct request_queue q = (struct request_queue ) data;
				490	struct blk_mq_hw_ctx *hctx;
				491	unsigned long next = 0;
				492	int i, next_set = 0;
				493
				494	queue_for_each_hw_ctx(q, hctx, i)
				495	blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
				496
				497	if (next_set)
				498	mod_timer(&q->timeout, round_jiffies_up(next));
				499	}
				500
				501	/*
				502	* Reverse check our software queue for entries that we could potentially
				503	* merge with. Currently includes a hand-wavy stop count of 8, to not spend
				504	* too much time checking for merges.
				505	*/
				506	static bool blk_mq_attempt_merge(struct request_queue *q,
				507	struct blk_mq_ctx ctx, struct bio bio)
				508	{
				509	struct request *rq;
				510	int checked = 8;
				511
				512	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
				513	int el_ret;
				514
				515	if (!checked--)
				516	break;
				517
				518	if (!blk_rq_merge_ok(rq, bio))
				519	continue;
				520
				521	el_ret = blk_try_merge(rq, bio);
				522	if (el_ret == ELEVATOR_BACK_MERGE) {
				523	if (bio_attempt_back_merge(q, rq, bio)) {
				524	ctx->rq_merged++;
				525	return true;
				526	}
				527	break;
				528	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
				529	if (bio_attempt_front_merge(q, rq, bio)) {
				530	ctx->rq_merged++;
				531	return true;
				532	}
				533	break;
				534	}
				535	}
				536
				537	return false;
				538	}
				539
				540	void blk_mq_add_timer(struct request *rq)
				541	{
				542	__blk_add_timer(rq, NULL);
				543	}
				544
				545	/*
				546	* Run this hardware queue, pulling any software queues mapped to it in.
				547	* Note that this function currently has various problems around ordering
				548	* of IO. In particular, we'd like FIFO behaviour on handling existing
				549	* items on the hctx->dispatch list. Ignore that for now.
				550	*/
				551	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
				552	{
				553	struct request_queue *q = hctx->queue;
				554	struct blk_mq_ctx *ctx;
				555	struct request *rq;
				556	LIST_HEAD(rq_list);
				557	int bit, queued;
				558
				559	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				560	return;
				561
				562	hctx->run++;
				563
				564	/*
				565	* Touch any software queue that has pending entries.
				566	*/
				567	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
				568	clear_bit(bit, hctx->ctx_map);
				569	ctx = hctx->ctxs[bit];
				570	BUG_ON(bit != ctx->index_hw);
				571
				572	spin_lock(&ctx->lock);
				573	list_splice_tail_init(&ctx->rq_list, &rq_list);
				574	spin_unlock(&ctx->lock);
				575	}
				576
				577	/*
				578	* If we have previous entries on our dispatch list, grab them
				579	* and stuff them at the front for more fair dispatch.
				580	*/
				581	if (!list_empty_careful(&hctx->dispatch)) {
				582	spin_lock(&hctx->lock);
				583	if (!list_empty(&hctx->dispatch))
				584	list_splice_init(&hctx->dispatch, &rq_list);
				585	spin_unlock(&hctx->lock);
				586	}
				587
				588	/*
				589	* Delete and return all entries from our dispatch list
				590	*/
				591	queued = 0;
				592
				593	/*
				594	* Now process all the entries, sending them to the driver.
				595	*/
				596	while (!list_empty(&rq_list)) {
				597	int ret;
				598
				599	rq = list_first_entry(&rq_list, struct request, queuelist);
				600	list_del_init(&rq->queuelist);
				601	blk_mq_start_request(rq);
				602
				603	/*
				604	* Last request in the series. Flag it as such, this
				605	* enables drivers to know when IO should be kicked off,
				606	* if they don't do it on a per-request basis.
				607	*
				608	* Note: the flag isn't the only condition drivers
				609	* should do kick off. If drive is busy, the last
				610	* request might not have the bit set.
				611	*/
				612	if (list_empty(&rq_list))
				613	rq->cmd_flags \|= REQ_END;
				614
				615	ret = q->mq_ops->queue_rq(hctx, rq);
				616	switch (ret) {
				617	case BLK_MQ_RQ_QUEUE_OK:
				618	queued++;
				619	continue;
				620	case BLK_MQ_RQ_QUEUE_BUSY:
				621	/*
				622	* FIXME: we should have a mechanism to stop the queue
				623	* like blk_stop_queue, otherwise we will waste cpu
				624	* time
				625	*/
				626	list_add(&rq->queuelist, &rq_list);
				627	blk_mq_requeue_request(rq);
				628	break;
				629	default:
				630	pr_err("blk-mq: bad return on queue: %d\n", ret);
				631	rq->errors = -EIO;
				632	case BLK_MQ_RQ_QUEUE_ERROR:
				633	blk_mq_end_io(rq, rq->errors);
				634	break;
				635	}
				636
				637	if (ret == BLK_MQ_RQ_QUEUE_BUSY)
				638	break;
				639	}
				640
				641	if (!queued)
				642	hctx->dispatched[0]++;
				643	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
				644	hctx->dispatched[ilog2(queued) + 1]++;
				645
				646	/*
				647	* Any items that need requeuing? Stuff them into hctx->dispatch,
				648	* that is where we will continue on next queue run.
				649	*/
				650	if (!list_empty(&rq_list)) {
				651	spin_lock(&hctx->lock);
				652	list_splice(&rq_list, &hctx->dispatch);
				653	spin_unlock(&hctx->lock);
				654	}
				655	}
				656
				657	void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
				658	{
				659	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->flags)))
				660	return;
				661
				662	if (!async)
				663	__blk_mq_run_hw_queue(hctx);
				664	else {
				665	struct request_queue *q = hctx->queue;
				666
				667	kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
				668	}
				669	}
				670
				671	void blk_mq_run_queues(struct request_queue *q, bool async)
				672	{
				673	struct blk_mq_hw_ctx *hctx;
				674	int i;
				675
				676	queue_for_each_hw_ctx(q, hctx, i) {
				677	if ((!blk_mq_hctx_has_pending(hctx) &&
				678	list_empty_careful(&hctx->dispatch)) \|\|
				679	test_bit(BLK_MQ_S_STOPPED, &hctx->flags))
				680	continue;
				681
				682	blk_mq_run_hw_queue(hctx, async);
				683	}
				684	}
				685	EXPORT_SYMBOL(blk_mq_run_queues);
				686
				687	void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
				688	{
				689	cancel_delayed_work(&hctx->delayed_work);
				690	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
				691	}
				692	EXPORT_SYMBOL(blk_mq_stop_hw_queue);
				693
Christoph Hellwig	280d45f	2013-10-25 14:45:58 +0100	[diff] [blame]	694	void blk_mq_stop_hw_queues(struct request_queue *q)
				695	{
				696	struct blk_mq_hw_ctx *hctx;
				697	int i;
				698
				699	queue_for_each_hw_ctx(q, hctx, i)
				700	blk_mq_stop_hw_queue(hctx);
				701	}
				702	EXPORT_SYMBOL(blk_mq_stop_hw_queues);
				703
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	704	void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
				705	{
				706	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				707	__blk_mq_run_hw_queue(hctx);
				708	}
				709	EXPORT_SYMBOL(blk_mq_start_hw_queue);
				710
				711	void blk_mq_start_stopped_hw_queues(struct request_queue *q)
				712	{
				713	struct blk_mq_hw_ctx *hctx;
				714	int i;
				715
				716	queue_for_each_hw_ctx(q, hctx, i) {
				717	if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
				718	continue;
				719
				720	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
				721	blk_mq_run_hw_queue(hctx, true);
				722	}
				723	}
				724	EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
				725
				726	static void blk_mq_work_fn(struct work_struct *work)
				727	{
				728	struct blk_mq_hw_ctx *hctx;
				729
				730	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
				731	__blk_mq_run_hw_queue(hctx);
				732	}
				733
				734	static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				735	struct request *rq)
				736	{
				737	struct blk_mq_ctx *ctx = rq->mq_ctx;
				738
Jens Axboe	01b983c	2013-11-19 18:59:10 -0700	[diff] [blame]	739	trace_block_rq_insert(hctx->queue, rq);
				740
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	741	list_add_tail(&rq->queuelist, &ctx->rq_list);
				742	blk_mq_hctx_mark_pending(hctx, ctx);
				743
				744	/*
				745	* We do this early, to ensure we are on the right CPU.
				746	*/
				747	blk_mq_add_timer(rq);
				748	}
				749
				750	void blk_mq_insert_request(struct request_queue q, struct request rq,
				751	bool run_queue)
				752	{
				753	struct blk_mq_hw_ctx *hctx;
				754	struct blk_mq_ctx ctx, current_ctx;
				755
				756	ctx = rq->mq_ctx;
				757	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				758
				759	if (rq->cmd_flags & (REQ_FLUSH \| REQ_FUA)) {
				760	blk_insert_flush(rq);
				761	} else {
				762	current_ctx = blk_mq_get_ctx(q);
				763
				764	if (!cpu_online(ctx->cpu)) {
				765	ctx = current_ctx;
				766	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				767	rq->mq_ctx = ctx;
				768	}
				769	spin_lock(&ctx->lock);
				770	__blk_mq_insert_request(hctx, rq);
				771	spin_unlock(&ctx->lock);
				772
				773	blk_mq_put_ctx(current_ctx);
				774	}
				775
				776	if (run_queue)
				777	__blk_mq_run_hw_queue(hctx);
				778	}
				779	EXPORT_SYMBOL(blk_mq_insert_request);
				780
				781	/*
				782	* This is a special version of blk_mq_insert_request to bypass FLUSH request
				783	* check. Should only be used internally.
				784	*/
				785	void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
				786	{
				787	struct request_queue *q = rq->q;
				788	struct blk_mq_hw_ctx *hctx;
				789	struct blk_mq_ctx ctx, current_ctx;
				790
				791	current_ctx = blk_mq_get_ctx(q);
				792
				793	ctx = rq->mq_ctx;
				794	if (!cpu_online(ctx->cpu)) {
				795	ctx = current_ctx;
				796	rq->mq_ctx = ctx;
				797	}
				798	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				799
				800	/* ctx->cpu might be offline */
				801	spin_lock(&ctx->lock);
				802	__blk_mq_insert_request(hctx, rq);
				803	spin_unlock(&ctx->lock);
				804
				805	blk_mq_put_ctx(current_ctx);
				806
				807	if (run_queue)
				808	blk_mq_run_hw_queue(hctx, async);
				809	}
				810
				811	static void blk_mq_insert_requests(struct request_queue *q,
				812	struct blk_mq_ctx *ctx,
				813	struct list_head *list,
				814	int depth,
				815	bool from_schedule)
				816
				817	{
				818	struct blk_mq_hw_ctx *hctx;
				819	struct blk_mq_ctx *current_ctx;
				820
				821	trace_block_unplug(q, depth, !from_schedule);
				822
				823	current_ctx = blk_mq_get_ctx(q);
				824
				825	if (!cpu_online(ctx->cpu))
				826	ctx = current_ctx;
				827	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				828
				829	/*
				830	* preemption doesn't flush plug list, so it's possible ctx->cpu is
				831	* offline now
				832	*/
				833	spin_lock(&ctx->lock);
				834	while (!list_empty(list)) {
				835	struct request *rq;
				836
				837	rq = list_first_entry(list, struct request, queuelist);
				838	list_del_init(&rq->queuelist);
				839	rq->mq_ctx = ctx;
				840	__blk_mq_insert_request(hctx, rq);
				841	}
				842	spin_unlock(&ctx->lock);
				843
				844	blk_mq_put_ctx(current_ctx);
				845
				846	blk_mq_run_hw_queue(hctx, from_schedule);
				847	}
				848
				849	static int plug_ctx_cmp(void priv, struct list_head a, struct list_head *b)
				850	{
				851	struct request *rqa = container_of(a, struct request, queuelist);
				852	struct request *rqb = container_of(b, struct request, queuelist);
				853
				854	return !(rqa->mq_ctx < rqb->mq_ctx \|\|
				855	(rqa->mq_ctx == rqb->mq_ctx &&
				856	blk_rq_pos(rqa) < blk_rq_pos(rqb)));
				857	}
				858
				859	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
				860	{
				861	struct blk_mq_ctx *this_ctx;
				862	struct request_queue *this_q;
				863	struct request *rq;
				864	LIST_HEAD(list);
				865	LIST_HEAD(ctx_list);
				866	unsigned int depth;
				867
				868	list_splice_init(&plug->mq_list, &list);
				869
				870	list_sort(NULL, &list, plug_ctx_cmp);
				871
				872	this_q = NULL;
				873	this_ctx = NULL;
				874	depth = 0;
				875
				876	while (!list_empty(&list)) {
				877	rq = list_entry_rq(list.next);
				878	list_del_init(&rq->queuelist);
				879	BUG_ON(!rq->q);
				880	if (rq->mq_ctx != this_ctx) {
				881	if (this_ctx) {
				882	blk_mq_insert_requests(this_q, this_ctx,
				883	&ctx_list, depth,
				884	from_schedule);
				885	}
				886
				887	this_ctx = rq->mq_ctx;
				888	this_q = rq->q;
				889	depth = 0;
				890	}
				891
				892	depth++;
				893	list_add_tail(&rq->queuelist, &ctx_list);
				894	}
				895
				896	/*
				897	* If 'this_ctx' is set, we know we have entries to complete
				898	* on 'ctx_list'. Do those.
				899	*/
				900	if (this_ctx) {
				901	blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				902	from_schedule);
				903	}
				904	}
				905
				906	static void blk_mq_bio_to_request(struct request rq, struct bio bio)
				907	{
				908	init_request_from_bio(rq, bio);
				909	blk_account_io_start(rq, 1);
				910	}
				911
				912	static void blk_mq_make_request(struct request_queue q, struct bio bio)
				913	{
				914	struct blk_mq_hw_ctx *hctx;
				915	struct blk_mq_ctx *ctx;
				916	const int is_sync = rw_is_sync(bio->bi_rw);
				917	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH \| REQ_FUA);
				918	int rw = bio_data_dir(bio);
				919	struct request *rq;
				920	unsigned int use_plug, request_count = 0;
				921
				922	/*
				923	* If we have multiple hardware queues, just go directly to
				924	* one of those for sync IO.
				925	*/
				926	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) \|\| !is_sync);
				927
				928	blk_queue_bounce(q, &bio);
				929
				930	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
				931	return;
				932
				933	if (blk_mq_queue_enter(q)) {
				934	bio_endio(bio, -EIO);
				935	return;
				936	}
				937
				938	ctx = blk_mq_get_ctx(q);
				939	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				940
				941	trace_block_getrq(q, bio, rw);
				942	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
				943	if (likely(rq))
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	944	blk_mq_rq_ctx_init(q, ctx, rq, rw);
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	945	else {
				946	blk_mq_put_ctx(ctx);
				947	trace_block_sleeprq(q, bio, rw);
				948	rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT\|GFP_ATOMIC,
				949	false);
				950	ctx = rq->mq_ctx;
				951	hctx = q->mq_ops->map_queue(q, ctx->cpu);
				952	}
				953
				954	hctx->queued++;
				955
				956	if (unlikely(is_flush_fua)) {
				957	blk_mq_bio_to_request(rq, bio);
				958	blk_mq_put_ctx(ctx);
				959	blk_insert_flush(rq);
				960	goto run_queue;
				961	}
				962
				963	/*
				964	* A task plug currently exists. Since this is completely lockless,
				965	* utilize that to temporarily store requests until the task is
				966	* either done or scheduled away.
				967	*/
				968	if (use_plug) {
				969	struct blk_plug *plug = current->plug;
				970
				971	if (plug) {
				972	blk_mq_bio_to_request(rq, bio);
Shaohua Li	92f399c	2013-10-29 12:01:03 -0600	[diff] [blame]	973	if (list_empty(&plug->mq_list))
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	974	trace_block_plug(q);
				975	else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				976	blk_flush_plug_list(plug, false);
				977	trace_block_plug(q);
				978	}
				979	list_add_tail(&rq->queuelist, &plug->mq_list);
				980	blk_mq_put_ctx(ctx);
				981	return;
				982	}
				983	}
				984
				985	spin_lock(&ctx->lock);
				986
				987	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
				988	blk_mq_attempt_merge(q, ctx, bio))
				989	__blk_mq_free_request(hctx, ctx, rq);
				990	else {
				991	blk_mq_bio_to_request(rq, bio);
				992	__blk_mq_insert_request(hctx, rq);
				993	}
				994
				995	spin_unlock(&ctx->lock);
				996	blk_mq_put_ctx(ctx);
				997
				998	/*
				999	* For a SYNC request, send it to the hardware immediately. For an
				1000	* ASYNC request, just ensure that we run it later on. The latter
				1001	* allows for merging opportunities and more efficient dispatching.
				1002	*/
				1003	run_queue:
				1004	blk_mq_run_hw_queue(hctx, !is_sync \|\| is_flush_fua);
				1005	}
				1006
				1007	/*
				1008	* Default mapping to a software queue, since we use one per CPU.
				1009	*/
				1010	struct blk_mq_hw_ctx blk_mq_map_queue(struct request_queue q, const int cpu)
				1011	{
				1012	return q->queue_hw_ctx[q->mq_map[cpu]];
				1013	}
				1014	EXPORT_SYMBOL(blk_mq_map_queue);
				1015
				1016	struct blk_mq_hw_ctx blk_mq_alloc_single_hw_queue(struct blk_mq_reg reg,
				1017	unsigned int hctx_index)
				1018	{
				1019	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				1020	GFP_KERNEL \| __GFP_ZERO, reg->numa_node);
				1021	}
				1022	EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
				1023
				1024	void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				1025	unsigned int hctx_index)
				1026	{
				1027	kfree(hctx);
				1028	}
				1029	EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
				1030
				1031	static void blk_mq_hctx_notify(void *data, unsigned long action,
				1032	unsigned int cpu)
				1033	{
				1034	struct blk_mq_hw_ctx *hctx = data;
				1035	struct blk_mq_ctx *ctx;
				1036	LIST_HEAD(tmp);
				1037
				1038	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
				1039	return;
				1040
				1041	/*
				1042	* Move ctx entries to new CPU, if this one is going away.
				1043	*/
				1044	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
				1045
				1046	spin_lock(&ctx->lock);
				1047	if (!list_empty(&ctx->rq_list)) {
				1048	list_splice_init(&ctx->rq_list, &tmp);
				1049	clear_bit(ctx->index_hw, hctx->ctx_map);
				1050	}
				1051	spin_unlock(&ctx->lock);
				1052
				1053	if (list_empty(&tmp))
				1054	return;
				1055
				1056	ctx = blk_mq_get_ctx(hctx->queue);
				1057	spin_lock(&ctx->lock);
				1058
				1059	while (!list_empty(&tmp)) {
				1060	struct request *rq;
				1061
				1062	rq = list_first_entry(&tmp, struct request, queuelist);
				1063	rq->mq_ctx = ctx;
				1064	list_move_tail(&rq->queuelist, &ctx->rq_list);
				1065	}
				1066
				1067	blk_mq_hctx_mark_pending(hctx, ctx);
				1068
				1069	spin_unlock(&ctx->lock);
				1070	blk_mq_put_ctx(ctx);
				1071	}
				1072
				1073	static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				1074	void (init)(void , struct blk_mq_hw_ctx *,
				1075	struct request *, unsigned int),
				1076	void *data)
				1077	{
				1078	unsigned int i;
				1079
				1080	for (i = 0; i < hctx->queue_depth; i++) {
				1081	struct request *rq = hctx->rqs[i];
				1082
				1083	init(data, hctx, rq, i);
				1084	}
				1085	}
				1086
				1087	void blk_mq_init_commands(struct request_queue *q,
				1088	void (init)(void , struct blk_mq_hw_ctx *,
				1089	struct request *, unsigned int),
				1090	void *data)
				1091	{
				1092	struct blk_mq_hw_ctx *hctx;
				1093	unsigned int i;
				1094
				1095	queue_for_each_hw_ctx(q, hctx, i)
				1096	blk_mq_init_hw_commands(hctx, init, data);
				1097	}
				1098	EXPORT_SYMBOL(blk_mq_init_commands);
				1099
				1100	static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
				1101	{
				1102	struct page *page;
				1103
				1104	while (!list_empty(&hctx->page_list)) {
				1105	page = list_first_entry(&hctx->page_list, struct page, list);
				1106	list_del_init(&page->list);
				1107	__free_pages(page, page->private);
				1108	}
				1109
				1110	kfree(hctx->rqs);
				1111
				1112	if (hctx->tags)
				1113	blk_mq_free_tags(hctx->tags);
				1114	}
				1115
				1116	static size_t order_to_size(unsigned int order)
				1117	{
				1118	size_t ret = PAGE_SIZE;
				1119
				1120	while (order--)
				1121	ret *= 2;
				1122
				1123	return ret;
				1124	}
				1125
				1126	static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
				1127	unsigned int reserved_tags, int node)
				1128	{
				1129	unsigned int i, j, entries_per_page, max_order = 4;
				1130	size_t rq_size, left;
				1131
				1132	INIT_LIST_HEAD(&hctx->page_list);
				1133
				1134	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				1135	GFP_KERNEL, node);
				1136	if (!hctx->rqs)
				1137	return -ENOMEM;
				1138
				1139	/*
				1140	* rq_size is the size of the request plus driver payload, rounded
				1141	* to the cacheline size
				1142	*/
				1143	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				1144	cache_line_size());
				1145	left = rq_size * hctx->queue_depth;
				1146
				1147	for (i = 0; i < hctx->queue_depth;) {
				1148	int this_order = max_order;
				1149	struct page *page;
				1150	int to_do;
				1151	void *p;
				1152
				1153	while (left < order_to_size(this_order - 1) && this_order)
				1154	this_order--;
				1155
				1156	do {
				1157	page = alloc_pages_node(node, GFP_KERNEL, this_order);
				1158	if (page)
				1159	break;
				1160	if (!this_order--)
				1161	break;
				1162	if (order_to_size(this_order) < rq_size)
				1163	break;
				1164	} while (1);
				1165
				1166	if (!page)
				1167	break;
				1168
				1169	page->private = this_order;
				1170	list_add_tail(&page->list, &hctx->page_list);
				1171
				1172	p = page_address(page);
				1173	entries_per_page = order_to_size(this_order) / rq_size;
				1174	to_do = min(entries_per_page, hctx->queue_depth - i);
				1175	left -= to_do * rq_size;
				1176	for (j = 0; j < to_do; j++) {
				1177	hctx->rqs[i] = p;
				1178	blk_mq_rq_init(hctx, hctx->rqs[i]);
				1179	p += rq_size;
				1180	i++;
				1181	}
				1182	}
				1183
				1184	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
				1185	goto err_rq_map;
				1186	else if (i != hctx->queue_depth) {
				1187	hctx->queue_depth = i;
				1188	pr_warn("%s: queue depth set to %u because of low memory\n",
				1189	__func__, i);
				1190	}
				1191
				1192	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
				1193	if (!hctx->tags) {
				1194	err_rq_map:
				1195	blk_mq_free_rq_map(hctx);
				1196	return -ENOMEM;
				1197	}
				1198
				1199	return 0;
				1200	}
				1201
				1202	static int blk_mq_init_hw_queues(struct request_queue *q,
				1203	struct blk_mq_reg reg, void driver_data)
				1204	{
				1205	struct blk_mq_hw_ctx *hctx;
				1206	unsigned int i, j;
				1207
				1208	/*
				1209	* Initialize hardware queues
				1210	*/
				1211	queue_for_each_hw_ctx(q, hctx, i) {
				1212	unsigned int num_maps;
				1213	int node;
				1214
				1215	node = hctx->numa_node;
				1216	if (node == NUMA_NO_NODE)
				1217	node = hctx->numa_node = reg->numa_node;
				1218
				1219	INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
				1220	spin_lock_init(&hctx->lock);
				1221	INIT_LIST_HEAD(&hctx->dispatch);
				1222	hctx->queue = q;
				1223	hctx->queue_num = i;
				1224	hctx->flags = reg->flags;
				1225	hctx->queue_depth = reg->queue_depth;
				1226	hctx->cmd_size = reg->cmd_size;
				1227
				1228	blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
				1229	blk_mq_hctx_notify, hctx);
				1230	blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
				1231
				1232	if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
				1233	break;
				1234
				1235	/*
				1236	* Allocate space for all possible cpus to avoid allocation in
				1237	* runtime
				1238	*/
				1239	hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
				1240	GFP_KERNEL, node);
				1241	if (!hctx->ctxs)
				1242	break;
				1243
				1244	num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
				1245	hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
				1246	GFP_KERNEL, node);
				1247	if (!hctx->ctx_map)
				1248	break;
				1249
				1250	hctx->nr_ctx_map = num_maps;
				1251	hctx->nr_ctx = 0;
				1252
				1253	if (reg->ops->init_hctx &&
				1254	reg->ops->init_hctx(hctx, driver_data, i))
				1255	break;
				1256	}
				1257
				1258	if (i == q->nr_hw_queues)
				1259	return 0;
				1260
				1261	/*
				1262	* Init failed
				1263	*/
				1264	queue_for_each_hw_ctx(q, hctx, j) {
				1265	if (i == j)
				1266	break;
				1267
				1268	if (reg->ops->exit_hctx)
				1269	reg->ops->exit_hctx(hctx, j);
				1270
				1271	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1272	blk_mq_free_rq_map(hctx);
				1273	kfree(hctx->ctxs);
				1274	}
				1275
				1276	return 1;
				1277	}
				1278
				1279	static void blk_mq_init_cpu_queues(struct request_queue *q,
				1280	unsigned int nr_hw_queues)
				1281	{
				1282	unsigned int i;
				1283
				1284	for_each_possible_cpu(i) {
				1285	struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
				1286	struct blk_mq_hw_ctx *hctx;
				1287
				1288	memset(__ctx, 0, sizeof(*__ctx));
				1289	__ctx->cpu = i;
				1290	spin_lock_init(&__ctx->lock);
				1291	INIT_LIST_HEAD(&__ctx->rq_list);
				1292	__ctx->queue = q;
				1293
				1294	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1295	hctx = q->mq_ops->map_queue(q, i);
				1296	hctx->nr_ctx++;
				1297
				1298	if (!cpu_online(i))
				1299	continue;
				1300
				1301	/*
				1302	* Set local node, IFF we have more than one hw queue. If
				1303	* not, we remain on the home node of the device
				1304	*/
				1305	if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
				1306	hctx->numa_node = cpu_to_node(i);
				1307	}
				1308	}
				1309
				1310	static void blk_mq_map_swqueue(struct request_queue *q)
				1311	{
				1312	unsigned int i;
				1313	struct blk_mq_hw_ctx *hctx;
				1314	struct blk_mq_ctx *ctx;
				1315
				1316	queue_for_each_hw_ctx(q, hctx, i) {
				1317	hctx->nr_ctx = 0;
				1318	}
				1319
				1320	/*
				1321	* Map software to hardware queues
				1322	*/
				1323	queue_for_each_ctx(q, ctx, i) {
				1324	/* If the cpu isn't online, the cpu is mapped to first hctx */
				1325	hctx = q->mq_ops->map_queue(q, i);
				1326	ctx->index_hw = hctx->nr_ctx;
				1327	hctx->ctxs[hctx->nr_ctx++] = ctx;
				1328	}
				1329	}
				1330
				1331	struct request_queue blk_mq_init_queue(struct blk_mq_reg reg,
				1332	void *driver_data)
				1333	{
				1334	struct blk_mq_hw_ctx **hctxs;
				1335	struct blk_mq_ctx *ctx;
				1336	struct request_queue *q;
				1337	int i;
				1338
				1339	if (!reg->nr_hw_queues \|\|
				1340	!reg->ops->queue_rq \|\| !reg->ops->map_queue \|\|
				1341	!reg->ops->alloc_hctx \|\| !reg->ops->free_hctx)
				1342	return ERR_PTR(-EINVAL);
				1343
				1344	if (!reg->queue_depth)
				1345	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1346	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
				1347	pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
				1348	reg->queue_depth = BLK_MQ_MAX_DEPTH;
				1349	}
				1350
Christoph Hellwig	3228f48	2013-10-28 13:33:58 -0600	[diff] [blame]	1351	/*
				1352	* Set aside a tag for flush requests. It will only be used while
				1353	* another flush request is in progress but outside the driver.
				1354	*
				1355	* TODO: only allocate if flushes are supported
				1356	*/
				1357	reg->queue_depth++;
				1358	reg->reserved_tags++;
				1359
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1360	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
				1361	return ERR_PTR(-EINVAL);
				1362
				1363	ctx = alloc_percpu(struct blk_mq_ctx);
				1364	if (!ctx)
				1365	return ERR_PTR(-ENOMEM);
				1366
				1367	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
				1368	reg->numa_node);
				1369
				1370	if (!hctxs)
				1371	goto err_percpu;
				1372
				1373	for (i = 0; i < reg->nr_hw_queues; i++) {
				1374	hctxs[i] = reg->ops->alloc_hctx(reg, i);
				1375	if (!hctxs[i])
				1376	goto err_hctxs;
				1377
				1378	hctxs[i]->numa_node = NUMA_NO_NODE;
				1379	hctxs[i]->queue_num = i;
				1380	}
				1381
				1382	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
				1383	if (!q)
				1384	goto err_hctxs;
				1385
				1386	q->mq_map = blk_mq_make_queue_map(reg);
				1387	if (!q->mq_map)
				1388	goto err_map;
				1389
				1390	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
				1391	blk_queue_rq_timeout(q, 30000);
				1392
				1393	q->nr_queues = nr_cpu_ids;
				1394	q->nr_hw_queues = reg->nr_hw_queues;
				1395
				1396	q->queue_ctx = ctx;
				1397	q->queue_hw_ctx = hctxs;
				1398
				1399	q->mq_ops = reg->ops;
Jens Axboe	94eddfb	2013-11-19 09:25:07 -0700	[diff] [blame]	1400	q->queue_flags \|= QUEUE_FLAG_MQ_DEFAULT;
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1401
				1402	blk_queue_make_request(q, blk_mq_make_request);
				1403	blk_queue_rq_timed_out(q, reg->ops->timeout);
				1404	if (reg->timeout)
				1405	blk_queue_rq_timeout(q, reg->timeout);
				1406
				1407	blk_mq_init_flush(q);
				1408	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
				1409
				1410	if (blk_mq_init_hw_queues(q, reg, driver_data))
				1411	goto err_hw;
				1412
				1413	blk_mq_map_swqueue(q);
				1414
				1415	mutex_lock(&all_q_mutex);
				1416	list_add_tail(&q->all_q_node, &all_q_list);
				1417	mutex_unlock(&all_q_mutex);
				1418
				1419	return q;
				1420	err_hw:
				1421	kfree(q->mq_map);
				1422	err_map:
				1423	blk_cleanup_queue(q);
				1424	err_hctxs:
				1425	for (i = 0; i < reg->nr_hw_queues; i++) {
				1426	if (!hctxs[i])
				1427	break;
				1428	reg->ops->free_hctx(hctxs[i], i);
				1429	}
				1430	kfree(hctxs);
				1431	err_percpu:
				1432	free_percpu(ctx);
				1433	return ERR_PTR(-ENOMEM);
				1434	}
				1435	EXPORT_SYMBOL(blk_mq_init_queue);
				1436
				1437	void blk_mq_free_queue(struct request_queue *q)
				1438	{
				1439	struct blk_mq_hw_ctx *hctx;
				1440	int i;
				1441
				1442	queue_for_each_hw_ctx(q, hctx, i) {
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1443	kfree(hctx->ctx_map);
				1444	kfree(hctx->ctxs);
				1445	blk_mq_free_rq_map(hctx);
				1446	blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
				1447	if (q->mq_ops->exit_hctx)
				1448	q->mq_ops->exit_hctx(hctx, i);
				1449	q->mq_ops->free_hctx(hctx, i);
				1450	}
				1451
				1452	free_percpu(q->queue_ctx);
				1453	kfree(q->queue_hw_ctx);
				1454	kfree(q->mq_map);
				1455
				1456	q->queue_ctx = NULL;
				1457	q->queue_hw_ctx = NULL;
				1458	q->mq_map = NULL;
				1459
				1460	mutex_lock(&all_q_mutex);
				1461	list_del_init(&q->all_q_node);
				1462	mutex_unlock(&all_q_mutex);
				1463	}
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1464
				1465	/* Basically redo blk_mq_init_queue with queue frozen */
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1466	static void blk_mq_queue_reinit(struct request_queue *q)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1467	{
				1468	blk_mq_freeze_queue(q);
				1469
				1470	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
				1471
				1472	/*
				1473	* redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
				1474	* we should change hctx numa_node according to new topology (this
				1475	* involves free and re-allocate memory, worthy doing?)
				1476	*/
				1477
				1478	blk_mq_map_swqueue(q);
				1479
				1480	blk_mq_unfreeze_queue(q);
				1481	}
				1482
Paul Gortmaker	f618ef7	2013-11-14 08:26:02 -0700	[diff] [blame]	1483	static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				1484	unsigned long action, void *hcpu)
Jens Axboe	320ae51	2013-10-24 09:20:05 +0100	[diff] [blame]	1485	{
				1486	struct request_queue *q;
				1487
				1488	/*
				1489	* Before new mapping is established, hotadded cpu might already start
				1490	* handling requests. This doesn't break anything as we map offline
				1491	* CPUs to first hardware queue. We will re-init queue below to get
				1492	* optimal settings.
				1493	*/
				1494	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
				1495	action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
				1496	return NOTIFY_OK;
				1497
				1498	mutex_lock(&all_q_mutex);
				1499	list_for_each_entry(q, &all_q_list, all_q_node)
				1500	blk_mq_queue_reinit(q);
				1501	mutex_unlock(&all_q_mutex);
				1502	return NOTIFY_OK;
				1503	}
				1504
				1505	static int __init blk_mq_init(void)
				1506	{
				1507	unsigned int i;
				1508
				1509	for_each_possible_cpu(i)
				1510	init_llist_head(&per_cpu(ipi_lists, i));
				1511
				1512	blk_mq_cpu_init();
				1513
				1514	/* Must be called after percpu_counter_hotcpu_callback() */
				1515	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);
				1516
				1517	return 0;
				1518	}
				1519	subsys_initcall(blk_mq_init);