| 1 | /* |
| 2 | * Budget Fair Queueing (BFQ) I/O scheduler. |
| 3 | * |
| 4 | * Based on ideas and code from CFQ: |
| 5 | * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> |
| 6 | * |
| 7 | * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> |
| 8 | * Paolo Valente <paolo.valente@unimore.it> |
| 9 | * |
| 10 | * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> |
| 11 | * Arianna Avanzini <avanzini@google.com> |
| 12 | * |
| 13 | * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> |
| 14 | * |
| 15 | * This program is free software; you can redistribute it and/or |
| 16 | * modify it under the terms of the GNU General Public License as |
| 17 | * published by the Free Software Foundation; either version 2 of the |
| 18 | * License, or (at your option) any later version. |
| 19 | * |
| 20 | * This program is distributed in the hope that it will be useful, |
| 21 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 22 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 23 | * General Public License for more details. |
| 24 | * |
| 25 | * BFQ is a proportional-share I/O scheduler, with some extra |
| 26 | * low-latency capabilities. BFQ also supports full hierarchical |
| 27 | * scheduling through cgroups. The next paragraphs provide an |
| 28 | * introduction to BFQ's inner workings. Details on BFQ benefits, usage and |
| 29 | * limitations can be found in Documentation/block/bfq-iosched.txt. |
| 30 | * |
| 31 | * BFQ is a proportional-share storage-I/O scheduling algorithm based |
| 32 | * on the slice-by-slice service scheme of CFQ. But BFQ assigns |
| 33 | * budgets, measured in number of sectors, to processes instead of |
| 34 | * time slices. The device is not granted to the in-service process |
| 35 | * for a given time slice, but until it has exhausted its assigned |
| 36 | * budget. This change from the time to the service domain enables BFQ |
| 37 | * to distribute the device throughput among processes as desired, |
| 38 | * without any distortion due to throughput fluctuations, or to device |
| 39 | * internal queueing. BFQ uses an ad hoc internal scheduler, called |
| 40 | * B-WF2Q+, to schedule processes according to their budgets. More |
| 41 | * precisely, BFQ schedules queues associated with processes. Each |
| 42 | * process/queue is assigned a user-configurable weight, and B-WF2Q+ |
| 43 | * guarantees that each queue receives a fraction of the throughput |
| 44 | * proportional to its weight. Thanks to the accurate policy of |
| 45 | * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound |
| 46 | * processes issuing sequential requests (to boost the throughput), |
| 47 | * and yet guarantee a low latency to interactive and soft real-time |
| 48 | * applications. |
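 *
 * As a simple illustrative example (the numbers are hypothetical):
 * if two I/O-bound processes are assigned weights 100 and 300, then
 * B-WF2Q+ lets the first receive about 25% and the second about 75%
 * of the sectors transferred by the device, regardless of request
 * sizes or of fluctuations in the device speed.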
| 49 | * |
| 50 | * In particular, to provide these low-latency guarantees, BFQ |
| 51 | * explicitly privileges the I/O of two classes of time-sensitive |
| 52 | * applications: interactive and soft real-time. This feature enables |
| 53 | * BFQ to provide applications in these classes with a very low |
| 54 | * latency. Finally, BFQ also features additional heuristics for |
| 55 | * preserving both a low latency and a high throughput on NCQ-capable, |
| 56 | * rotational or flash-based devices, and to get the job done quickly |
| 57 | * for applications consisting of many I/O-bound processes. |
| 58 | * |
| 59 | * BFQ is described in [1], which also contains a reference to the |
| 60 | * initial, more theoretical paper on BFQ. In the latter paper, the |
| 61 | * interested reader can find full details on the main algorithm, as |
| 62 | * well as formulas of the guarantees and formal proofs of all the properties. |
| 63 | * With respect to the version of BFQ presented in these papers, this |
| 64 | * implementation adds a few more heuristics, such as the one that |
| 65 | * guarantees a low latency to soft real-time applications, and a |
| 66 | * hierarchical extension based on H-WF2Q+. |
| 67 | * |
| 68 | * B-WF2Q+ is based on WF2Q+, which is described in [2], together with |
| 69 | * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ |
| 70 | * with O(log N) complexity derives from the one introduced with EEVDF |
| 71 | * in [3]. |
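 *
 * In short, B-WF2Q+ associates with each entity a virtual start time
 * S and a virtual finish time F = S + budget/weight (see
 * bfq_calc_finish() below); an entity is eligible when S does not
 * exceed the scheduler virtual time V, and, among eligible entities,
 * the one with the smallest F is served first.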
| 72 | * |
| 73 | * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O |
| 74 | * Scheduler", Proceedings of the First Workshop on Mobile System |
| 75 | * Technologies (MST-2015), May 2015. |
| 76 | * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf |
| 77 | * |
| 78 | * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing |
| 79 | * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689, |
| 80 | * Oct 1997. |
| 81 | * |
| 82 | * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz |
| 83 | * |
| 84 | * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline |
| 85 | * First: A Flexible and Accurate Mechanism for Proportional Share |
| 86 | * Resource Allocation", technical report. |
| 87 | * |
| 88 | * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf |
| 89 | */ |
| 90 | #include <linux/module.h> |
| 91 | #include <linux/slab.h> |
| 92 | #include <linux/blkdev.h> |
| 93 | #include <linux/elevator.h> |
| 94 | #include <linux/ktime.h> |
| 95 | #include <linux/rbtree.h> |
| 96 | #include <linux/ioprio.h> |
| 97 | #include <linux/sbitmap.h> |
| 98 | #include <linux/delay.h> |
| 99 | |
| 100 | #include "blk.h" |
| 101 | #include "blk-mq.h" |
| 102 | #include "blk-mq-tag.h" |
| 103 | #include "blk-mq-sched.h" |
| 104 | #include <linux/blktrace_api.h> |
| 105 | #include <linux/hrtimer.h> |
| 106 | #include <linux/blk-cgroup.h> |
| 107 | |
| 108 | #define BFQ_IOPRIO_CLASSES 3 |
| 109 | #define BFQ_CL_IDLE_TIMEOUT (HZ/5) |
| 110 | |
| 111 | #define BFQ_MIN_WEIGHT 1 |
| 112 | #define BFQ_MAX_WEIGHT 1000 |
| 113 | #define BFQ_WEIGHT_CONVERSION_COEFF 10 |
| 114 | |
| 115 | #define BFQ_DEFAULT_QUEUE_IOPRIO 4 |
| 116 | |
| 117 | #define BFQ_DEFAULT_GRP_WEIGHT 10 |
| 118 | #define BFQ_DEFAULT_GRP_IOPRIO 0 |
| 119 | #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE |
| 120 | |
| 121 | struct bfq_entity; |
| 122 | |
| 123 | /** |
| 124 | * struct bfq_service_tree - per ioprio_class service tree. |
| 125 | * |
| 126 | * Each service tree represents a B-WF2Q+ scheduler on its own. Each |
| 127 | * ioprio_class has its own independent scheduler, and so its own |
| 128 | * bfq_service_tree. All the fields are protected by the queue lock |
| 129 | * of the containing bfqd. |
| 130 | */ |
| 131 | struct bfq_service_tree { |
| 132 | /* tree for active entities (i.e., those backlogged) */ |
| 133 | struct rb_root active; |
| 134 | /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ |
| 135 | struct rb_root idle; |
| 136 | |
| 137 | /* idle entity with minimum F_i */ |
| 138 | struct bfq_entity *first_idle; |
| 139 | /* idle entity with maximum F_i */ |
| 140 | struct bfq_entity *last_idle; |
| 141 | |
| 142 | /* scheduler virtual time */ |
| 143 | u64 vtime; |
| 144 | /* scheduler weight sum; active and idle entities contribute to it */ |
| 145 | unsigned long wsum; |
| 146 | }; |
| 147 | |
| 148 | /** |
| 149 | * struct bfq_sched_data - multi-class scheduler. |
| 150 | * |
| 151 | * bfq_sched_data is the basic scheduler queue. It supports three |
| 152 | * ioprio_classes, and can be used either as a toplevel queue or as |
| 153 | * an intermediate queue in a hierarchical setup. |
| 154 | * @next_in_service points to the active entity of the sched_data |
| 155 | * service trees that will be scheduled next. |
| 156 | * |
| 157 | * The supported ioprio_classes are the same as in CFQ, in descending |
| 158 | * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. |
| 159 | * Requests from higher priority queues are served before all the |
| 160 | * requests from lower priority queues; among queues of the same |
| 161 | * class, requests are served according to B-WF2Q+. |
| 162 | * All the fields are protected by the queue lock of the containing bfqd. |
| 163 | */ |
| 164 | struct bfq_sched_data { |
| 165 | /* entity in service */ |
| 166 | struct bfq_entity *in_service_entity; |
| 167 | /* head-of-the-line entity in the scheduler */ |
| 168 | struct bfq_entity *next_in_service; |
| 169 | /* array of service trees, one per ioprio_class */ |
| 170 | struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; |
| 171 | }; |
| 172 | |
| 173 | /** |
| 174 | * struct bfq_entity - schedulable entity. |
| 175 | * |
| 176 | * A bfq_entity is used to represent a bfq_queue (leaf node in the upper |
| 177 | * level scheduler). Each entity belongs to the sched_data of the parent |
| 178 | * group hierarchy. Non-leaf entities also have their own sched_data, |
| 179 | * stored in @my_sched_data. |
| 180 | * |
| 181 | * Each entity stores its priority values independently; this would |
| 182 | * allow different weights on different devices, but this |
| 183 | * functionality is not exported to userspace for now. Priorities and |
| 184 | * weights are updated lazily, first storing the new values into the |
| 185 | * new_* fields, then setting the @prio_changed flag. As soon as |
| 186 | * there is a transition in the entity state that allows the priority |
| 187 | * update to take place, the effective and the requested priority |
| 188 | * values are synchronized. |
| 189 | * |
| 190 | * The weight value is calculated from the ioprio to export the same |
| 191 | * interface as CFQ. When dealing with ``well-behaved'' queues (i.e., |
| 192 | * queues that do not spend too much time consuming their budget |
| 193 | * and have true sequential behavior, and when there are no external |
| 194 | * factors breaking anticipation) the relative weights at each level |
| 195 | * of the hierarchy should be guaranteed. All the fields are |
| 196 | * protected by the queue lock of the containing bfqd. |
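 *
 * For instance, a weight change is requested by storing the new
 * value in @new_weight and setting @prio_changed; the change is then
 * applied by __bfq_entity_update_weight_prio() the next time the
 * entity is (re)activated.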
| 197 | */ |
| 198 | struct bfq_entity { |
| 199 | /* service_tree member */ |
| 200 | struct rb_node rb_node; |
| 201 | |
| 202 | /* |
| 203 | * flag, true if the entity is on a tree (either the active or |
| 204 | * the idle one of its service_tree). |
| 205 | */ |
| 206 | int on_st; |
| 207 | |
| 208 | /* B-WF2Q+ start and finish timestamps [sectors/weight] */ |
| 209 | u64 start, finish; |
| 210 | |
| 211 | /* tree the entity is enqueued into; %NULL if not on a tree */ |
| 212 | struct rb_root *tree; |
| 213 | |
| 214 | /* |
| 215 | * minimum start time of the (active) subtree rooted at this |
| 216 | * entity; used for O(log N) lookups into active trees |
| 217 | */ |
| 218 | u64 min_start; |
| 219 | |
| 220 | /* amount of service received during the last service slot */ |
| 221 | int service; |
| 222 | |
| 223 | /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ |
| 224 | int budget; |
| 225 | |
| 226 | /* weight of the queue */ |
| 227 | int weight; |
| 228 | /* next weight if a change is in progress */ |
| 229 | int new_weight; |
| 230 | |
| 231 | /* original weight, used to implement weight boosting */ |
| 232 | int orig_weight; |
| 233 | |
| 234 | /* parent entity, for hierarchical scheduling */ |
| 235 | struct bfq_entity *parent; |
| 236 | |
| 237 | /* |
| 238 | * For non-leaf nodes in the hierarchy, the associated |
| 239 | * scheduler queue, %NULL on leaf nodes. |
| 240 | */ |
| 241 | struct bfq_sched_data *my_sched_data; |
| 242 | /* the scheduler queue this entity belongs to */ |
| 243 | struct bfq_sched_data *sched_data; |
| 244 | |
| 245 | /* flag, set to request a weight, ioprio or ioprio_class change */ |
| 246 | int prio_changed; |
| 247 | }; |
| 248 | |
| 249 | /** |
| 250 | * struct bfq_ttime - per process thinktime stats. |
| 251 | */ |
| 252 | struct bfq_ttime { |
| 253 | /* completion time of the last request */ |
| 254 | u64 last_end_request; |
| 255 | |
| 256 | /* total process thinktime */ |
| 257 | u64 ttime_total; |
| 258 | /* number of thinktime samples */ |
| 259 | unsigned long ttime_samples; |
| 260 | /* average process thinktime */ |
| 261 | u64 ttime_mean; |
| 262 | }; |
| 263 | |
| 264 | /** |
| 265 | * struct bfq_queue - leaf schedulable entity. |
| 266 | * |
| 267 | * A bfq_queue is a leaf request queue; it is associated with one |
| 268 | * io_context, or with more than one if it is async. |
| 269 | */ |
| 270 | struct bfq_queue { |
| 271 | /* reference counter */ |
| 272 | int ref; |
| 273 | /* parent bfq_data */ |
| 274 | struct bfq_data *bfqd; |
| 275 | |
| 276 | /* current ioprio and ioprio class */ |
| 277 | unsigned short ioprio, ioprio_class; |
| 278 | /* next ioprio and ioprio class if a change is in progress */ |
| 279 | unsigned short new_ioprio, new_ioprio_class; |
| 280 | |
| 281 | /* sorted list of pending requests */ |
| 282 | struct rb_root sort_list; |
| 283 | /* if fifo isn't expired, next request to serve */ |
| 284 | struct request *next_rq; |
| 285 | /* number of sync and async requests queued */ |
| 286 | int queued[2]; |
| 287 | /* number of requests currently allocated */ |
| 288 | int allocated; |
| 289 | /* number of pending metadata requests */ |
| 290 | int meta_pending; |
| 291 | /* fifo list of requests in sort_list */ |
| 292 | struct list_head fifo; |
| 293 | |
| 294 | /* entity representing this queue in the scheduler */ |
| 295 | struct bfq_entity entity; |
| 296 | |
| 297 | /* maximum budget allowed from the feedback mechanism */ |
| 298 | int max_budget; |
| 299 | /* budget expiration (in jiffies) */ |
| 300 | unsigned long budget_timeout; |
| 301 | |
| 302 | /* number of requests on the dispatch list or inside the driver */ |
| 303 | int dispatched; |
| 304 | |
| 305 | /* status flags */ |
| 306 | unsigned long flags; |
| 307 | |
| 308 | /* node for active/idle bfqq list inside parent bfqd */ |
| 309 | struct list_head bfqq_list; |
| 310 | |
| 311 | /* associated @bfq_ttime struct */ |
| 312 | struct bfq_ttime ttime; |
| 313 | |
| 314 | /* bit vector: a 1 for each seeky request in history */ |
| 315 | u32 seek_history; |
| 316 | /* position of the last request enqueued */ |
| 317 | sector_t last_request_pos; |
| 318 | |
| 319 | /* Number of consecutive pairs of request completion and |
| 320 | * arrival, such that the queue becomes idle after the |
| 321 | * completion, but the next request arrives within an idle |
| 322 | * time slice; used only if the queue's IO_bound flag has been |
| 323 | * cleared. |
| 324 | */ |
| 325 | unsigned int requests_within_timer; |
| 326 | |
| 327 | /* pid of the process owning the queue, used for logging purposes */ |
| 328 | pid_t pid; |
| 329 | }; |
| 330 | |
| 331 | /** |
| 332 | * struct bfq_io_cq - per (request_queue, io_context) structure. |
| 333 | */ |
| 334 | struct bfq_io_cq { |
| 335 | /* associated io_cq structure */ |
| 336 | struct io_cq icq; /* must be the first member */ |
| 337 | /* array of two process queues, the sync and the async */ |
| 338 | struct bfq_queue *bfqq[2]; |
| 339 | /* per (request_queue, blkcg) ioprio */ |
| 340 | int ioprio; |
| 341 | }; |
| 342 | |
| 343 | /** |
| 344 | * struct bfq_data - per-device data structure. |
| 345 | * |
| 346 | * All the fields are protected by @lock. |
| 347 | */ |
| 348 | struct bfq_data { |
| 349 | /* device request queue */ |
| 350 | struct request_queue *queue; |
| 351 | /* dispatch queue */ |
| 352 | struct list_head dispatch; |
| 353 | |
| 354 | /* root @bfq_sched_data for the device */ |
| 355 | struct bfq_sched_data sched_data; |
| 356 | |
| 357 | /* |
| 358 | * Number of bfq_queues containing requests (including the |
| 359 | * queue in service, even if it is idling). |
| 360 | */ |
| 361 | int busy_queues; |
| 362 | /* number of queued requests */ |
| 363 | int queued; |
| 364 | /* number of requests dispatched and waiting for completion */ |
| 365 | int rq_in_driver; |
| 366 | |
| 367 | /* |
| 368 | * Maximum number of requests in driver in the last |
| 369 | * @hw_tag_samples completed requests. |
| 370 | */ |
| 371 | int max_rq_in_driver; |
| 372 | /* number of samples used to calculate hw_tag */ |
| 373 | int hw_tag_samples; |
| 374 | /* flag set to one if the driver is showing a queueing behavior */ |
| 375 | int hw_tag; |
| 376 | |
| 377 | /* number of budgets assigned */ |
| 378 | int budgets_assigned; |
| 379 | |
| 380 | /* |
| 381 | * Timer set when idling (waiting) for the next request from |
| 382 | * the queue in service. |
| 383 | */ |
| 384 | struct hrtimer idle_slice_timer; |
| 385 | |
| 386 | /* bfq_queue in service */ |
| 387 | struct bfq_queue *in_service_queue; |
| 388 | /* bfq_io_cq (bic) associated with the @in_service_queue */ |
| 389 | struct bfq_io_cq *in_service_bic; |
| 390 | |
| 391 | /* on-disk position of the last served request */ |
| 392 | sector_t last_position; |
| 393 | |
| 394 | /* beginning of the last budget */ |
| 395 | ktime_t last_budget_start; |
| 396 | /* beginning of the last idle slice */ |
| 397 | ktime_t last_idling_start; |
| 398 | /* number of samples used to calculate @peak_rate */ |
| 399 | int peak_rate_samples; |
| 400 | /* |
| 401 | * Peak read/write rate, observed during the service of a |
| 402 | * budget [BFQ_RATE_SHIFT * sectors/usec]. The value is |
| 403 | * left-shifted by BFQ_RATE_SHIFT to increase precision in |
| 404 | * fixed-point calculations. |
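 *
 * For example, with BFQ_RATE_SHIFT = 16, a measured rate of
 * 3 sectors/usec is stored as 3 << 16 = 196608.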
| 405 | */ |
| 406 | u64 peak_rate; |
| 407 | /* maximum budget allotted to a bfq_queue before rescheduling */ |
| 408 | int bfq_max_budget; |
| 409 | |
| 410 | /* list of all the bfq_queues active on the device */ |
| 411 | struct list_head active_list; |
| 412 | /* list of all the bfq_queues idle on the device */ |
| 413 | struct list_head idle_list; |
| 414 | |
| 415 | /* |
| 416 | * Timeout for async/sync requests; when it fires, requests |
| 417 | * are served in fifo order. |
| 418 | */ |
| 419 | u64 bfq_fifo_expire[2]; |
| 420 | /* weight of backward seeks wrt forward ones */ |
| 421 | unsigned int bfq_back_penalty; |
| 422 | /* maximum allowed backward seek */ |
| 423 | unsigned int bfq_back_max; |
| 424 | /* maximum idling time */ |
| 425 | u32 bfq_slice_idle; |
| 426 | /* last time CLASS_IDLE was served */ |
| 427 | u64 bfq_class_idle_last_service; |
| 428 | |
| 429 | /* user-configured max budget value (0 for auto-tuning) */ |
| 430 | int bfq_user_max_budget; |
| 431 | /* |
| 432 | * Timeout for bfq_queues to consume their budget; used to |
| 433 | * prevent seeky queues from imposing long latencies to |
| 434 | * sequential or quasi-sequential ones (this also implies that |
| 435 | * seeky queues cannot receive guarantees in the service |
| 436 | * domain; after a timeout they are charged for the time they |
| 437 | * have been in service, to preserve fairness among them, but |
| 438 | * without service-domain guarantees). |
| 439 | */ |
| 440 | unsigned int bfq_timeout; |
| 441 | |
| 442 | /* |
| 443 | * Number of consecutive requests that must be issued within |
| 444 | * the idle time slice to re-enable idling for a queue that |
| 445 | * was marked as non-I/O-bound (see the definition of the |
| 446 | * IO_bound flag for further details). |
| 447 | */ |
| 448 | unsigned int bfq_requests_within_timer; |
| 449 | |
| 450 | /* |
| 451 | * Force device idling whenever needed to provide accurate |
| 452 | * service guarantees, without caring about throughput |
| 453 | * issues. CAVEAT: this may even increase latencies, in case |
| 454 | * of useless idling for processes that have stopped doing I/O. |
| 455 | */ |
| 456 | bool strict_guarantees; |
| 457 | |
| 458 | /* fallback dummy bfqq for extreme OOM conditions */ |
| 459 | struct bfq_queue oom_bfqq; |
| 460 | |
| 461 | spinlock_t lock; |
| 462 | |
| 463 | /* |
| 464 | * bic associated with the task issuing current bio for |
| 465 | * merging. This and the next field make it possible to |
| 466 | * perform the bic lookup, needed by bio-merge |
| 467 | * functions, before the scheduler lock is taken, and thus |
| 468 | * avoid taking the request-queue lock while the scheduler |
| 469 | * lock is being held. |
| 470 | */ |
| 471 | struct bfq_io_cq *bio_bic; |
| 472 | /* bfqq associated with the task issuing current bio for merging */ |
| 473 | struct bfq_queue *bio_bfqq; |
| 474 | }; |
| 475 | |
| 476 | enum bfqq_state_flags { |
| 477 | BFQQF_busy = 0, /* has requests or is in service */ |
| 478 | BFQQF_wait_request, /* waiting for a request */ |
| 479 | BFQQF_non_blocking_wait_rq, /* |
| 480 | * waiting for a request |
| 481 | * without idling the device |
| 482 | */ |
| 483 | BFQQF_fifo_expire, /* FIFO checked in this slice */ |
| 484 | BFQQF_idle_window, /* slice idling enabled */ |
| 485 | BFQQF_sync, /* synchronous queue */ |
| 486 | BFQQF_budget_new, /* no completion with this budget */ |
| 487 | BFQQF_IO_bound, /* |
| 488 | * bfqq has timed-out at least once |
| 489 | * having consumed at most 2/10 of |
| 490 | * its budget |
| 491 | */ |
| 492 | }; |
| 493 | |
| 494 | #define BFQ_BFQQ_FNS(name) \ |
| 495 | static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ |
| 496 | { \ |
| 497 | __set_bit(BFQQF_##name, &(bfqq)->flags); \ |
| 498 | } \ |
| 499 | static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ |
| 500 | { \ |
| 501 | __clear_bit(BFQQF_##name, &(bfqq)->flags); \ |
| 502 | } \ |
| 503 | static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ |
| 504 | { \ |
| 505 | return test_bit(BFQQF_##name, &(bfqq)->flags); \ |
| 506 | } |
| 507 | |
| 508 | BFQ_BFQQ_FNS(busy); |
| 509 | BFQ_BFQQ_FNS(wait_request); |
| 510 | BFQ_BFQQ_FNS(non_blocking_wait_rq); |
| 511 | BFQ_BFQQ_FNS(fifo_expire); |
| 512 | BFQ_BFQQ_FNS(idle_window); |
| 513 | BFQ_BFQQ_FNS(sync); |
| 514 | BFQ_BFQQ_FNS(budget_new); |
| 515 | BFQ_BFQQ_FNS(IO_bound); |
| 516 | #undef BFQ_BFQQ_FNS |
| 517 | |
| 518 | /* Logging facilities. */ |
| 519 | #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ |
| 520 | blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) |
| 521 | |
| 522 | #define bfq_log(bfqd, fmt, args...) \ |
| 523 | blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) |
| 524 | |
| 525 | /* Expiration reasons. */ |
| 526 | enum bfqq_expiration { |
| 527 | BFQQE_TOO_IDLE = 0, /* |
| 528 | * queue has been idling for |
| 529 | * too long |
| 530 | */ |
| 531 | BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ |
| 532 | BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ |
| 533 | BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ |
| 534 | BFQQE_PREEMPTED /* preemption in progress */ |
| 535 | }; |
| 536 | |
| 537 | static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); |
| 538 | |
| 539 | static struct bfq_service_tree * |
| 540 | bfq_entity_service_tree(struct bfq_entity *entity) |
| 541 | { |
| 542 | struct bfq_sched_data *sched_data = entity->sched_data; |
| 543 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 544 | unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : |
| 545 | BFQ_DEFAULT_GRP_CLASS - 1; |
| 546 | |
| 547 | return sched_data->service_tree + idx; |
| 548 | } |
| 549 | |
| 550 | static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) |
| 551 | { |
| 552 | return bic->bfqq[is_sync]; |
| 553 | } |
| 554 | |
| 555 | static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, |
| 556 | bool is_sync) |
| 557 | { |
| 558 | bic->bfqq[is_sync] = bfqq; |
| 559 | } |
| 560 | |
| 561 | static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) |
| 562 | { |
| 563 | return bic->icq.q->elevator->elevator_data; |
| 564 | } |
| 565 | |
| 566 | static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); |
| 567 | static void bfq_put_queue(struct bfq_queue *bfqq); |
| 568 | static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
| 569 | struct bio *bio, bool is_sync, |
| 570 | struct bfq_io_cq *bic); |
| 571 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); |
| 572 | |
| 573 | /* |
| 574 | * Array of async queues for all the processes, one queue |
| 575 | * per ioprio value per ioprio_class. |
| 576 | */ |
| 577 | struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; |
| 578 | /* Async queue for the idle class (ioprio is ignored) */ |
| 579 | struct bfq_queue *async_idle_bfqq; |
| 580 | |
| 581 | /* Expiration time of sync (0) and async (1) requests, in ns. */ |
| 582 | static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; |
| 583 | |
| 584 | /* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ |
| 585 | static const int bfq_back_max = 16 * 1024; |
| 586 | |
| 587 | /* Penalty of a backwards seek, used as a multiplier on the seek distance. */ |
| 588 | static const int bfq_back_penalty = 2; |
| 589 | |
| 590 | /* Idling period duration, in ns. */ |
| 591 | static u64 bfq_slice_idle = NSEC_PER_SEC / 125; |
| 592 | |
| 593 | /* Minimum number of assigned budgets for which stats are safe to compute. */ |
| 594 | static const int bfq_stats_min_budgets = 194; |
| 595 | |
| 596 | /* Default maximum budget value, in sectors. */ |
| 597 | static const int bfq_default_max_budget = 16 * 1024; |
| 598 | |
| 599 | /* Default timeout value, in jiffies, approximating CFQ defaults. */ |
| 600 | static const int bfq_timeout = HZ / 8; |
| 601 | |
| 602 | static struct kmem_cache *bfq_pool; |
| 603 | |
| 604 | /* Below this threshold (2 ms, expressed in ns), we consider thinktime immediate. */ |
| 605 | #define BFQ_MIN_TT (2 * NSEC_PER_MSEC) |
| 606 | |
| 607 | /* hw_tag detection: parallel requests threshold and min samples needed. */ |
| 608 | #define BFQ_HW_QUEUE_THRESHOLD 4 |
| 609 | #define BFQ_HW_QUEUE_SAMPLES 32 |
| 610 | |
| 611 | #define BFQQ_SEEK_THR (sector_t)(8 * 100) |
| 612 | #define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) |
| 613 | #define BFQQ_CLOSE_THR (sector_t)(8 * 1024) |
| 614 | #define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) |
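/*
 * Reading of BFQQ_SEEKY() above: a queue is deemed seeky if more
 * than 32/8 = 4 of its last 32 requests were seeky.
 */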
| 615 | |
| 616 | /* Budget feedback step. */ |
| 617 | #define BFQ_BUDGET_STEP 128 |
| 618 | |
| 619 | /* Min samples used for peak rate estimation (for autotuning). */ |
| 620 | #define BFQ_PEAK_RATE_SAMPLES 32 |
| 621 | |
| 622 | /* Shift used for peak rate fixed precision calculations. */ |
| 623 | #define BFQ_RATE_SHIFT 16 |
| 624 | |
| 625 | #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ |
| 626 | { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) |
| 627 | |
| 628 | #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) |
| 629 | #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) |
| 630 | |
| 631 | /** |
| 632 | * icq_to_bic - convert iocontext queue structure to bfq_io_cq. |
| 633 | * @icq: the iocontext queue. |
| 634 | */ |
| 635 | static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) |
| 636 | { |
| 637 | /* bic->icq is the first member, %NULL will convert to %NULL */ |
| 638 | return container_of(icq, struct bfq_io_cq, icq); |
| 639 | } |
| 640 | |
| 641 | /** |
| 642 | * bfq_bic_lookup - search into @ioc a bic associated with @bfqd. |
| 643 | * @bfqd: the lookup key. |
| 644 | * @ioc: the io_context of the process doing I/O. |
| 645 | * @q: the request queue. |
| 646 | */ |
| 647 | static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, |
| 648 | struct io_context *ioc, |
| 649 | struct request_queue *q) |
| 650 | { |
| 651 | if (ioc) { |
| 652 | unsigned long flags; |
| 653 | struct bfq_io_cq *icq; |
| 654 | |
| 655 | spin_lock_irqsave(q->queue_lock, flags); |
| 656 | icq = icq_to_bic(ioc_lookup_icq(ioc, q)); |
| 657 | spin_unlock_irqrestore(q->queue_lock, flags); |
| 658 | |
| 659 | return icq; |
| 660 | } |
| 661 | |
| 662 | return NULL; |
| 663 | } |
| 664 | |
| 665 | /* |
| 666 | * The next two macros are just fake loops for the moment. They will |
| 667 | * become true loops in the cgroups-enabled variant of the code. Such |
| 668 | * a variant, in turn, will be introduced by the next commit. |
| 669 | */ |
| 670 | #define for_each_entity(entity) \ |
| 671 | for (; entity ; entity = NULL) |
| 672 | |
| 673 | #define for_each_entity_safe(entity, parent) \ |
| 674 | for (parent = NULL; entity ; entity = parent) |
| 675 | |
| 676 | static int bfq_update_next_in_service(struct bfq_sched_data *sd) |
| 677 | { |
| 678 | return 0; |
| 679 | } |
| 680 | |
| 681 | static void bfq_check_next_in_service(struct bfq_sched_data *sd, |
| 682 | struct bfq_entity *entity) |
| 683 | { |
| 684 | } |
| 685 | |
| 686 | static void bfq_update_budget(struct bfq_entity *next_in_service) |
| 687 | { |
| 688 | } |
| 689 | |
| 690 | /* |
| 691 | * Shift for timestamp calculations. This actually limits the maximum |
| 692 | * service allowed in one timestamp delta (small shift values increase it), |
| 693 | * the maximum total weight that can be used for the queues in the system |
| 694 | * (big shift values increase it), and the period of virtual time |
| 695 | * wraparounds. |
| 696 | */ |
| 697 | #define WFQ_SERVICE_SHIFT 22 |
| 698 | |
| 699 | /** |
| 700 | * bfq_gt - compare two timestamps. |
| 701 | * @a: first ts. |
| 702 | * @b: second ts. |
| 703 | * |
| 704 | * Return @a > @b, dealing with wrapping correctly. |
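 *
 * For example, if @a has just wrapped around to a small value while
 * @b is still close to the maximum, (s64)(a - b) is a small positive
 * number, so @a is correctly considered greater than @b.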
| 705 | */ |
| 706 | static int bfq_gt(u64 a, u64 b) |
| 707 | { |
| 708 | return (s64)(a - b) > 0; |
| 709 | } |
| 710 | |
| 711 | static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) |
| 712 | { |
| 713 | struct bfq_queue *bfqq = NULL; |
| 714 | |
| 715 | if (!entity->my_sched_data) |
| 716 | bfqq = container_of(entity, struct bfq_queue, entity); |
| 717 | |
| 718 | return bfqq; |
| 719 | } |
| 720 | |
| 721 | |
| 722 | /** |
| 723 | * bfq_delta - map service into the virtual time domain. |
| 724 | * @service: amount of service. |
| 725 | * @weight: scale factor (weight of an entity or weight sum). |
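 *
 * Numeric example: with WFQ_SERVICE_SHIFT = 22, a service of 8
 * sectors for a weight of 100 maps to (8 << 22) / 100 = 335544
 * units of virtual time (the division truncates).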
| 726 | */ |
| 727 | static u64 bfq_delta(unsigned long service, unsigned long weight) |
| 728 | { |
| 729 | u64 d = (u64)service << WFQ_SERVICE_SHIFT; |
| 730 | |
| 731 | do_div(d, weight); |
| 732 | return d; |
| 733 | } |
| 734 | |
| 735 | /** |
| 736 | * bfq_calc_finish - assign the finish time to an entity. |
| 737 | * @entity: the entity to act upon. |
| 738 | * @service: the service to be charged to the entity. |
| 739 | */ |
| 740 | static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) |
| 741 | { |
| 742 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 743 | |
| 744 | entity->finish = entity->start + |
| 745 | bfq_delta(service, entity->weight); |
| 746 | |
| 747 | if (bfqq) { |
| 748 | bfq_log_bfqq(bfqq->bfqd, bfqq, |
| 749 | "calc_finish: serv %lu, w %d", |
| 750 | service, entity->weight); |
| 751 | bfq_log_bfqq(bfqq->bfqd, bfqq, |
| 752 | "calc_finish: start %llu, finish %llu, delta %llu", |
| 753 | entity->start, entity->finish, |
| 754 | bfq_delta(service, entity->weight)); |
| 755 | } |
| 756 | } |
| 757 | |
| 758 | /** |
| 759 | * bfq_entity_of - get an entity from a node. |
| 760 | * @node: the node field of the entity. |
| 761 | * |
| 762 | * Convert a node pointer to the corresponding entity. This is used only |
| 763 | * to simplify the logic of some functions and not as the generic |
| 764 | * conversion mechanism because, e.g., in the tree walking functions, |
| 765 | * the check for a %NULL value would be redundant. |
| 766 | */ |
| 767 | static struct bfq_entity *bfq_entity_of(struct rb_node *node) |
| 768 | { |
| 769 | struct bfq_entity *entity = NULL; |
| 770 | |
| 771 | if (node) |
| 772 | entity = rb_entry(node, struct bfq_entity, rb_node); |
| 773 | |
| 774 | return entity; |
| 775 | } |
| 776 | |
| 777 | /** |
| 778 | * bfq_extract - remove an entity from a tree. |
| 779 | * @root: the tree root. |
| 780 | * @entity: the entity to remove. |
| 781 | */ |
| 782 | static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) |
| 783 | { |
| 784 | entity->tree = NULL; |
| 785 | rb_erase(&entity->rb_node, root); |
| 786 | } |
| 787 | |
| 788 | /** |
| 789 | * bfq_idle_extract - extract an entity from the idle tree. |
| 790 | * @st: the service tree of the owning @entity. |
| 791 | * @entity: the entity being removed. |
| 792 | */ |
| 793 | static void bfq_idle_extract(struct bfq_service_tree *st, |
| 794 | struct bfq_entity *entity) |
| 795 | { |
| 796 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 797 | struct rb_node *next; |
| 798 | |
| 799 | if (entity == st->first_idle) { |
| 800 | next = rb_next(&entity->rb_node); |
| 801 | st->first_idle = bfq_entity_of(next); |
| 802 | } |
| 803 | |
| 804 | if (entity == st->last_idle) { |
| 805 | next = rb_prev(&entity->rb_node); |
| 806 | st->last_idle = bfq_entity_of(next); |
| 807 | } |
| 808 | |
| 809 | bfq_extract(&st->idle, entity); |
| 810 | |
| 811 | if (bfqq) |
| 812 | list_del(&bfqq->bfqq_list); |
| 813 | } |
| 814 | |
| 815 | /** |
| 816 | * bfq_insert - generic tree insertion. |
| 817 | * @root: tree root. |
| 818 | * @entity: entity to insert. |
| 819 | * |
| 820 | * This is used for the idle and the active tree, since they are both |
| 821 | * ordered by finish time. |
| 822 | */ |
| 823 | static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) |
| 824 | { |
| 825 | struct bfq_entity *entry; |
| 826 | struct rb_node **node = &root->rb_node; |
| 827 | struct rb_node *parent = NULL; |
| 828 | |
| 829 | while (*node) { |
| 830 | parent = *node; |
| 831 | entry = rb_entry(parent, struct bfq_entity, rb_node); |
| 832 | |
| 833 | if (bfq_gt(entry->finish, entity->finish)) |
| 834 | node = &parent->rb_left; |
| 835 | else |
| 836 | node = &parent->rb_right; |
| 837 | } |
| 838 | |
| 839 | rb_link_node(&entity->rb_node, parent, node); |
| 840 | rb_insert_color(&entity->rb_node, root); |
| 841 | |
| 842 | entity->tree = root; |
| 843 | } |
| 844 | |
| 845 | /** |
| 846 | * bfq_update_min - update the min_start field of an entity. |
| 847 | * @entity: the entity to update. |
| 848 | * @node: one of its children. |
| 849 | * |
| 850 | * This function is called when @entity may store an invalid value for |
| 851 | * min_start due to updates to the active tree. The function assumes |
| 852 | * that the subtree rooted at @node (which may be its left or its right |
| 853 | * child) has a valid min_start value. |
| 854 | */ |
| 855 | static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) |
| 856 | { |
| 857 | struct bfq_entity *child; |
| 858 | |
| 859 | if (node) { |
| 860 | child = rb_entry(node, struct bfq_entity, rb_node); |
| 861 | if (bfq_gt(entity->min_start, child->min_start)) |
| 862 | entity->min_start = child->min_start; |
| 863 | } |
| 864 | } |
| 865 | |
| 866 | /** |
| 867 | * bfq_update_active_node - recalculate min_start. |
| 868 | * @node: the node to update. |
| 869 | * |
| 870 | * @node may have changed position or one of its children may have moved; |
| 871 | * this function updates its min_start value. The left and right subtrees |
| 872 | * are assumed to hold a correct min_start value. |
| 873 | */ |
| 874 | static void bfq_update_active_node(struct rb_node *node) |
| 875 | { |
| 876 | struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); |
| 877 | |
| 878 | entity->min_start = entity->start; |
| 879 | bfq_update_min(entity, node->rb_right); |
| 880 | bfq_update_min(entity, node->rb_left); |
| 881 | } |
| 882 | |
| 883 | /** |
| 884 | * bfq_update_active_tree - update min_start for the whole active tree. |
| 885 | * @node: the starting node. |
| 886 | * |
| 887 | * @node must be the deepest modified node after an update. This function |
| 888 | * updates its min_start using the values held by its children, assuming |
| 889 | * that they did not change, and then updates all the nodes that may have |
| 890 | * changed in the path to the root. The only nodes that may have changed |
| 891 | * are the ones in the path or their siblings. |
| 892 | */ |
| 893 | static void bfq_update_active_tree(struct rb_node *node) |
| 894 | { |
| 895 | struct rb_node *parent; |
| 896 | |
| 897 | up: |
| 898 | bfq_update_active_node(node); |
| 899 | |
| 900 | parent = rb_parent(node); |
| 901 | if (!parent) |
| 902 | return; |
| 903 | |
| 904 | if (node == parent->rb_left && parent->rb_right) |
| 905 | bfq_update_active_node(parent->rb_right); |
| 906 | else if (parent->rb_left) |
| 907 | bfq_update_active_node(parent->rb_left); |
| 908 | |
| 909 | node = parent; |
| 910 | goto up; |
| 911 | } |
| 912 | |
| 913 | /** |
| 914 | * bfq_active_insert - insert an entity in the active tree of its |
| 915 | * group/device. |
| 916 | * @st: the service tree of the entity. |
| 917 | * @entity: the entity being inserted. |
| 918 | * |
| 919 | * The active tree is ordered by finish time, but an extra key is kept |
| 920 | * in each node, containing the minimum value for the start times of |
| 921 | * its children (and the node itself), so it's possible to search for |
| 922 | * the eligible node with the lowest finish time in logarithmic time. |
| 923 | */ |
| 924 | static void bfq_active_insert(struct bfq_service_tree *st, |
| 925 | struct bfq_entity *entity) |
| 926 | { |
| 927 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 928 | struct rb_node *node = &entity->rb_node; |
| 929 | |
| 930 | bfq_insert(&st->active, entity); |
| 931 | |
| 932 | if (node->rb_left) |
| 933 | node = node->rb_left; |
| 934 | else if (node->rb_right) |
| 935 | node = node->rb_right; |
| 936 | |
| 937 | bfq_update_active_tree(node); |
| 938 | |
| 939 | if (bfqq) |
| 940 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); |
| 941 | } |
| 942 | |
| 943 | /** |
| 944 | * bfq_ioprio_to_weight - calc a weight from an ioprio. |
| 945 | * @ioprio: the ioprio value to convert. |
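 *
 * For example, with IOPRIO_BE_NR = 8 and BFQ_WEIGHT_CONVERSION_COEFF
 * = 10, ioprio 0 maps to weight 80, the default ioprio 4 to weight
 * 40, and ioprio 7 to weight 10.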
| 946 | */ |
| 947 | static unsigned short bfq_ioprio_to_weight(int ioprio) |
| 948 | { |
| 949 | return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; |
| 950 | } |
| 951 | |
| 952 | /** |
| 953 | * bfq_weight_to_ioprio - calc an ioprio from a weight. |
| 954 | * @weight: the weight value to convert. |
| 955 | * |
| 956 | * To preserve as much as possible the old only-ioprio user interface, |
| 957 | * 0 is used as an escape ioprio value for weights (numerically) equal to or |
| 958 | * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. |
| 959 | */ |
| 960 | static unsigned short bfq_weight_to_ioprio(int weight) |
| 961 | { |
| 962 | return max_t(int, 0, |
| 963 | IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); |
| 964 | } |
| 965 | |
| 966 | static void bfq_get_entity(struct bfq_entity *entity) |
| 967 | { |
| 968 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 969 | |
| 970 | if (bfqq) { |
| 971 | bfqq->ref++; |
| 972 | bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", |
| 973 | bfqq, bfqq->ref); |
| 974 | } |
| 975 | } |
| 976 | |
| 977 | /** |
| 978 | * bfq_find_deepest - find the deepest node that an extraction can modify. |
| 979 | * @node: the node being removed. |
| 980 | * |
| 981 | * Do the first step of an extraction in an rb tree, looking for the |
| 982 | * node that will replace @node, and returning the deepest node that |
| 983 | * the following modifications to the tree can touch. If @node is the |
| 984 | * last node in the tree return %NULL. |
| 985 | */ |
| 986 | static struct rb_node *bfq_find_deepest(struct rb_node *node) |
| 987 | { |
| 988 | struct rb_node *deepest; |
| 989 | |
| 990 | if (!node->rb_right && !node->rb_left) |
| 991 | deepest = rb_parent(node); |
| 992 | else if (!node->rb_right) |
| 993 | deepest = node->rb_left; |
| 994 | else if (!node->rb_left) |
| 995 | deepest = node->rb_right; |
| 996 | else { |
| 997 | deepest = rb_next(node); |
| 998 | if (deepest->rb_right) |
| 999 | deepest = deepest->rb_right; |
| 1000 | else if (rb_parent(deepest) != node) |
| 1001 | deepest = rb_parent(deepest); |
| 1002 | } |
| 1003 | |
| 1004 | return deepest; |
| 1005 | } |
| 1006 | |
| 1007 | /** |
| 1008 | * bfq_active_extract - remove an entity from the active tree. |
| 1009 | * @st: the service_tree containing the tree. |
| 1010 | * @entity: the entity being removed. |
| 1011 | */ |
| 1012 | static void bfq_active_extract(struct bfq_service_tree *st, |
| 1013 | struct bfq_entity *entity) |
| 1014 | { |
| 1015 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 1016 | struct rb_node *node; |
| 1017 | |
| 1018 | node = bfq_find_deepest(&entity->rb_node); |
| 1019 | bfq_extract(&st->active, entity); |
| 1020 | |
| 1021 | if (node) |
| 1022 | bfq_update_active_tree(node); |
| 1023 | |
| 1024 | if (bfqq) |
| 1025 | list_del(&bfqq->bfqq_list); |
| 1026 | } |
| 1027 | |
| 1028 | /** |
| 1029 | * bfq_idle_insert - insert an entity into the idle tree. |
| 1030 | * @st: the service tree containing the tree. |
| 1031 | * @entity: the entity to insert. |
| 1032 | */ |
| 1033 | static void bfq_idle_insert(struct bfq_service_tree *st, |
| 1034 | struct bfq_entity *entity) |
| 1035 | { |
| 1036 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 1037 | struct bfq_entity *first_idle = st->first_idle; |
| 1038 | struct bfq_entity *last_idle = st->last_idle; |
| 1039 | |
| 1040 | if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) |
| 1041 | st->first_idle = entity; |
| 1042 | if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) |
| 1043 | st->last_idle = entity; |
| 1044 | |
| 1045 | bfq_insert(&st->idle, entity); |
| 1046 | |
| 1047 | if (bfqq) |
| 1048 | list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); |
| 1049 | } |
| 1050 | |
| 1051 | /** |
| 1052 | * bfq_forget_entity - do not consider entity any longer for scheduling |
| 1053 | * @st: the service tree. |
| 1054 | * @entity: the entity being removed. |
| 1055 | * @is_in_service: true if entity is currently the in-service entity. |
| 1056 | * |
| 1057 | * Forget everything about @entity. In addition, if entity represents |
| 1058 | * a queue, and the latter is not in service, then release the service |
| 1059 | * reference to the queue (the one taken through bfq_get_entity). In |
| 1060 | * fact, in this case, there is really no more service reference to |
| 1061 | * the queue, as the latter is also outside any service tree. If, |
| 1062 | * instead, the queue is in service, then __bfq_bfqd_reset_in_service |
| 1063 | * will take care of putting the reference when the queue finally |
| 1064 | * stops being served. |
| 1065 | */ |
| 1066 | static void bfq_forget_entity(struct bfq_service_tree *st, |
| 1067 | struct bfq_entity *entity, |
| 1068 | bool is_in_service) |
| 1069 | { |
| 1070 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 1071 | |
| 1072 | entity->on_st = 0; |
| 1073 | st->wsum -= entity->weight; |
| 1074 | if (bfqq && !is_in_service) |
| 1075 | bfq_put_queue(bfqq); |
| 1076 | } |
| 1077 | |
| 1078 | /** |
| 1079 | * bfq_put_idle_entity - release the idle tree ref of an entity. |
| 1080 | * @st: service tree for the entity. |
| 1081 | * @entity: the entity being released. |
| 1082 | */ |
| 1083 | static void bfq_put_idle_entity(struct bfq_service_tree *st, |
| 1084 | struct bfq_entity *entity) |
| 1085 | { |
| 1086 | bfq_idle_extract(st, entity); |
| 1087 | bfq_forget_entity(st, entity, |
| 1088 | entity == entity->sched_data->in_service_entity); |
| 1089 | } |
| 1090 | |
| 1091 | /** |
| 1092 | * bfq_forget_idle - update the idle tree if necessary. |
| 1093 | * @st: the service tree to act upon. |
| 1094 | * |
| 1095 | * To preserve the global O(log N) complexity we only remove one entry here; |
| 1096 | * as the idle tree will not grow indefinitely this can be done safely. |
| 1097 | */ |
| 1098 | static void bfq_forget_idle(struct bfq_service_tree *st) |
| 1099 | { |
| 1100 | struct bfq_entity *first_idle = st->first_idle; |
| 1101 | struct bfq_entity *last_idle = st->last_idle; |
| 1102 | |
| 1103 | if (RB_EMPTY_ROOT(&st->active) && last_idle && |
| 1104 | !bfq_gt(last_idle->finish, st->vtime)) { |
| 1105 | /* |
| 1106 | * Forget the whole idle tree, increasing the vtime past |
| 1107 | * the last finish time of idle entities. |
| 1108 | */ |
| 1109 | st->vtime = last_idle->finish; |
| 1110 | } |
| 1111 | |
| 1112 | if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) |
| 1113 | bfq_put_idle_entity(st, first_idle); |
| 1114 | } |
| 1115 | |
| 1116 | static struct bfq_service_tree * |
| 1117 | __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, |
| 1118 | struct bfq_entity *entity) |
| 1119 | { |
| 1120 | struct bfq_service_tree *new_st = old_st; |
| 1121 | |
| 1122 | if (entity->prio_changed) { |
| 1123 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 1124 | unsigned short prev_weight, new_weight; |
| 1125 | struct bfq_data *bfqd = NULL; |
| 1126 | |
| 1127 | if (bfqq) |
| 1128 | bfqd = bfqq->bfqd; |
| 1129 | |
| 1130 | old_st->wsum -= entity->weight; |
| 1131 | |
| 1132 | if (entity->new_weight != entity->orig_weight) { |
| 1133 | if (entity->new_weight < BFQ_MIN_WEIGHT || |
| 1134 | entity->new_weight > BFQ_MAX_WEIGHT) { |
| 1135 | pr_crit("update_weight_prio: new_weight %d\n", |
| 1136 | entity->new_weight); |
| 1137 | if (entity->new_weight < BFQ_MIN_WEIGHT) |
| 1138 | entity->new_weight = BFQ_MIN_WEIGHT; |
| 1139 | else |
| 1140 | entity->new_weight = BFQ_MAX_WEIGHT; |
| 1141 | } |
| 1142 | entity->orig_weight = entity->new_weight; |
| 1143 | if (bfqq) |
| 1144 | bfqq->ioprio = |
| 1145 | bfq_weight_to_ioprio(entity->orig_weight); |
| 1146 | } |
| 1147 | |
| 1148 | if (bfqq) |
| 1149 | bfqq->ioprio_class = bfqq->new_ioprio_class; |
| 1150 | entity->prio_changed = 0; |
| 1151 | |
| 1152 | /* |
| 1153 | * NOTE: here we may be changing the weight too early; |
| 1154 | * this will cause unfairness. The correct approach |
| 1155 | * would have required additional complexity to defer |
| 1156 | * weight changes to the proper time instants (i.e., |
| 1157 | * when entity->finish <= old_st->vtime). |
| 1158 | */ |
| 1159 | new_st = bfq_entity_service_tree(entity); |
| 1160 | |
| 1161 | prev_weight = entity->weight; |
| 1162 | new_weight = entity->orig_weight; |
| 1163 | entity->weight = new_weight; |
| 1164 | |
| 1165 | new_st->wsum += entity->weight; |
| 1166 | |
| 1167 | if (new_st != old_st) |
| 1168 | entity->start = new_st->vtime; |
| 1169 | } |
| 1170 | |
| 1171 | return new_st; |
| 1172 | } |
| 1173 | |
| 1174 | /** |
| 1175 | * bfq_bfqq_served - update the scheduler status after selection for |
| 1176 | * service. |
| 1177 | * @bfqq: the queue being served. |
| 1178 | * @served: amount of service received, in sectors. |
| 1179 | * |
| 1180 | * NOTE: this can be optimized, as the timestamps of upper level entities |
| 1181 | * are synchronized every time a new bfqq is selected for service. For |
| 1182 | * now, we keep it this way to better check consistency. |
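 *
 * Numeric example: if st->wsum is 400 and 8 sectors are served, the
 * service-tree virtual time advances by bfq_delta(8, 400) =
 * (8 << 22) / 400 = 83886 units.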
| 1183 | */ |
| 1184 | static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) |
| 1185 | { |
| 1186 | struct bfq_entity *entity = &bfqq->entity; |
| 1187 | struct bfq_service_tree *st; |
| 1188 | |
| 1189 | for_each_entity(entity) { |
| 1190 | st = bfq_entity_service_tree(entity); |
| 1191 | |
| 1192 | entity->service += served; |
| 1193 | |
| 1194 | st->vtime += bfq_delta(served, st->wsum); |
| 1195 | bfq_forget_idle(st); |
| 1196 | } |
| 1197 | bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); |
| 1198 | } |
| 1199 | |
| 1200 | /** |
| 1201 | * bfq_bfqq_charge_full_budget - set the service to the entity budget. |
| 1202 | * @bfqq: the queue that needs a service update. |
| 1203 | * |
| 1204 | * When it's not possible to be fair in the service domain, because |
| 1205 | * a queue is not consuming its budget fast enough (the meaning of |
| 1206 | * fast depends on the timeout parameter), we charge it a full |
| 1207 | * budget. In this way we should obtain a sort of time-domain |
| 1208 | * fairness among all the seeky/slow queues. |
| 1209 | */ |
| 1210 | static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) |
| 1211 | { |
| 1212 | struct bfq_entity *entity = &bfqq->entity; |
| 1213 | |
| 1214 | bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); |
| 1215 | |
| 1216 | bfq_bfqq_served(bfqq, entity->budget - entity->service); |
| 1217 | } |
| 1218 | |
| 1219 | /** |
| 1220 | * __bfq_activate_entity - activate an entity. |
| 1221 | * @entity: the entity being activated. |
| 1222 | * @non_blocking_wait_rq: true if this entity was waiting for a request |
| 1223 | * |
| 1224 | * Called whenever an entity is activated, i.e., it is not active and one |
| 1225 | * of its children receives a new request, or has to be reactivated due to |
| 1226 | * budget exhaustion. It uses the current budget of the entity (and the |
| 1227 | * service received, if @entity is active) to calculate its |
| 1228 | * timestamps. |
| 1229 | */ |
| 1230 | static void __bfq_activate_entity(struct bfq_entity *entity, |
| 1231 | bool non_blocking_wait_rq) |
| 1232 | { |
| 1233 | struct bfq_sched_data *sd = entity->sched_data; |
| 1234 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
| 1235 | bool backshifted = false; |
| 1236 | |
| 1237 | if (entity == sd->in_service_entity) { |
| 1238 | /* |
| 1239 | * If we are requeueing the current entity we have |
| 1240 | * to take care not to charge it for service it has |
| 1241 | * not received. |
| 1242 | */ |
| 1243 | bfq_calc_finish(entity, entity->service); |
| 1244 | entity->start = entity->finish; |
| 1245 | sd->in_service_entity = NULL; |
| 1246 | } else if (entity->tree == &st->active) { |
| 1247 | /* |
| 1248 | * Requeueing an entity due to a change of some |
| 1249 | * next_in_service entity below it. We reuse the |
| 1250 | * old start time. |
| 1251 | */ |
| 1252 | bfq_active_extract(st, entity); |
| 1253 | } else { |
| 1254 | unsigned long long min_vstart; |
| 1255 | |
| 1256 | /* See comments on bfq_bfqq_update_budg_for_activation */ |
| 1257 | if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { |
| 1258 | backshifted = true; |
| 1259 | min_vstart = entity->finish; |
| 1260 | } else |
| 1261 | min_vstart = st->vtime; |
| 1262 | |
| 1263 | if (entity->tree == &st->idle) { |
| 1264 | /* |
| 1265 | * Must be on the idle tree; bfq_idle_extract() will |
| 1266 | * check for that. |
| 1267 | */ |
| 1268 | bfq_idle_extract(st, entity); |
| 1269 | entity->start = bfq_gt(min_vstart, entity->finish) ? |
| 1270 | min_vstart : entity->finish; |
| 1271 | } else { |
| 1272 | /* |
| 1273 | * The finish time of the entity may be invalid, and |
| 1274 | * it is in the past for sure, otherwise the queue |
| 1275 | * would have been on the idle tree. |
| 1276 | */ |
| 1277 | entity->start = min_vstart; |
| 1278 | st->wsum += entity->weight; |
| 1279 | /* |
| 1280 | * entity is about to be inserted into a service tree, |
| 1281 | * and then set in service: get a reference to make |
| 1282 | * sure entity does not disappear until it is no |
| 1283 | * longer in service or scheduled for service. |
| 1284 | */ |
| 1285 | bfq_get_entity(entity); |
| 1286 | |
| 1287 | entity->on_st = 1; |
| 1288 | } |
| 1289 | } |
| 1290 | |
| 1291 | st = __bfq_entity_update_weight_prio(st, entity); |
| 1292 | bfq_calc_finish(entity, entity->budget); |
| 1293 | |
| 1294 | /* |
| 1295 | * If some queues enjoy backshifting for a while, then their |
| 1296 | * (virtual) finish timestamps may happen to become lower and |
| 1297 | * lower than the system virtual time. In particular, if |
| 1298 | * these queues often happen to be idle for short time |
| 1299 | * periods, and during such time periods other queues with |
| 1300 | * higher timestamps happen to be busy, then the backshifted |
| 1301 | * timestamps of the former queues can become much lower than |
| 1302 | * the system virtual time. In fact, to serve the queues with |
| 1303 | * higher timestamps while the ones with lower timestamps are |
| 1304 | * idle, the system virtual time may be pushed-up to much |
| 1305 | * higher values than the finish timestamps of the idle |
| 1306 | * queues. As a consequence, the finish timestamps of all new |
| 1307 | * or newly activated queues may end up being much larger than |
| 1308 | * those of lucky queues with backshifted timestamps. The |
| 1309 | * latter queues may then monopolize the device for a lot of |
| 1310 | * time. This would simply break service guarantees. |
| 1311 | * |
| 1312 | * To reduce this problem, push up a little bit the |
| 1313 | * backshifted timestamps of the queue associated with this |
| 1314 | * entity (only a queue can happen to have the backshifted |
| 1315 | * flag set): just enough to let the finish timestamp of the |
| 1316 | * queue be equal to the current value of the system virtual |
| 1317 | * time. This may introduce a little unfairness among queues |
| 1318 | * with backshifted timestamps, but it does not break |
| 1319 | * worst-case fairness guarantees. |
| 1320 | */ |
| 1321 | if (backshifted && bfq_gt(st->vtime, entity->finish)) { |
| 1322 | unsigned long delta = st->vtime - entity->finish; |
| 1323 | |
| 1324 | entity->start += delta; |
| 1325 | entity->finish += delta; |
| 1326 | } |
| 1327 | |
| 1328 | bfq_active_insert(st, entity); |
| 1329 | } |
| 1330 | |
| 1331 | /** |
| 1332 | * bfq_activate_entity - activate an entity and its ancestors if necessary. |
| 1333 | * @entity: the entity to activate. |
| 1334 | * @non_blocking_wait_rq: true if this entity was waiting for a request |
| 1335 | * |
| 1336 | * Activate @entity and all the entities on the path from it to the root. |
| 1337 | */ |
| 1338 | static void bfq_activate_entity(struct bfq_entity *entity, |
| 1339 | bool non_blocking_wait_rq) |
| 1340 | { |
| 1341 | struct bfq_sched_data *sd; |
| 1342 | |
| 1343 | for_each_entity(entity) { |
| 1344 | __bfq_activate_entity(entity, non_blocking_wait_rq); |
| 1345 | |
| 1346 | sd = entity->sched_data; |
| 1347 | if (!bfq_update_next_in_service(sd)) |
| 1348 | /* |
| 1349 | * No need to propagate the activation to the |
| 1350 | * upper entities, as they will be updated when |
| 1351 | * the in-service entity is rescheduled. |
| 1352 | */ |
| 1353 | break; |
| 1354 | } |
| 1355 | } |
| 1356 | |
| 1357 | /** |
| 1358 | * __bfq_deactivate_entity - deactivate an entity from its service tree. |
| 1359 | * @entity: the entity to deactivate. |
| 1360 | * @requeue: if false, the entity will not be put into the idle tree. |
| 1361 | * |
| 1362 | * Deactivate an entity, independently from its previous state. If the |
| 1363 | * entity was not on a service tree just return, otherwise if it is on |
| 1364 | * any scheduler tree, extract it from that tree, and if necessary |
| 1365 | * and if the caller specified @requeue, put it on the idle tree. |
| 1366 | * |
| 1367 | * Return %1 if the caller should update the entity hierarchy, i.e., |
| 1368 | * if the entity was in service or if it was the next_in_service for |
| 1369 | * its sched_data; return %0 otherwise. |
| 1370 | */ |
| 1371 | static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
| 1372 | { |
| 1373 | struct bfq_sched_data *sd = entity->sched_data; |
| 1374 | struct bfq_service_tree *st = bfq_entity_service_tree(entity); |
| 1375 | int is_in_service = entity == sd->in_service_entity; |
| 1376 | int ret = 0; |
| 1377 | |
| 1378 | if (!entity->on_st) |
| 1379 | return 0; |
| 1380 | |
| 1381 | if (is_in_service) { |
| 1382 | bfq_calc_finish(entity, entity->service); |
| 1383 | sd->in_service_entity = NULL; |
| 1384 | } else if (entity->tree == &st->active) |
| 1385 | bfq_active_extract(st, entity); |
| 1386 | else if (entity->tree == &st->idle) |
| 1387 | bfq_idle_extract(st, entity); |
| 1388 | |
| 1389 | if (is_in_service || sd->next_in_service == entity) |
| 1390 | ret = bfq_update_next_in_service(sd); |
| 1391 | |
| 1392 | if (!requeue || !bfq_gt(entity->finish, st->vtime)) |
| 1393 | bfq_forget_entity(st, entity, is_in_service); |
| 1394 | else |
| 1395 | bfq_idle_insert(st, entity); |
| 1396 | |
| 1397 | return ret; |
| 1398 | } |
| 1399 | |
| 1400 | /** |
| 1401 | * bfq_deactivate_entity - deactivate an entity. |
| 1402 | * @entity: the entity to deactivate. |
| 1403 | * @requeue: true if the entity can be put on the idle tree |
| 1404 | */ |
| 1405 | static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) |
| 1406 | { |
| 1407 | struct bfq_sched_data *sd; |
| 1408 | struct bfq_entity *parent = NULL; |
| 1409 | |
| 1410 | for_each_entity_safe(entity, parent) { |
| 1411 | sd = entity->sched_data; |
| 1412 | |
| 1413 | if (!__bfq_deactivate_entity(entity, requeue)) |
| 1414 | /* |
| 1415 | * The parent entity is still backlogged, and |
| 1416 | * we don't need to update it as it is still |
| 1417 | * in service. |
| 1418 | */ |
| 1419 | break; |
| 1420 | |
| 1421 | if (sd->next_in_service) |
| 1422 | /* |
| 1423 | * The parent entity is still backlogged and |
| 1424 | * the budgets on the path towards the root |
| 1425 | * need to be updated. |
| 1426 | */ |
| 1427 | goto update; |
| 1428 | |
| 1429 | /* |
| 1430 | * If we get here, then the parent is no longer backlogged and |
| 1431 | * we want to propagate the deactivation upwards. |
| 1432 | */ |
| 1433 | requeue = 1; |
| 1434 | } |
| 1435 | |
| 1436 | return; |
| 1437 | |
| 1438 | update: |
| 1439 | entity = parent; |
| 1440 | for_each_entity(entity) { |
| 1441 | __bfq_activate_entity(entity, false); |
| 1442 | |
| 1443 | sd = entity->sched_data; |
| 1444 | if (!bfq_update_next_in_service(sd)) |
| 1445 | break; |
| 1446 | } |
| 1447 | } |
| 1448 | |
| 1449 | /** |
| 1450 | * bfq_update_vtime - update vtime if necessary. |
| 1451 | * @st: the service tree to act upon. |
| 1452 | * |
| 1453 | * If necessary update the service tree vtime to have at least one |
| 1454 | * eligible entity, skipping to its start time. Assumes that the |
| 1455 | * active tree of the device is not empty. |
| 1456 | * |
| 1457 | * NOTE: this hierarchical implementation updates vtimes quite often; |
| 1458 | * we may end up with reactivated processes getting timestamps after a |
| 1459 | * vtime skip done because we needed a ->first_active entity on some |
| 1460 | * intermediate node. |
| 1461 | */ |
| 1462 | static void bfq_update_vtime(struct bfq_service_tree *st) |
| 1463 | { |
| 1464 | struct bfq_entity *entry; |
| 1465 | struct rb_node *node = st->active.rb_node; |
| 1466 | |
| 1467 | entry = rb_entry(node, struct bfq_entity, rb_node); |
| 1468 | if (bfq_gt(entry->min_start, st->vtime)) { |
| 1469 | st->vtime = entry->min_start; |
| 1470 | bfq_forget_idle(st); |
| 1471 | } |
| 1472 | } |
| 1473 | |
| 1474 | /** |
| 1475 | * bfq_first_active_entity - find the eligible entity with |
| 1476 | * the smallest finish time |
| 1477 | * @st: the service tree to select from. |
| 1478 | * |
| 1479 | * This function searches for the first schedulable entity, starting from |
| 1480 | * the root of the tree and going left whenever the left subtree contains |
| 1481 | * at least one eligible (start <= vtime) entity. The path on the right is |
| 1482 | * followed only if a) the left subtree contains no eligible entities and |
| 1483 | * b) no eligible entity has been found yet. |
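|      | * The min_start value cached in each node, i.e., the minimum start time |
|      | * over the node's whole subtree, allows subtrees that cannot contain any |
|      | * eligible entity to be skipped entirely. |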
| 1484 | */ |
| 1485 | static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) |
| 1486 | { |
| 1487 | struct bfq_entity *entry, *first = NULL; |
| 1488 | struct rb_node *node = st->active.rb_node; |
| 1489 | |
| 1490 | while (node) { |
| 1491 | entry = rb_entry(node, struct bfq_entity, rb_node); |
| 1492 | left: |
| 1493 | if (!bfq_gt(entry->start, st->vtime)) |
| 1494 | first = entry; |
| 1495 | |
| 1496 | if (node->rb_left) { |
| 1497 | entry = rb_entry(node->rb_left, |
| 1498 | struct bfq_entity, rb_node); |
| 1499 | if (!bfq_gt(entry->min_start, st->vtime)) { |
| 1500 | node = node->rb_left; |
| 1501 | goto left; |
| 1502 | } |
| 1503 | } |
| 1504 | if (first) |
| 1505 | break; |
| 1506 | node = node->rb_right; |
| 1507 | } |
| 1508 | |
| 1509 | return first; |
| 1510 | } |
| 1511 | |
| 1512 | /** |
| 1513 | * __bfq_lookup_next_entity - return the first eligible entity in @st. |
| 1514 | * @st: the service tree. |
| 1515 | * |
| 1516 | * Update the virtual time in @st and return the first eligible entity |
| 1517 | * it contains. |
| 1518 | */ |
| 1519 | static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, |
| 1520 | bool force) |
| 1521 | { |
| 1522 | struct bfq_entity *entity, *new_next_in_service = NULL; |
| 1523 | |
| 1524 | if (RB_EMPTY_ROOT(&st->active)) |
| 1525 | return NULL; |
| 1526 | |
| 1527 | bfq_update_vtime(st); |
| 1528 | entity = bfq_first_active_entity(st); |
| 1529 | |
| 1530 | /* |
| 1531 | * If the chosen entity does not match the sched_data's |
| 1532 | * next_in_service and we are forcibly serving the IDLE priority |
| 1533 | * class tree, bubble the budget update up the hierarchy. |
| 1534 | */ |
| 1535 | if (unlikely(force && entity != entity->sched_data->next_in_service)) { |
| 1536 | new_next_in_service = entity; |
| 1537 | for_each_entity(new_next_in_service) |
| 1538 | bfq_update_budget(new_next_in_service); |
| 1539 | } |
| 1540 | |
| 1541 | return entity; |
| 1542 | } |
| 1543 | |
| 1544 | /** |
| 1545 | * bfq_lookup_next_entity - return the first eligible entity in @sd. |
| 1546 | * @sd: the sched_data. |
| 1547 | * @extract: if true the returned entity will be also extracted from @sd. |
| 1548 | * |
| 1549 | * NOTE: since we cache the next_in_service entity at each level of the |
| 1550 | * hierarchy, the complexity of the lookup can be decreased with |
| 1551 | * absolutely no effort by just returning the cached next_in_service |
| 1552 | * value; we prefer to do full lookups to test the consistency of the |
| 1553 | * data structures. |
| 1554 | */ |
| 1555 | static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, |
| 1556 | int extract, |
| 1557 | struct bfq_data *bfqd) |
| 1558 | { |
| 1559 | struct bfq_service_tree *st = sd->service_tree; |
| 1560 | struct bfq_entity *entity; |
| 1561 | int i = 0; |
| 1562 | |
| 1563 | /* |
| 1564 | * Choose from idle class, if needed to guarantee a minimum |
| 1565 | * bandwidth to this class. This should also mitigate |
| 1566 | * priority-inversion problems in case a low priority task is |
| 1567 | * holding file system resources. |
| 1568 | */ |
| 1569 | if (bfqd && |
| 1570 | jiffies - bfqd->bfq_class_idle_last_service > |
| 1571 | BFQ_CL_IDLE_TIMEOUT) { |
| 1572 | entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, |
| 1573 | true); |
| 1574 | if (entity) { |
| 1575 | i = BFQ_IOPRIO_CLASSES - 1; |
| 1576 | bfqd->bfq_class_idle_last_service = jiffies; |
| 1577 | sd->next_in_service = entity; |
| 1578 | } |
| 1579 | } |
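|      | /* |
|      |  * Scan the per-class service trees in priority order (RT, then BE, |
|      |  * then IDLE), starting directly from the idle class if it has just |
|      |  * been selected above. |
|      |  */ |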
| 1580 | for (; i < BFQ_IOPRIO_CLASSES; i++) { |
| 1581 | entity = __bfq_lookup_next_entity(st + i, false); |
| 1582 | if (entity) { |
| 1583 | if (extract) { |
| 1584 | bfq_check_next_in_service(sd, entity); |
| 1585 | bfq_active_extract(st + i, entity); |
| 1586 | sd->in_service_entity = entity; |
| 1587 | sd->next_in_service = NULL; |
| 1588 | } |
| 1589 | break; |
| 1590 | } |
| 1591 | } |
| 1592 | |
| 1593 | return entity; |
| 1594 | } |
| 1595 | |
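|      | /* |
|      |  * Tell whether, on the root sched_data, the entity that would be |
|      |  * served next differs from the entity currently in service. This is |
|      |  * only a necessary condition for a preemption to be useful, not a |
|      |  * sufficient one (see the comments in |
|      |  * bfq_bfqq_handle_idle_busy_switch). |
|      |  */ |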
| 1596 | static bool next_queue_may_preempt(struct bfq_data *bfqd) |
| 1597 | { |
| 1598 | struct bfq_sched_data *sd = &bfqd->sched_data; |
| 1599 | |
| 1600 | return sd->next_in_service != sd->in_service_entity; |
| 1601 | } |
| 1602 | |
| 1604 | /* |
| 1605 | * Get next queue for service. |
| 1606 | */ |
| 1607 | static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) |
| 1608 | { |
| 1609 | struct bfq_entity *entity = NULL; |
| 1610 | struct bfq_sched_data *sd; |
| 1611 | struct bfq_queue *bfqq; |
| 1612 | |
| 1613 | if (bfqd->busy_queues == 0) |
| 1614 | return NULL; |
| 1615 | |
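|      | /* |
|      |  * Walk down the scheduling hierarchy from the root sched_data: at |
|      |  * each level extract the next entity to serve and reset its service |
|      |  * counter, until a leaf entity, i.e., a bfq_queue, is reached. |
|      |  */ |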
| 1616 | sd = &bfqd->sched_data; |
| 1617 | for (; sd ; sd = entity->my_sched_data) { |
| 1618 | entity = bfq_lookup_next_entity(sd, 1, bfqd); |
| 1619 | entity->service = 0; |
| 1620 | } |
| 1621 | |
| 1622 | bfqq = bfq_entity_to_bfqq(entity); |
| 1623 | |
| 1624 | return bfqq; |
| 1625 | } |
| 1626 | |
| 1627 | static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) |
| 1628 | { |
| 1629 | struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; |
| 1630 | struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; |
| 1631 | |
| 1632 | if (bfqd->in_service_bic) { |
| 1633 | put_io_context(bfqd->in_service_bic->icq.ioc); |
| 1634 | bfqd->in_service_bic = NULL; |
| 1635 | } |
| 1636 | |
| 1637 | bfq_clear_bfqq_wait_request(in_serv_bfqq); |
| 1638 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); |
| 1639 | bfqd->in_service_queue = NULL; |
| 1640 | |
| 1641 | /* |
| 1642 | * in_serv_entity is no longer in service, so, if it is in no |
| 1643 | * service tree either, then release the service reference to |
| 1644 | * the queue it represents (taken with bfq_get_entity). |
| 1645 | */ |
| 1646 | if (!in_serv_entity->on_st) |
| 1647 | bfq_put_queue(in_serv_bfqq); |
| 1648 | } |
| 1649 | |
| 1650 | static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 1651 | int requeue) |
| 1652 | { |
| 1653 | struct bfq_entity *entity = &bfqq->entity; |
| 1654 | |
| 1655 | bfq_deactivate_entity(entity, requeue); |
| 1656 | } |
| 1657 | |
| 1658 | static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
| 1659 | { |
| 1660 | struct bfq_entity *entity = &bfqq->entity; |
| 1661 | |
| 1662 | bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq)); |
| 1663 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
| 1664 | } |
| 1665 | |
| 1666 | /* |
| 1667 | * Called when the bfqq no longer has requests pending, remove it from |
| 1668 | * the service tree. |
| 1669 | */ |
| 1670 | static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 1671 | int requeue) |
| 1672 | { |
| 1673 | bfq_log_bfqq(bfqd, bfqq, "del from busy"); |
| 1674 | |
| 1675 | bfq_clear_bfqq_busy(bfqq); |
| 1676 | |
| 1677 | bfqd->busy_queues--; |
| 1678 | |
| 1679 | bfq_deactivate_bfqq(bfqd, bfqq, requeue); |
| 1680 | } |
| 1681 | |
| 1682 | /* |
| 1683 | * Called when an inactive queue receives a new request. |
| 1684 | */ |
| 1685 | static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
| 1686 | { |
| 1687 | bfq_log_bfqq(bfqd, bfqq, "add to busy"); |
| 1688 | |
| 1689 | bfq_activate_bfqq(bfqd, bfqq); |
| 1690 | |
| 1691 | bfq_mark_bfqq_busy(bfqq); |
| 1692 | bfqd->busy_queues++; |
| 1693 | } |
| 1694 | |
| 1695 | static void bfq_init_entity(struct bfq_entity *entity) |
| 1696 | { |
| 1697 | struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); |
| 1698 | |
| 1699 | entity->weight = entity->new_weight; |
| 1700 | entity->orig_weight = entity->new_weight; |
| 1701 | |
| 1702 | bfqq->ioprio = bfqq->new_ioprio; |
| 1703 | bfqq->ioprio_class = bfqq->new_ioprio_class; |
| 1704 | |
| 1705 | entity->sched_data = &bfqq->bfqd->sched_data; |
| 1706 | } |
| 1707 | |
| 1708 | #define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) |
| 1709 | #define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) |
| 1710 | |
| 1711 | #define bfq_sample_valid(samples) ((samples) > 80) |
| 1712 | |
| 1713 | /* |
| 1714 | * Schedule a run of the queue if there are requests pending and no |
| 1715 | * request already in the driver that will restart queueing. |
| 1716 | */ |
| 1717 | static void bfq_schedule_dispatch(struct bfq_data *bfqd) |
| 1718 | { |
| 1719 | if (bfqd->queued != 0) { |
| 1720 | bfq_log(bfqd, "schedule dispatch"); |
| 1721 | blk_mq_run_hw_queues(bfqd->queue, true); |
| 1722 | } |
| 1723 | } |
| 1724 | |
| 1725 | /* |
| 1726 | * Lifted from AS - choose which of rq1 and rq2 is best served now. |
| 1727 | * We choose the request that is closest to the head right now. Distance |
| 1728 | * behind the head is penalized and only allowed to a certain extent. |
| 1729 | */ |
| 1730 | static struct request *bfq_choose_req(struct bfq_data *bfqd, |
| 1731 | struct request *rq1, |
| 1732 | struct request *rq2, |
| 1733 | sector_t last) |
| 1734 | { |
| 1735 | sector_t s1, s2, d1 = 0, d2 = 0; |
| 1736 | unsigned long back_max; |
| 1737 | #define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ |
| 1738 | #define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ |
| 1739 | unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ |
| 1740 | |
| 1741 | if (!rq1 || rq1 == rq2) |
| 1742 | return rq2; |
| 1743 | if (!rq2) |
| 1744 | return rq1; |
| 1745 | |
| 1746 | if (rq_is_sync(rq1) && !rq_is_sync(rq2)) |
| 1747 | return rq1; |
| 1748 | else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) |
| 1749 | return rq2; |
| 1750 | if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) |
| 1751 | return rq1; |
| 1752 | else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) |
| 1753 | return rq2; |
| 1754 | |
| 1755 | s1 = blk_rq_pos(rq1); |
| 1756 | s2 = blk_rq_pos(rq2); |
| 1757 | |
| 1758 | /* |
| 1759 | * By definition, 1KiB is 2 sectors. |
| 1760 | */ |
| 1761 | back_max = bfqd->bfq_back_max * 2; |
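|      | /* |
|      |  * For instance, a bfq_back_max of 16 (KiB) yields back_max = 32 |
|      |  * sectors as the maximum allowed backward distance. |
|      |  */ |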
| 1762 | |
| 1763 | /* |
| 1764 | * Strict one way elevator _except_ in the case where we allow |
| 1765 | * short backward seeks which are biased as twice the cost of a |
| 1766 | * similar forward seek. |
| 1767 | */ |
| 1768 | if (s1 >= last) |
| 1769 | d1 = s1 - last; |
| 1770 | else if (s1 + back_max >= last) |
| 1771 | d1 = (last - s1) * bfqd->bfq_back_penalty; |
| 1772 | else |
| 1773 | wrap |= BFQ_RQ1_WRAP; |
| 1774 | |
| 1775 | if (s2 >= last) |
| 1776 | d2 = s2 - last; |
| 1777 | else if (s2 + back_max >= last) |
| 1778 | d2 = (last - s2) * bfqd->bfq_back_penalty; |
| 1779 | else |
| 1780 | wrap |= BFQ_RQ2_WRAP; |
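|      | /* |
|      |  * Illustrative numbers: with last = 1000 and back_max = 100, a |
|      |  * request at sector 1050 gets d = 50, a request at sector 950 gets |
|      |  * d = 50 * bfq_back_penalty, and a request at sector 800 is just |
|      |  * marked as wrapping. |
|      |  */ |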
| 1781 | |
| 1782 | /* Found required data */ |
| 1783 | |
| 1784 | /* |
| 1785 | * By doing switch() on the bit mask "wrap" we avoid having to |
| 1786 | * check two variables for all permutations: --> faster! |
| 1787 | */ |
| 1788 | switch (wrap) { |
| 1789 | case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ |
| 1790 | if (d1 < d2) |
| 1791 | return rq1; |
| 1792 | else if (d2 < d1) |
| 1793 | return rq2; |
| 1794 | |
| 1795 | if (s1 >= s2) |
| 1796 | return rq1; |
| 1797 | else |
| 1798 | return rq2; |
| 1799 | |
| 1800 | case BFQ_RQ2_WRAP: |
| 1801 | return rq1; |
| 1802 | case BFQ_RQ1_WRAP: |
| 1803 | return rq2; |
| 1804 | case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ |
| 1805 | default: |
| 1806 | /* |
| 1807 | * Since both rqs are wrapped, |
| 1808 | * start with the one that's further behind head |
| 1809 | * (--> only *one* back seek required), |
| 1810 | * since back seek takes more time than forward. |
| 1811 | */ |
| 1812 | if (s1 <= s2) |
| 1813 | return rq1; |
| 1814 | else |
| 1815 | return rq2; |
| 1816 | } |
| 1817 | } |
| 1818 | |
| 1819 | /* |
| 1820 | * Return expired entry, or NULL to just start from scratch in rbtree. |
| 1821 | */ |
| 1822 | static struct request *bfq_check_fifo(struct bfq_queue *bfqq, |
| 1823 | struct request *last) |
| 1824 | { |
| 1825 | struct request *rq; |
| 1826 | |
| 1827 | if (bfq_bfqq_fifo_expire(bfqq)) |
| 1828 | return NULL; |
| 1829 | |
| 1830 | bfq_mark_bfqq_fifo_expire(bfqq); |
| 1831 | |
| 1832 | rq = rq_entry_fifo(bfqq->fifo.next); |
| 1833 | |
| 1834 | if (rq == last || ktime_get_ns() < rq->fifo_time) |
| 1835 | return NULL; |
| 1836 | |
| 1837 | bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); |
| 1838 | return rq; |
| 1839 | } |
| 1840 | |
| 1841 | static struct request *bfq_find_next_rq(struct bfq_data *bfqd, |
| 1842 | struct bfq_queue *bfqq, |
| 1843 | struct request *last) |
| 1844 | { |
| 1845 | struct rb_node *rbnext = rb_next(&last->rb_node); |
| 1846 | struct rb_node *rbprev = rb_prev(&last->rb_node); |
| 1847 | struct request *next, *prev = NULL; |
| 1848 | |
| 1849 | /* Follow expired path, else get first next available. */ |
| 1850 | next = bfq_check_fifo(bfqq, last); |
| 1851 | if (next) |
| 1852 | return next; |
| 1853 | |
| 1854 | if (rbprev) |
| 1855 | prev = rb_entry_rq(rbprev); |
| 1856 | |
| 1857 | if (rbnext) |
| 1858 | next = rb_entry_rq(rbnext); |
| 1859 | else { |
| 1860 | rbnext = rb_first(&bfqq->sort_list); |
| 1861 | if (rbnext && rbnext != &last->rb_node) |
| 1862 | next = rb_entry_rq(rbnext); |
| 1863 | } |
| 1864 | |
| 1865 | return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); |
| 1866 | } |
| 1867 | |
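|      | /* |
|      |  * Amount of service to charge a queue for the dispatch of a request: |
|      |  * in this version it is simply the request size in sectors, with no |
|      |  * extra charging factor. |
|      |  */ |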
| 1868 | static unsigned long bfq_serv_to_charge(struct request *rq, |
| 1869 | struct bfq_queue *bfqq) |
| 1870 | { |
| 1871 | return blk_rq_sectors(rq); |
| 1872 | } |
| 1873 | |
| 1874 | /** |
| 1875 | * bfq_updated_next_req - update the queue after a new next_rq selection. |
| 1876 | * @bfqd: the device data the queue belongs to. |
| 1877 | * @bfqq: the queue to update. |
| 1878 | * |
| 1879 | * If the first request of a queue changes we make sure that the queue |
| 1880 | * has enough budget to serve at least its first request (if the |
| 1881 | * request has grown). We do this because, if the queue does not have |
| 1882 | * enough budget for its first request, it has to go through two dispatch |
| 1883 | * rounds to actually get it dispatched. |
| 1884 | */ |
| 1885 | static void bfq_updated_next_req(struct bfq_data *bfqd, |
| 1886 | struct bfq_queue *bfqq) |
| 1887 | { |
| 1888 | struct bfq_entity *entity = &bfqq->entity; |
| 1889 | struct request *next_rq = bfqq->next_rq; |
| 1890 | unsigned long new_budget; |
| 1891 | |
| 1892 | if (!next_rq) |
| 1893 | return; |
| 1894 | |
| 1895 | if (bfqq == bfqd->in_service_queue) |
| 1896 | /* |
| 1897 | * In order not to break guarantees, budgets cannot be |
| 1898 | * changed after an entity has been selected. |
| 1899 | */ |
| 1900 | return; |
| 1901 | |
| 1902 | new_budget = max_t(unsigned long, bfqq->max_budget, |
| 1903 | bfq_serv_to_charge(next_rq, bfqq)); |
| 1904 | if (entity->budget != new_budget) { |
| 1905 | entity->budget = new_budget; |
| 1906 | bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", |
| 1907 | new_budget); |
| 1908 | bfq_activate_bfqq(bfqd, bfqq); |
| 1909 | } |
| 1910 | } |
| 1911 | |
| 1912 | static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) |
| 1913 | { |
| 1914 | struct bfq_entity *entity = &bfqq->entity; |
| 1915 | |
| 1916 | return entity->budget - entity->service; |
| 1917 | } |
| 1918 | |
| 1919 | /* |
| 1920 | * If enough samples have been computed, return the current max budget |
| 1921 | * stored in bfqd, which is dynamically updated according to the |
| 1922 | * estimated disk peak rate; otherwise return the default max budget |
| 1923 | */ |
| 1924 | static int bfq_max_budget(struct bfq_data *bfqd) |
| 1925 | { |
| 1926 | if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
| 1927 | return bfq_default_max_budget; |
| 1928 | else |
| 1929 | return bfqd->bfq_max_budget; |
| 1930 | } |
| 1931 | |
| 1932 | /* |
| 1933 | * Return min budget, which is a fraction of the current or default |
| 1934 | * max budget (trying with 1/32) |
| 1935 | */ |
| 1936 | static int bfq_min_budget(struct bfq_data *bfqd) |
| 1937 | { |
| 1938 | if (bfqd->budgets_assigned < bfq_stats_min_budgets) |
| 1939 | return bfq_default_max_budget / 32; |
| 1940 | else |
| 1941 | return bfqd->bfq_max_budget / 32; |
| 1942 | } |
| 1943 | |
| 1944 | static void bfq_bfqq_expire(struct bfq_data *bfqd, |
| 1945 | struct bfq_queue *bfqq, |
| 1946 | bool compensate, |
| 1947 | enum bfqq_expiration reason); |
| 1948 | |
| 1949 | /* |
| 1950 | * The next function, invoked after the input queue bfqq switches from |
| 1951 | * idle to busy, updates the budget of bfqq. The function also tells |
| 1952 | * whether the in-service queue should be expired, by returning |
| 1953 | * true. The purpose of expiring the in-service queue is to give bfqq |
| 1954 | * the chance to possibly preempt the in-service queue, and the reason |
| 1955 | * for preempting the in-service queue is to achieve the following |
| 1956 | * goal: guarantee to bfqq its reserved bandwidth even if bfqq has |
| 1957 | * expired because it has remained idle. |
| 1958 | * |
| 1959 | * In particular, bfqq may have expired for one of the following two |
| 1960 | * reasons: |
| 1961 | * |
| 1962 | * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling |
| 1963 | * and did not make it to issue a new request before its last |
| 1964 | * request was served; |
| 1965 | * |
| 1966 | * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue |
| 1967 | * a new request before the expiration of the idling-time. |
| 1968 | * |
| 1969 | * Even if bfqq has expired for one of the above reasons, the process |
| 1970 | * associated with the queue may be however issuing requests greedily, |
| 1971 | * and thus be sensitive to the bandwidth it receives (bfqq may have |
| 1972 | * remained idle for other reasons: CPU high load, bfqq not enjoying |
| 1973 | * idling, I/O throttling somewhere in the path from the process to |
| 1974 | * the I/O scheduler, ...). But if, after every expiration for one of |
| 1975 | * the above two reasons, bfqq has to wait for the service of at least |
| 1976 | * one full budget of another queue before being served again, then |
| 1977 | * bfqq is likely to get a much lower bandwidth or resource time than |
| 1978 | * its reserved ones. To address this issue, two countermeasures need |
| 1979 | * to be taken. |
| 1980 | * |
| 1981 | * First, the budget and the timestamps of bfqq need to be updated in |
| 1982 | * a special way on bfqq reactivation: they need to be updated as if |
| 1983 | * bfqq did not remain idle and did not expire. In fact, if they are |
| 1984 | * computed as if bfqq expired and remained idle until reactivation, |
| 1985 | * then the process associated with bfqq is treated as if, instead of |
| 1986 | * being greedy, it stopped issuing requests when bfqq remained idle, |
| 1987 | * and restarts issuing requests only on this reactivation. In other |
| 1988 | * words, the scheduler does not help the process recover the "service |
| 1989 | * hole" between bfqq expiration and reactivation. As a consequence, |
| 1990 | * the process receives a lower bandwidth than its reserved one. In |
| 1991 | * contrast, to recover this hole, the budget must be updated as if |
| 1992 | * bfqq was not expired at all before this reactivation, i.e., it must |
| 1993 | * be set to the value of the remaining budget when bfqq was |
| 1994 | * expired. Along the same line, timestamps need to be assigned the |
| 1995 | * value they had the last time bfqq was selected for service, i.e., |
| 1996 | * before last expiration. Thus timestamps need to be back-shifted |
| 1997 | * with respect to their normal computation (see [1] for more details |
| 1998 | * on this tricky aspect). |
| 1999 | * |
| 2000 | * Secondly, to allow the process to recover the hole, the in-service |
| 2001 | * queue must be expired too, to give bfqq the chance to preempt it |
| 2002 | * immediately. In fact, if bfqq has to wait for a full budget of the |
| 2003 | * in-service queue to be completed, then it may become impossible to |
| 2004 | * let the process recover the hole, even if the back-shifted |
| 2005 | * timestamps of bfqq are lower than those of the in-service queue. If |
| 2006 | * this happens for most or all of the holes, then the process may not |
| 2007 | * receive its reserved bandwidth. In this respect, it is worth noting |
| 2008 | * that, since the service of outstanding requests is not preemptible, |
| 2009 | * a small fraction of the holes may however be unrecoverable, thereby |
| 2010 | * causing a small loss of bandwidth. |
| 2011 | * |
| 2012 | * The last important point is detecting whether bfqq does need this |
| 2013 | * bandwidth recovery. In this respect, the next function deems the |
| 2014 | * process associated with bfqq greedy, and thus allows it to recover |
| 2015 | * the hole, if: 1) the process is waiting for the arrival of a new |
| 2016 | * request (which implies that bfqq expired for one of the above two |
| 2017 | * reasons), and 2) such a request has arrived soon. The first |
| 2018 | * condition is controlled through the flag non_blocking_wait_rq, |
| 2019 | * while the second through the flag arrived_in_time. If both |
| 2020 | * conditions hold, then the function computes the budget in the |
| 2021 | * above-described special way, and signals that the in-service queue |
| 2022 | * should be expired. Timestamp back-shifting is done later in |
| 2023 | * __bfq_activate_entity. |
| 2024 | */ |
| 2025 | static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, |
| 2026 | struct bfq_queue *bfqq, |
| 2027 | bool arrived_in_time) |
| 2028 | { |
| 2029 | struct bfq_entity *entity = &bfqq->entity; |
| 2030 | |
| 2031 | if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { |
| 2032 | /* |
| 2033 | * We do not clear the flag non_blocking_wait_rq here, as |
| 2034 | * the latter is used in bfq_activate_bfqq to signal |
| 2035 | * that timestamps need to be back-shifted (and is |
| 2036 | * cleared right after). |
| 2037 | */ |
| 2038 | |
| 2039 | /* |
| 2040 | * In the next assignment we rely on the fact that |
| 2041 | * neither entity->service nor entity->budget is |
| 2042 | * updated on expiration if bfqq is empty (see |
| 2043 | * __bfq_bfqq_recalc_budget). Thus both quantities |
| 2044 | * remain unchanged after such an expiration, and the |
| 2045 | * following statement therefore assigns to |
| 2046 | * entity->budget the remaining budget on such an |
| 2047 | * expiration. For clarity, entity->service is not |
| 2048 | * updated on expiration in any case, and, in normal |
| 2049 | * operation, is reset only when bfqq is selected for |
| 2050 | * service (see bfq_get_next_queue). |
| 2051 | */ |
| 2052 | entity->budget = min_t(unsigned long, |
| 2053 | bfq_bfqq_budget_left(bfqq), |
| 2054 | bfqq->max_budget); |
| 2055 | |
| 2056 | return true; |
| 2057 | } |
| 2058 | |
| 2059 | entity->budget = max_t(unsigned long, bfqq->max_budget, |
| 2060 | bfq_serv_to_charge(bfqq->next_rq, bfqq)); |
| 2061 | bfq_clear_bfqq_non_blocking_wait_rq(bfqq); |
| 2062 | return false; |
| 2063 | } |
| 2064 | |
| 2065 | static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, |
| 2066 | struct bfq_queue *bfqq, |
| 2067 | struct request *rq) |
| 2068 | { |
| 2069 | bool bfqq_wants_to_preempt, |
| 2070 | /* |
| 2071 | * See the comments on |
| 2072 | * bfq_bfqq_update_budg_for_activation for |
| 2073 | * details on the usage of the next variable. |
| 2074 | */ |
| 2075 | arrived_in_time = ktime_get_ns() <= |
| 2076 | bfqq->ttime.last_end_request + |
| 2077 | bfqd->bfq_slice_idle * 3; |
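|      | /* |
|      |  * Here "in time" means within three idle-slice periods after the |
|      |  * completion of the last request of bfqq. |
|      |  */ |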
| 2078 | |
| 2079 | /* |
| 2080 | * Update budget and check whether bfqq may want to preempt |
| 2081 | * the in-service queue. |
| 2082 | */ |
| 2083 | bfqq_wants_to_preempt = |
| 2084 | bfq_bfqq_update_budg_for_activation(bfqd, bfqq, |
| 2085 | arrived_in_time); |
| 2086 | |
| 2087 | if (!bfq_bfqq_IO_bound(bfqq)) { |
| 2088 | if (arrived_in_time) { |
| 2089 | bfqq->requests_within_timer++; |
| 2090 | if (bfqq->requests_within_timer >= |
| 2091 | bfqd->bfq_requests_within_timer) |
| 2092 | bfq_mark_bfqq_IO_bound(bfqq); |
| 2093 | } else |
| 2094 | bfqq->requests_within_timer = 0; |
| 2095 | } |
| 2096 | |
| 2097 | bfq_add_bfqq_busy(bfqd, bfqq); |
| 2098 | |
| 2099 | /* |
| 2100 | * Expire in-service queue only if preemption may be needed |
| 2101 | * for guarantees. In this respect, the function |
| 2102 | * next_queue_may_preempt just checks a simple, necessary |
| 2103 | * condition, and not a sufficient condition based on |
| 2104 | * timestamps. In fact, for the latter condition to be |
| 2105 | * evaluated, timestamps would need first to be updated, and |
| 2106 | * this operation is quite costly (see the comments on the |
| 2107 | * function bfq_bfqq_update_budg_for_activation). |
| 2108 | */ |
| 2109 | if (bfqd->in_service_queue && bfqq_wants_to_preempt && |
| 2110 | next_queue_may_preempt(bfqd)) |
| 2111 | bfq_bfqq_expire(bfqd, bfqd->in_service_queue, |
| 2112 | false, BFQQE_PREEMPTED); |
| 2113 | } |
| 2114 | |
| 2115 | static void bfq_add_request(struct request *rq) |
| 2116 | { |
| 2117 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
| 2118 | struct bfq_data *bfqd = bfqq->bfqd; |
| 2119 | struct request *next_rq, *prev; |
| 2120 | |
| 2121 | bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); |
| 2122 | bfqq->queued[rq_is_sync(rq)]++; |
| 2123 | bfqd->queued++; |
| 2124 | |
| 2125 | elv_rb_add(&bfqq->sort_list, rq); |
| 2126 | |
| 2127 | /* |
| 2128 | * Check if this request is a better next-serve candidate. |
| 2129 | */ |
| 2130 | prev = bfqq->next_rq; |
| 2131 | next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); |
| 2132 | bfqq->next_rq = next_rq; |
| 2133 | |
| 2134 | if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ |
| 2135 | bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq); |
| 2136 | else if (prev != bfqq->next_rq) |
| 2137 | bfq_updated_next_req(bfqd, bfqq); |
| 2138 | } |
| 2139 | |
| 2140 | static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, |
| 2141 | struct bio *bio, |
| 2142 | struct request_queue *q) |
| 2143 | { |
| 2144 | struct bfq_queue *bfqq = bfqd->bio_bfqq; |
| 2145 | |
| 2147 | if (bfqq) |
| 2148 | return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); |
| 2149 | |
| 2150 | return NULL; |
| 2151 | } |
| 2152 | |
| 2153 | #if 0 /* Still not clear if we can do without next two functions */ |
| 2154 | static void bfq_activate_request(struct request_queue *q, struct request *rq) |
| 2155 | { |
| 2156 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 2157 | |
| 2158 | bfqd->rq_in_driver++; |
| 2159 | bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); |
| 2160 | bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", |
| 2161 | (unsigned long long)bfqd->last_position); |
| 2162 | } |
| 2163 | |
| 2164 | static void bfq_deactivate_request(struct request_queue *q, struct request *rq) |
| 2165 | { |
| 2166 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 2167 | |
| 2168 | bfqd->rq_in_driver--; |
| 2169 | } |
| 2170 | #endif |
| 2171 | |
| 2172 | static void bfq_remove_request(struct request_queue *q, |
| 2173 | struct request *rq) |
| 2174 | { |
| 2175 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
| 2176 | struct bfq_data *bfqd = bfqq->bfqd; |
| 2177 | const int sync = rq_is_sync(rq); |
| 2178 | |
| 2179 | if (bfqq->next_rq == rq) { |
| 2180 | bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); |
| 2181 | bfq_updated_next_req(bfqd, bfqq); |
| 2182 | } |
| 2183 | |
| 2184 | if (rq->queuelist.prev != &rq->queuelist) |
| 2185 | list_del_init(&rq->queuelist); |
| 2186 | bfqq->queued[sync]--; |
| 2187 | bfqd->queued--; |
| 2188 | elv_rb_del(&bfqq->sort_list, rq); |
| 2189 | |
| 2190 | elv_rqhash_del(q, rq); |
| 2191 | if (q->last_merge == rq) |
| 2192 | q->last_merge = NULL; |
| 2193 | |
| 2194 | if (RB_EMPTY_ROOT(&bfqq->sort_list)) { |
| 2195 | bfqq->next_rq = NULL; |
| 2196 | |
| 2197 | if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { |
| 2198 | bfq_del_bfqq_busy(bfqd, bfqq, 1); |
| 2199 | /* |
| 2200 | * bfqq emptied. In normal operation, when |
| 2201 | * bfqq is empty, bfqq->entity.service and |
| 2202 | * bfqq->entity.budget must contain, |
| 2203 | * respectively, the service received and the |
| 2204 | * budget used last time bfqq emptied. These |
| 2205 | * facts do not hold in this case, as at least |
| 2206 | * this last removal occurred while bfqq is |
| 2207 | * not in service. To avoid inconsistencies, |
| 2208 | * reset both bfqq->entity.service and |
| 2209 | * bfqq->entity.budget, if bfqq has still a |
| 2210 | * process that may issue I/O requests to it. |
| 2211 | */ |
| 2212 | bfqq->entity.budget = bfqq->entity.service = 0; |
| 2213 | } |
| 2214 | } |
| 2215 | |
| 2216 | if (rq->cmd_flags & REQ_META) |
| 2217 | bfqq->meta_pending--; |
| 2218 | } |
| 2219 | |
| 2220 | static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) |
| 2221 | { |
| 2222 | struct request_queue *q = hctx->queue; |
| 2223 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 2224 | struct request *free = NULL; |
| 2225 | /* |
| 2226 | * bfq_bic_lookup grabs the queue_lock: invoke it now and |
| 2227 | * store its return value for later use, to avoid nesting |
| 2228 | * queue_lock inside the bfqd->lock. We assume that the bic |
| 2229 | * returned by bfq_bic_lookup does not go away before |
| 2230 | * bfqd->lock is taken. |
| 2231 | */ |
| 2232 | struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); |
| 2233 | bool ret; |
| 2234 | |
| 2235 | spin_lock_irq(&bfqd->lock); |
| 2236 | |
| 2237 | if (bic) |
| 2238 | bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); |
| 2239 | else |
| 2240 | bfqd->bio_bfqq = NULL; |
| 2241 | bfqd->bio_bic = bic; |
| 2242 | |
| 2243 | ret = blk_mq_sched_try_merge(q, bio, &free); |
| 2244 | |
| 2245 | if (free) |
| 2246 | blk_mq_free_request(free); |
| 2247 | spin_unlock_irq(&bfqd->lock); |
| 2248 | |
| 2249 | return ret; |
| 2250 | } |
| 2251 | |
| 2252 | static int bfq_request_merge(struct request_queue *q, struct request **req, |
| 2253 | struct bio *bio) |
| 2254 | { |
| 2255 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 2256 | struct request *__rq; |
| 2257 | |
| 2258 | __rq = bfq_find_rq_fmerge(bfqd, bio, q); |
| 2259 | if (__rq && elv_bio_merge_ok(__rq, bio)) { |
| 2260 | *req = __rq; |
| 2261 | return ELEVATOR_FRONT_MERGE; |
| 2262 | } |
| 2263 | |
| 2264 | return ELEVATOR_NO_MERGE; |
| 2265 | } |
| 2266 | |
| 2267 | static void bfq_request_merged(struct request_queue *q, struct request *req, |
| 2268 | enum elv_merge type) |
| 2269 | { |
| 2270 | if (type == ELEVATOR_FRONT_MERGE && |
| 2271 | rb_prev(&req->rb_node) && |
| 2272 | blk_rq_pos(req) < |
| 2273 | blk_rq_pos(container_of(rb_prev(&req->rb_node), |
| 2274 | struct request, rb_node))) { |
| 2275 | struct bfq_queue *bfqq = RQ_BFQQ(req); |
| 2276 | struct bfq_data *bfqd = bfqq->bfqd; |
| 2277 | struct request *prev, *next_rq; |
| 2278 | |
| 2279 | /* Reposition request in its sort_list */ |
| 2280 | elv_rb_del(&bfqq->sort_list, req); |
| 2281 | elv_rb_add(&bfqq->sort_list, req); |
| 2282 | |
| 2283 | /* Choose next request to be served for bfqq */ |
| 2284 | prev = bfqq->next_rq; |
| 2285 | next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, |
| 2286 | bfqd->last_position); |
| 2287 | bfqq->next_rq = next_rq; |
| 2288 | /* |
| 2289 | * If next_rq changes, update the queue's budget to fit |
| 2290 | * the new request. |
| 2291 | */ |
| 2292 | if (prev != bfqq->next_rq) |
| 2293 | bfq_updated_next_req(bfqd, bfqq); |
| 2294 | } |
| 2295 | } |
| 2296 | |
| 2297 | static void bfq_requests_merged(struct request_queue *q, struct request *rq, |
| 2298 | struct request *next) |
| 2299 | { |
| 2300 | struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); |
| 2301 | |
| 2302 | if (!RB_EMPTY_NODE(&rq->rb_node)) |
| 2303 | return; |
| 2304 | spin_lock_irq(&bfqq->bfqd->lock); |
| 2305 | |
| 2306 | /* |
| 2307 | * If next and rq belong to the same bfq_queue and next is older |
| 2308 | * than rq, then reposition rq in the fifo (by substituting next |
| 2309 | * with rq). Otherwise, if next and rq belong to different |
| 2310 | * bfq_queues, never reposition rq: in fact, we would have to |
| 2311 | * reposition it with respect to next's position in its own fifo, |
| 2312 | * which would most certainly be too expensive with respect to |
| 2313 | * the benefits. |
| 2314 | */ |
| 2315 | if (bfqq == next_bfqq && |
| 2316 | !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && |
| 2317 | next->fifo_time < rq->fifo_time) { |
| 2318 | list_del_init(&rq->queuelist); |
| 2319 | list_replace_init(&next->queuelist, &rq->queuelist); |
| 2320 | rq->fifo_time = next->fifo_time; |
| 2321 | } |
| 2322 | |
| 2323 | if (bfqq->next_rq == next) |
| 2324 | bfqq->next_rq = rq; |
| 2325 | |
| 2326 | bfq_remove_request(q, next); |
| 2327 | |
| 2328 | spin_unlock_irq(&bfqq->bfqd->lock); |
| 2329 | } |
| 2330 | |
| 2331 | static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, |
| 2332 | struct bio *bio) |
| 2333 | { |
| 2334 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 2335 | bool is_sync = op_is_sync(bio->bi_opf); |
| 2336 | struct bfq_queue *bfqq = bfqd->bio_bfqq; |
| 2337 | |
| 2338 | /* |
| 2339 | * Disallow merge of a sync bio into an async request. |
| 2340 | */ |
| 2341 | if (is_sync && !rq_is_sync(rq)) |
| 2342 | return false; |
| 2343 | |
| 2344 | /* |
| 2345 | * Lookup the bfqq that this bio will be queued with. Allow |
| 2346 | * merge only if rq is queued there. |
| 2347 | */ |
| 2348 | if (!bfqq) |
| 2349 | return false; |
| 2350 | |
| 2351 | return bfqq == RQ_BFQQ(rq); |
| 2352 | } |
| 2353 | |
| 2354 | static void __bfq_set_in_service_queue(struct bfq_data *bfqd, |
| 2355 | struct bfq_queue *bfqq) |
| 2356 | { |
| 2357 | if (bfqq) { |
| 2358 | bfq_mark_bfqq_budget_new(bfqq); |
| 2359 | bfq_clear_bfqq_fifo_expire(bfqq); |
| 2360 | |
| 2361 | bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; |
| 2362 | |
| 2363 | bfq_log_bfqq(bfqd, bfqq, |
| 2364 | "set_in_service_queue, cur-budget = %d", |
| 2365 | bfqq->entity.budget); |
| 2366 | } |
| 2367 | |
| 2368 | bfqd->in_service_queue = bfqq; |
| 2369 | } |
| 2370 | |
| 2371 | /* |
| 2372 | * Get and set a new queue for service. |
| 2373 | */ |
| 2374 | static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) |
| 2375 | { |
| 2376 | struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); |
| 2377 | |
| 2378 | __bfq_set_in_service_queue(bfqd, bfqq); |
| 2379 | return bfqq; |
| 2380 | } |
| 2381 | |
| 2382 | /* |
| 2383 | * bfq_default_budget - return the default budget for @bfqq on @bfqd. |
| 2384 | * @bfqd: the device descriptor. |
| 2385 | * @bfqq: the queue to consider. |
| 2386 | * |
| 2387 | * We use 3/4 of the @bfqd maximum budget as the default value |
| 2388 | * for the max_budget field of the queues. This lets the feedback |
| 2389 | * mechanism start from some middle ground, then the behavior |
| 2390 | * of the process will drive the heuristics towards high values, if |
| 2391 | * it behaves as a greedy sequential reader, or towards small values |
| 2392 | * if it shows a more intermittent behavior. |
| 2393 | */ |
| 2394 | static unsigned long bfq_default_budget(struct bfq_data *bfqd, |
| 2395 | struct bfq_queue *bfqq) |
| 2396 | { |
| 2397 | unsigned long budget; |
| 2398 | |
| 2399 | /* |
| 2400 | * When we need an estimate of the peak rate we must avoid |
| 2401 | * giving budgets that are too short because of previous |
| 2402 | * measurements. So, for the first 10 assignments use a |
| 2403 | * ``safe'' budget value. For those first assignments the |
| 2404 | * value of bfqd->budgets_assigned happens to be lower than 194. |
| 2405 | * See __bfq_set_in_service_queue for the formula by which |
| 2406 | * this field is computed. |
| 2407 | */ |
| 2408 | if (bfqd->budgets_assigned < 194 && bfqd->bfq_user_max_budget == 0) |
| 2409 | budget = bfq_default_max_budget; |
| 2410 | else |
| 2411 | budget = bfqd->bfq_max_budget; |
| 2412 | |
| 2413 | return budget - budget / 4; |
| 2414 | } |
| 2415 | |
| 2416 | static void bfq_arm_slice_timer(struct bfq_data *bfqd) |
| 2417 | { |
| 2418 | struct bfq_queue *bfqq = bfqd->in_service_queue; |
| 2419 | struct bfq_io_cq *bic; |
| 2420 | u32 sl; |
| 2421 | |
| 2422 | /* Processes have exited, don't wait. */ |
| 2423 | bic = bfqd->in_service_bic; |
| 2424 | if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) |
| 2425 | return; |
| 2426 | |
| 2427 | bfq_mark_bfqq_wait_request(bfqq); |
| 2428 | |
| 2429 | /* |
| 2430 | * We don't want to idle for seeks, but we do want to allow |
| 2431 | * fair distribution of slice time for a process doing back-to-back |
| 2432 | * seeks. So allow a little bit of time for it to submit a new rq. |
| 2433 | */ |
| 2434 | sl = bfqd->bfq_slice_idle; |
| 2435 | /* |
| 2436 | * Grant only minimum idle time if the queue is seeky. |
| 2437 | */ |
| 2438 | if (BFQQ_SEEKY(bfqq)) |
| 2439 | sl = min_t(u64, sl, BFQ_MIN_TT); |
| 2440 | |
| 2441 | bfqd->last_idling_start = ktime_get(); |
| 2442 | hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), |
| 2443 | HRTIMER_MODE_REL); |
| 2444 | } |
| 2445 | |
| 2446 | /* |
| 2447 | * Set the maximum time for the in-service queue to consume its |
| 2448 | * budget. This prevents seeky processes from lowering the disk |
| 2449 | * throughput (always guaranteed with a time slice scheme as in CFQ). |
| 2450 | */ |
| 2451 | static void bfq_set_budget_timeout(struct bfq_data *bfqd) |
| 2452 | { |
| 2453 | struct bfq_queue *bfqq = bfqd->in_service_queue; |
| 2454 | unsigned int timeout_coeff = bfqq->entity.weight / |
| 2455 | bfqq->entity.orig_weight; |
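|      | /* |
|      |  * The budget timeout is scaled by the ratio between the current and |
|      |  * the original weight of the queue (timeout_coeff equals 1 when the |
|      |  * two weights coincide). |
|      |  */ |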
| 2456 | |
| 2457 | bfqd->last_budget_start = ktime_get(); |
| 2458 | |
| 2459 | bfq_clear_bfqq_budget_new(bfqq); |
| 2460 | bfqq->budget_timeout = jiffies + |
| 2461 | bfqd->bfq_timeout * timeout_coeff; |
| 2462 | |
| 2463 | bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", |
| 2464 | jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); |
| 2465 | } |
| 2466 | |
| 2467 | /* |
| 2468 | * Remove request from internal lists. |
| 2469 | */ |
| 2470 | static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) |
| 2471 | { |
| 2472 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
| 2473 | |
| 2474 | /* |
| 2475 | * For consistency, the next instruction should have been |
| 2476 | * executed after removing the request from the queue and |
| 2477 | * dispatching it. We execute instead this instruction before |
| 2478 | * bfq_remove_request() (and hence introduce a temporary |
| 2479 | * inconsistency), for efficiency. In fact, should this |
| 2480 | * dispatch occur for a non in-service bfqq, this anticipated |
| 2481 | * increment prevents two counters related to bfqq->dispatched |
| 2482 | * from risking to be, first, uselessly decremented, and then |
| 2483 | * incremented again when the (new) value of bfqq->dispatched |
| 2484 | * happens to be taken into account. |
| 2485 | */ |
| 2486 | bfqq->dispatched++; |
| 2487 | |
| 2488 | bfq_remove_request(q, rq); |
| 2489 | } |
| 2490 | |
| 2491 | static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
| 2492 | { |
| 2493 | __bfq_bfqd_reset_in_service(bfqd); |
| 2494 | |
| 2495 | if (RB_EMPTY_ROOT(&bfqq->sort_list)) |
| 2496 | bfq_del_bfqq_busy(bfqd, bfqq, 1); |
| 2497 | else |
| 2498 | bfq_activate_bfqq(bfqd, bfqq); |
| 2499 | } |
| 2500 | |
| 2501 | /** |
| 2502 | * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. |
| 2503 | * @bfqd: device data. |
| 2504 | * @bfqq: queue to update. |
| 2505 | * @reason: reason for expiration. |
| 2506 | * |
| 2507 | * Handle the feedback on @bfqq budget at queue expiration. |
| 2508 | * See the body for detailed comments. |
| 2509 | */ |
| 2510 | static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, |
| 2511 | struct bfq_queue *bfqq, |
| 2512 | enum bfqq_expiration reason) |
| 2513 | { |
| 2514 | struct request *next_rq; |
| 2515 | int budget, min_budget; |
| 2516 | |
| 2517 | budget = bfqq->max_budget; |
| 2518 | min_budget = bfq_min_budget(bfqd); |
| 2519 | |
| 2520 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", |
| 2521 | bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); |
| 2522 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", |
| 2523 | budget, bfq_min_budget(bfqd)); |
| 2524 | bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", |
| 2525 | bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); |
| 2526 | |
| 2527 | if (bfq_bfqq_sync(bfqq)) { |
| 2528 | switch (reason) { |
| 2529 | /* |
| 2530 | * Caveat: in all the following cases we trade latency |
| 2531 | * for throughput. |
| 2532 | */ |
| 2533 | case BFQQE_TOO_IDLE: |
| 2534 | if (budget > min_budget + BFQ_BUDGET_STEP) |
| 2535 | budget -= BFQ_BUDGET_STEP; |
| 2536 | else |
| 2537 | budget = min_budget; |
| 2538 | break; |
| 2539 | case BFQQE_BUDGET_TIMEOUT: |
| 2540 | budget = bfq_default_budget(bfqd, bfqq); |
| 2541 | break; |
| 2542 | case BFQQE_BUDGET_EXHAUSTED: |
| 2543 | /* |
| 2544 | * The process still has backlog, and did not |
| 2545 | * let either the budget timeout or the disk |
| 2546 | * idling timeout expire. Hence it is not |
| 2547 | * seeky, has a short thinktime and may be |
| 2548 | * happy with a higher budget too. So |
| 2549 | * definitely increase the budget of this good |
| 2550 | * candidate to boost the disk throughput. |
| 2551 | */ |
| 2552 | budget = min(budget + 8 * BFQ_BUDGET_STEP, |
| 2553 | bfqd->bfq_max_budget); |
| 2554 | break; |
| 2555 | case BFQQE_NO_MORE_REQUESTS: |
| 2556 | /* |
| 2557 | * For queues that expire for this reason, it |
| 2558 | * is particularly important to keep the |
| 2559 | * budget close to the actual service they |
| 2560 | * need. Doing so reduces the timestamp |
| 2561 | * misalignment problem described in the |
| 2562 | * comments in the body of |
| 2563 | * __bfq_activate_entity. In fact, suppose |
| 2564 | * that a queue systematically expires for |
| 2565 | * BFQQE_NO_MORE_REQUESTS and presents a |
| 2566 | * new request in time to enjoy timestamp |
| 2567 | * back-shifting. The larger the budget of the |
| 2568 | * queue is with respect to the service the |
| 2569 | * queue actually requests in each service |
| 2570 | * slot, the more times the queue can be |
| 2571 | * reactivated with the same virtual finish |
| 2572 | * time. It follows that, even if this finish |
| 2573 | * time is pushed to the system virtual time |
| 2574 | * to reduce the consequent timestamp |
| 2575 | * misalignment, the queue unjustly enjoys for |
| 2576 | * many re-activations a lower finish time |
| 2577 | * than all newly activated queues. |
| 2578 | * |
| 2579 | * The service needed by bfqq is measured |
| 2580 | * quite precisely by bfqq->entity.service. |
| 2581 | * Since bfqq does not enjoy device idling, |
| 2582 | * bfqq->entity.service is equal to the number |
| 2583 | * of sectors that the process associated with |
| 2584 | * bfqq requested to read/write before waiting |
| 2585 | * for request completions, or blocking for |
| 2586 | * other reasons. |
| 2587 | */ |
| 2588 | budget = max_t(int, bfqq->entity.service, min_budget); |
| 2589 | break; |
| 2590 | default: |
| 2591 | return; |
| 2592 | } |
| 2593 | } else { |
| 2594 | /* |
| 2595 | * Async queues get always the maximum possible |
| 2596 | * budget, as for them we do not care about latency |
| 2597 | * (in addition, their ability to dispatch is limited |
| 2598 | * by the charging factor). |
| 2599 | */ |
| 2600 | budget = bfqd->bfq_max_budget; |
| 2601 | } |
| 2602 | |
| 2603 | bfqq->max_budget = budget; |
| 2604 | |
| 2605 | if (bfqd->budgets_assigned >= bfq_stats_min_budgets && |
| 2606 | !bfqd->bfq_user_max_budget) |
| 2607 | bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); |
| 2608 | |
| 2609 | /* |
| 2610 | * If there is still backlog, then assign a new budget, making |
| 2611 | * sure that it is large enough for the next request. Since |
| 2612 | * the finish time of bfqq must be kept in sync with the |
| 2613 | * budget, be sure to call __bfq_bfqq_expire() *after* this |
| 2614 | * update. |
| 2615 | * |
| 2616 | * If there is no backlog, then no need to update the budget; |
| 2617 | * it will be updated on the arrival of a new request. |
| 2618 | */ |
| 2619 | next_rq = bfqq->next_rq; |
| 2620 | if (next_rq) |
| 2621 | bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, |
| 2622 | bfq_serv_to_charge(next_rq, bfqq)); |
| 2623 | |
| 2624 | bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", |
| 2625 | next_rq ? blk_rq_sectors(next_rq) : 0, |
| 2626 | bfqq->entity.budget); |
| 2627 | } |
| 2628 | |
| 2629 | static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) |
| 2630 | { |
| 2631 | unsigned long max_budget; |
| 2632 | |
| 2633 | /* |
| 2634 | * The max_budget calculated when autotuning is equal to the |
| 2635 | * amount of sectors transferred in timeout at the estimated |
| 2636 | * peak rate. To get this value, peak_rate is, first, |
| 2637 | * multiplied by 1000, because timeout is measured in ms, |
| 2638 | * while peak_rate is measured in sectors/usecs. Then the |
| 2639 | * result of this multiplication is right-shifted by |
| 2640 | * BFQ_RATE_SHIFT, because peak_rate is equal to the value of |
| 2641 | * the peak rate left-shifted by BFQ_RATE_SHIFT. |
| 2642 | */ |
| 2643 | max_budget = (unsigned long)(peak_rate * 1000 * |
| 2644 | timeout >> BFQ_RATE_SHIFT); |
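|      | /* |
|      |  * Illustrative figures: for a device whose estimated peak rate |
|      |  * corresponds to about 0.2 sectors/usec (roughly 100 MB/s) and a |
|      |  * timeout of 125 ms, the result is about 0.2 * 1000 * 125 = 25000 |
|      |  * sectors, i.e., ~12.8 MB of service per budget. |
|      |  */ |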
| 2645 | |
| 2646 | return max_budget; |
| 2647 | } |
| 2648 | |
| 2649 | /* |
| 2650 | * In addition to updating the peak rate, checks whether the process |
| 2651 | * is "slow", and returns 1 if so. This slow flag is used, in addition |
| 2652 | * to the budget timeout, to reduce the amount of service provided to |
| 2653 | * seeky processes, and hence reduce their chances to lower the |
| 2654 | * throughput. See the code for more details. |
| 2655 | */ |
| 2656 | static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 2657 | bool compensate) |
| 2658 | { |
| 2659 | u64 bw, usecs, expected, timeout; |
| 2660 | ktime_t delta; |
| 2661 | int update = 0; |
| 2662 | |
| 2663 | if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) |
| 2664 | return false; |
| 2665 | |
| 2666 | if (compensate) |
| 2667 | delta = bfqd->last_idling_start; |
| 2668 | else |
| 2669 | delta = ktime_get(); |
| 2670 | delta = ktime_sub(delta, bfqd->last_budget_start); |
| 2671 | usecs = ktime_to_us(delta); |
| 2672 | |
| 2673 | /* don't use too short time intervals */ |
| 2674 | if (usecs < 1000) |
| 2675 | return false; |
| 2676 | |
| 2677 | /* |
| 2678 | * Calculate the bandwidth for the last slice. We use a 64 bit |
| 2679 | * value to store the peak rate, in sectors per usec in fixed |
| 2680 | * point math. We do so to have enough precision in the estimate |
| 2681 | * and to avoid overflows. |
| 2682 | */ |
| 2683 | bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; |
| 2684 | do_div(bw, (unsigned long)usecs); |
| 2685 | |
| 2686 | timeout = jiffies_to_msecs(bfqd->bfq_timeout); |
| 2687 | |
| 2688 | /* |
| 2689 | * Use only long (> 20ms) intervals to filter out spikes for |
| 2690 | * the peak rate estimation. |
| 2691 | */ |
| 2692 | if (usecs > 20000) { |
| 2693 | if (bw > bfqd->peak_rate) { |
| 2694 | bfqd->peak_rate = bw; |
| 2695 | update = 1; |
| 2696 | bfq_log(bfqd, "new peak_rate=%llu", bw); |
| 2697 | } |
| 2698 | |
| 2699 | update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; |
| 2700 | |
| 2701 | if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) |
| 2702 | bfqd->peak_rate_samples++; |
| 2703 | |
| 2704 | if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && |
| 2705 | update && bfqd->bfq_user_max_budget == 0) { |
| 2706 | bfqd->bfq_max_budget = |
| 2707 | bfq_calc_max_budget(bfqd->peak_rate, |
| 2708 | timeout); |
| 2709 | bfq_log(bfqd, "new max_budget=%d", |
| 2710 | bfqd->bfq_max_budget); |
| 2711 | } |
| 2712 | } |
| 2713 | |
| 2714 | /* |
| 2715 | * A process is considered ``slow'' (i.e., seeky, so that we |
| 2716 | * cannot treat it fairly in the service domain, as it would |
| 2717 | * slow down the other processes too much) if, when a slice |
| 2718 | * ends for whatever reason, it has received service at a |
| 2719 | * rate that would not be high enough to complete the budget |
| 2720 | * before the budget timeout expiration. |
| 2721 | */ |
| 2722 | expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; |
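|      | /* |
|      |  * expected is the number of sectors that could be transferred, at |
|      |  * the rate measured in this slice, over a full budget timeout (bw is |
|      |  * in fixed-point sectors/usec and timeout in ms, as in |
|      |  * bfq_calc_max_budget). |
|      |  */ |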
| 2723 | |
| 2724 | /* |
| 2725 | * Caveat: processes doing IO in the slower disk zones will |
| 2726 | * tend to be slow(er) even if not seeky. And the estimated |
| 2727 | * peak rate will actually be an average over the disk |
| 2728 | * surface. Hence, to not be too harsh with unlucky processes, |
| 2729 | * we keep a budget/3 margin of safety before declaring a |
| 2730 | * process slow. |
| 2731 | */ |
| 2732 | return expected > (4 * bfqq->entity.budget) / 3; |
| 2733 | } |
| 2734 | |
| 2735 | /* |
| 2736 | * Return the farthest past time instant according to jiffies |
| 2737 | * macros. |
| 2738 | */ |
| 2739 | static unsigned long bfq_smallest_from_now(void) |
| 2740 | { |
| 2741 | return jiffies - MAX_JIFFY_OFFSET; |
| 2742 | } |
| 2743 | |
| 2744 | /** |
| 2745 | * bfq_bfqq_expire - expire a queue. |
| 2746 | * @bfqd: device owning the queue. |
| 2747 | * @bfqq: the queue to expire. |
| 2748 | * @compensate: if true, compensate for the time spent idling. |
| 2749 | * @reason: the reason causing the expiration. |
| 2750 | * |
| 2751 | * |
| 2752 | * If the process associated with the queue is slow (i.e., seeky), or |
| 2753 | * in case of budget timeout, or, finally, if it is async, we |
| 2754 | * artificially charge it an entire budget (independently of the |
| 2755 | * actual service it received). As a consequence, the queue will get |
| 2756 | * higher timestamps than the correct ones upon reactivation, and |
| 2757 | * hence it will be rescheduled as if it had received more service |
| 2758 | * than what it actually received. In the end, this class of processes |
| 2759 | * will receive less service in proportion to how slowly they consume |
| 2760 | * their budgets (and hence how seriously they tend to lower the |
| 2761 | * throughput). |
| 2762 | * |
| 2763 | * In contrast, when a queue expires because it has been idling for |
| 2764 | * too much or because it exhausted its budget, we do not touch the |
| 2765 | * amount of service it has received. Hence when the queue will be |
| 2766 | * reactivated and its timestamps updated, the latter will be in sync |
| 2767 | * with the actual service received by the queue until expiration. |
| 2768 | * |
| 2769 | * Charging a full budget to the first type of queues and the exact |
| 2770 | * service to the others has the effect of using the WF2Q+ policy to |
| 2771 | * schedule the former on a timeslice basis, without violating the |
| 2772 | * service domain guarantees of the latter. |
| 2773 | */ |
| 2774 | static void bfq_bfqq_expire(struct bfq_data *bfqd, |
| 2775 | struct bfq_queue *bfqq, |
| 2776 | bool compensate, |
| 2777 | enum bfqq_expiration reason) |
| 2778 | { |
| 2779 | bool slow; |
| 2780 | int ref; |
| 2781 | |
| 2782 | /* |
| 2783 | * Update device peak rate for autotuning and check whether the |
| 2784 | * process is slow (see bfq_update_peak_rate). |
| 2785 | */ |
| 2786 | slow = bfq_update_peak_rate(bfqd, bfqq, compensate); |
| 2787 | |
| 2788 | /* |
| 2789 | * As explained above, 'punish' slow (i.e., seeky), timed-out |
| 2790 | * and async queues, to favor sequential sync workloads. |
| 2791 | */ |
| 2792 | if (slow || reason == BFQQE_BUDGET_TIMEOUT) |
| 2793 | bfq_bfqq_charge_full_budget(bfqq); |
| 2794 | |
| 2795 | if (reason == BFQQE_TOO_IDLE && |
| 2796 | bfqq->entity.service <= 2 * bfqq->entity.budget / 10) |
| 2797 | bfq_clear_bfqq_IO_bound(bfqq); |
| 2798 | |
| 2799 | bfq_log_bfqq(bfqd, bfqq, |
| 2800 | "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, |
| 2801 | slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); |
| 2802 | |
| 2803 | /* |
| 2804 | * Increase, decrease or leave budget unchanged according to |
| 2805 | * reason. |
| 2806 | */ |
| 2807 | __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); |
| 2808 | ref = bfqq->ref; |
| 2809 | __bfq_bfqq_expire(bfqd, bfqq); |
| 2810 | |
| 2811 | /* mark bfqq as waiting a request only if a bic still points to it */ |
| 2812 | if (ref > 1 && !bfq_bfqq_busy(bfqq) && |
| 2813 | reason != BFQQE_BUDGET_TIMEOUT && |
| 2814 | reason != BFQQE_BUDGET_EXHAUSTED) |
| 2815 | bfq_mark_bfqq_non_blocking_wait_rq(bfqq); |
| 2816 | } |
| 2817 | |
| 2818 | /* |
| 2819 | * Budget timeout is not implemented through a dedicated timer, but |
| 2820 | * just checked on request arrivals and completions, as well as on |
| 2821 | * idle timer expirations. |
| 2822 | */ |
| 2823 | static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) |
| 2824 | { |
| 2825 | if (bfq_bfqq_budget_new(bfqq) || |
| 2826 | time_is_after_jiffies(bfqq->budget_timeout)) |
| 2827 | return false; |
| 2828 | return true; |
| 2829 | } |
| 2830 | |
| 2831 | /* |
| 2832 | * If we expire a queue that is actively waiting (i.e., with the |
| 2833 | * device idled) for the arrival of a new request, then we may incur |
| 2834 | * the timestamp misalignment problem described in the body of the |
| 2835 | * function __bfq_activate_entity. Hence we return true only if this |
| 2836 | * condition does not hold, or if the queue is slow enough that just |
| 2837 | * kicking it off is the right choice to preserve a high throughput. |
| 2838 | */ |
| 2839 | static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) |
| 2840 | { |
| 2841 | bfq_log_bfqq(bfqq->bfqd, bfqq, |
| 2842 | "may_budget_timeout: wait_request %d left %d timeout %d", |
| 2843 | bfq_bfqq_wait_request(bfqq), |
| 2844 | bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, |
| 2845 | bfq_bfqq_budget_timeout(bfqq)); |
| 2846 | |
| 2847 | return (!bfq_bfqq_wait_request(bfqq) || |
| 2848 | bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) |
| 2849 | && |
| 2850 | bfq_bfqq_budget_timeout(bfqq); |
| 2851 | } |
| 2852 | |
| 2853 | /* |
| 2854 | * For a queue that becomes empty, device idling is allowed only if |
| 2855 | * this function returns true for the queue. And this function returns |
| 2856 | * true only if idling is beneficial for throughput. |
| 2857 | */ |
| 2858 | static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) |
| 2859 | { |
| 2860 | struct bfq_data *bfqd = bfqq->bfqd; |
| 2861 | bool idling_boosts_thr; |
| 2862 | |
| 2863 | if (bfqd->strict_guarantees) |
| 2864 | return true; |
| 2865 | |
| 2866 | /* |
| 2867 | * The value of the next variable is computed considering that |
| 2868 | * idling is usually beneficial for the throughput if: |
| 2869 | * (a) the device is not NCQ-capable, or |
| 2870 | * (b) regardless of the presence of NCQ, the request pattern |
| 2871 | * for bfqq is I/O-bound (possible throughput losses |
| 2872 | * caused by granting idling to seeky queues are mitigated |
| 2873 | * by the fact that, in all scenarios where boosting |
| 2874 | * throughput is the best thing to do, i.e., in all |
| 2875 | * symmetric scenarios, only a minimal idle time is |
| 2876 | * allowed to seeky queues). |
| 2877 | */ |
| 2878 | idling_boosts_thr = !bfqd->hw_tag || bfq_bfqq_IO_bound(bfqq); |
| 2879 | |
| 2880 | /* |
| 2881 | * We have now the components we need to compute the return |
| 2882 | * value of the function, which is true only if both the |
| 2883 | * following conditions hold: |
 | 2884 | * 1) bfqq is sync, because idling makes sense only for sync queues; |
| 2885 | * 2) idling boosts the throughput. |
| 2886 | */ |
| 2887 | return bfq_bfqq_sync(bfqq) && idling_boosts_thr; |
| 2888 | } |
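/*
 * A concrete reading of the expression above (illustrative only): on a
 * device without command queueing (hw_tag == 0) any sync queue is allowed
 * to idle, whereas on an NCQ-capable device only sync queues still marked
 * as IO-bound are; async queues never idle. If strict_guarantees is set,
 * idling is granted unconditionally.
 */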
| 2889 | |
| 2890 | /* |
| 2891 | * If the in-service queue is empty but the function bfq_bfqq_may_idle |
| 2892 | * returns true, then: |
| 2893 | * 1) the queue must remain in service and cannot be expired, and |
| 2894 | * 2) the device must be idled to wait for the possible arrival of a new |
| 2895 | * request for the queue. |
| 2896 | * See the comments on the function bfq_bfqq_may_idle for the reasons |
| 2897 | * why performing device idling is the best choice to boost the throughput |
| 2898 | * and preserve service guarantees when bfq_bfqq_may_idle itself |
| 2899 | * returns true. |
| 2900 | */ |
| 2901 | static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) |
| 2902 | { |
| 2903 | struct bfq_data *bfqd = bfqq->bfqd; |
| 2904 | |
| 2905 | return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && |
| 2906 | bfq_bfqq_may_idle(bfqq); |
| 2907 | } |
| 2908 | |
| 2909 | /* |
| 2910 | * Select a queue for service. If we have a current queue in service, |
| 2911 | * check whether to continue servicing it, or retrieve and set a new one. |
| 2912 | */ |
| 2913 | static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) |
| 2914 | { |
| 2915 | struct bfq_queue *bfqq; |
| 2916 | struct request *next_rq; |
| 2917 | enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; |
| 2918 | |
| 2919 | bfqq = bfqd->in_service_queue; |
| 2920 | if (!bfqq) |
| 2921 | goto new_queue; |
| 2922 | |
| 2923 | bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); |
| 2924 | |
| 2925 | if (bfq_may_expire_for_budg_timeout(bfqq) && |
| 2926 | !bfq_bfqq_wait_request(bfqq) && |
| 2927 | !bfq_bfqq_must_idle(bfqq)) |
| 2928 | goto expire; |
| 2929 | |
| 2930 | check_queue: |
| 2931 | /* |
| 2932 | * This loop is rarely executed more than once. Even when it |
| 2933 | * happens, it is much more convenient to re-execute this loop |
| 2934 | * than to return NULL and trigger a new dispatch to get a |
| 2935 | * request served. |
| 2936 | */ |
| 2937 | next_rq = bfqq->next_rq; |
| 2938 | /* |
| 2939 | * If bfqq has requests queued and it has enough budget left to |
| 2940 | * serve them, keep the queue, otherwise expire it. |
| 2941 | */ |
| 2942 | if (next_rq) { |
| 2943 | if (bfq_serv_to_charge(next_rq, bfqq) > |
| 2944 | bfq_bfqq_budget_left(bfqq)) { |
| 2945 | /* |
| 2946 | * Expire the queue for budget exhaustion, |
| 2947 | * which makes sure that the next budget is |
| 2948 | * enough to serve the next request, even if |
| 2949 | * it comes from the fifo expired path. |
| 2950 | */ |
| 2951 | reason = BFQQE_BUDGET_EXHAUSTED; |
| 2952 | goto expire; |
| 2953 | } else { |
| 2954 | /* |
| 2955 | * The idle timer may be pending because we may |
| 2956 | * not disable disk idling even when a new request |
| 2957 | * arrives. |
| 2958 | */ |
| 2959 | if (bfq_bfqq_wait_request(bfqq)) { |
| 2960 | /* |
 | 2961 | * If we get here: 1) at least one new request |
 | 2962 | * has arrived but we have not disabled the |
 | 2963 | * timer because the request was too small, |
 | 2964 | * and 2) the block layer has then unplugged |
 | 2965 | * the device, causing the dispatch to be |
 | 2966 | * invoked. |
| 2967 | * |
| 2968 | * Since the device is unplugged, now the |
| 2969 | * requests are probably large enough to |
| 2970 | * provide a reasonable throughput. |
| 2971 | * So we disable idling. |
| 2972 | */ |
| 2973 | bfq_clear_bfqq_wait_request(bfqq); |
| 2974 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); |
| 2975 | } |
| 2976 | goto keep_queue; |
| 2977 | } |
| 2978 | } |
| 2979 | |
| 2980 | /* |
 | 2981 | * No requests pending. However, if the in-service queue is idling for |
 | 2982 | * a new request, or has requests waiting for a completion and may idle |
 | 2983 | * after their completion, then keep it in service without dispatching. |
| 2984 | */ |
| 2985 | if (bfq_bfqq_wait_request(bfqq) || |
| 2986 | (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { |
| 2987 | bfqq = NULL; |
| 2988 | goto keep_queue; |
| 2989 | } |
| 2990 | |
| 2991 | reason = BFQQE_NO_MORE_REQUESTS; |
| 2992 | expire: |
| 2993 | bfq_bfqq_expire(bfqd, bfqq, false, reason); |
| 2994 | new_queue: |
| 2995 | bfqq = bfq_set_in_service_queue(bfqd); |
| 2996 | if (bfqq) { |
| 2997 | bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); |
| 2998 | goto check_queue; |
| 2999 | } |
| 3000 | keep_queue: |
| 3001 | if (bfqq) |
| 3002 | bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); |
| 3003 | else |
| 3004 | bfq_log(bfqd, "select_queue: no queue returned"); |
| 3005 | |
| 3006 | return bfqq; |
| 3007 | } |
| 3008 | |
| 3009 | /* |
| 3010 | * Dispatch next request from bfqq. |
| 3011 | */ |
| 3012 | static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, |
| 3013 | struct bfq_queue *bfqq) |
| 3014 | { |
| 3015 | struct request *rq = bfqq->next_rq; |
| 3016 | unsigned long service_to_charge; |
| 3017 | |
| 3018 | service_to_charge = bfq_serv_to_charge(rq, bfqq); |
| 3019 | |
| 3020 | bfq_bfqq_served(bfqq, service_to_charge); |
| 3021 | |
| 3022 | bfq_dispatch_remove(bfqd->queue, rq); |
| 3023 | |
| 3024 | if (!bfqd->in_service_bic) { |
| 3025 | atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); |
| 3026 | bfqd->in_service_bic = RQ_BIC(rq); |
| 3027 | } |
| 3028 | |
| 3029 | /* |
| 3030 | * Expire bfqq, pretending that its budget expired, if bfqq |
| 3031 | * belongs to CLASS_IDLE and other queues are waiting for |
| 3032 | * service. |
| 3033 | */ |
| 3034 | if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) |
| 3035 | goto expire; |
| 3036 | |
| 3037 | return rq; |
| 3038 | |
| 3039 | expire: |
| 3040 | bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); |
| 3041 | return rq; |
| 3042 | } |
| 3043 | |
| 3044 | static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) |
| 3045 | { |
| 3046 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; |
| 3047 | |
| 3048 | /* |
 | 3049 | * Avoiding the lock: a race on bfqd->busy_queues should at most |
 | 3050 | * cause a useless call to dispatch. |
| 3051 | */ |
| 3052 | return !list_empty_careful(&bfqd->dispatch) || |
| 3053 | bfqd->busy_queues > 0; |
| 3054 | } |
| 3055 | |
| 3056 | static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) |
| 3057 | { |
| 3058 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; |
| 3059 | struct request *rq = NULL; |
| 3060 | struct bfq_queue *bfqq = NULL; |
| 3061 | |
| 3062 | if (!list_empty(&bfqd->dispatch)) { |
| 3063 | rq = list_first_entry(&bfqd->dispatch, struct request, |
| 3064 | queuelist); |
| 3065 | list_del_init(&rq->queuelist); |
| 3066 | |
| 3067 | bfqq = RQ_BFQQ(rq); |
| 3068 | |
| 3069 | if (bfqq) { |
| 3070 | /* |
| 3071 | * Increment counters here, because this |
| 3072 | * dispatch does not follow the standard |
| 3073 | * dispatch flow (where counters are |
| 3074 | * incremented) |
| 3075 | */ |
| 3076 | bfqq->dispatched++; |
| 3077 | |
| 3078 | goto inc_in_driver_start_rq; |
| 3079 | } |
| 3080 | |
| 3081 | /* |
| 3082 | * We exploit the put_rq_private hook to decrement |
| 3083 | * rq_in_driver, but put_rq_private will not be |
| 3084 | * invoked on this request. So, to avoid unbalance, |
| 3085 | * just start this request, without incrementing |
| 3086 | * rq_in_driver. As a negative consequence, |
| 3087 | * rq_in_driver is deceptively lower than it should be |
| 3088 | * while this request is in service. This may cause |
| 3089 | * bfq_schedule_dispatch to be invoked uselessly. |
| 3090 | * |
| 3091 | * As for implementing an exact solution, the |
| 3092 | * put_request hook, if defined, is probably invoked |
| 3093 | * also on this request. So, by exploiting this hook, |
| 3094 | * we could 1) increment rq_in_driver here, and 2) |
| 3095 | * decrement it in put_request. Such a solution would |
| 3096 | * let the value of the counter be always accurate, |
| 3097 | * but it would entail using an extra interface |
| 3098 | * function. This cost seems higher than the benefit, |
 | 3099 | * given that non-elevator-private requests are very |
 | 3100 | * infrequent. |
| 3101 | */ |
| 3102 | goto start_rq; |
| 3103 | } |
| 3104 | |
| 3105 | bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); |
| 3106 | |
| 3107 | if (bfqd->busy_queues == 0) |
| 3108 | goto exit; |
| 3109 | |
| 3110 | /* |
| 3111 | * Force device to serve one request at a time if |
| 3112 | * strict_guarantees is true. Forcing this service scheme is |
| 3113 | * currently the ONLY way to guarantee that the request |
| 3114 | * service order enforced by the scheduler is respected by a |
| 3115 | * queueing device. Otherwise the device is free even to make |
| 3116 | * some unlucky request wait for as long as the device |
| 3117 | * wishes. |
| 3118 | * |
 | 3119 | * Of course, serving one request at a time may cause loss of |
| 3120 | * throughput. |
| 3121 | */ |
| 3122 | if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) |
| 3123 | goto exit; |
| 3124 | |
| 3125 | bfqq = bfq_select_queue(bfqd); |
| 3126 | if (!bfqq) |
| 3127 | goto exit; |
| 3128 | |
| 3129 | rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); |
| 3130 | |
| 3131 | if (rq) { |
| 3132 | inc_in_driver_start_rq: |
| 3133 | bfqd->rq_in_driver++; |
| 3134 | start_rq: |
| 3135 | rq->rq_flags |= RQF_STARTED; |
| 3136 | } |
| 3137 | exit: |
| 3138 | return rq; |
| 3139 | } |
| 3140 | |
| 3141 | static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) |
| 3142 | { |
| 3143 | struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; |
| 3144 | struct request *rq; |
| 3145 | |
| 3146 | spin_lock_irq(&bfqd->lock); |
| 3147 | rq = __bfq_dispatch_request(hctx); |
| 3148 | spin_unlock_irq(&bfqd->lock); |
| 3149 | |
| 3150 | return rq; |
| 3151 | } |
| 3152 | |
| 3153 | /* |
| 3154 | * Task holds one reference to the queue, dropped when task exits. Each rq |
| 3155 | * in-flight on this queue also holds a reference, dropped when rq is freed. |
| 3156 | * |
| 3157 | * Scheduler lock must be held here. Recall not to use bfqq after calling |
| 3158 | * this function on it. |
| 3159 | */ |
| 3160 | static void bfq_put_queue(struct bfq_queue *bfqq) |
| 3161 | { |
| 3162 | if (bfqq->bfqd) |
| 3163 | bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", |
| 3164 | bfqq, bfqq->ref); |
| 3165 | |
| 3166 | bfqq->ref--; |
| 3167 | if (bfqq->ref) |
| 3168 | return; |
| 3169 | |
| 3170 | kmem_cache_free(bfq_pool, bfqq); |
| 3171 | } |
| 3172 | |
| 3173 | static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) |
| 3174 | { |
| 3175 | if (bfqq == bfqd->in_service_queue) { |
| 3176 | __bfq_bfqq_expire(bfqd, bfqq); |
| 3177 | bfq_schedule_dispatch(bfqd); |
| 3178 | } |
| 3179 | |
| 3180 | bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); |
| 3181 | |
| 3182 | bfq_put_queue(bfqq); /* release process reference */ |
| 3183 | } |
| 3184 | |
| 3185 | static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) |
| 3186 | { |
| 3187 | struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); |
| 3188 | struct bfq_data *bfqd; |
| 3189 | |
| 3190 | if (bfqq) |
| 3191 | bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ |
| 3192 | |
| 3193 | if (bfqq && bfqd) { |
| 3194 | unsigned long flags; |
| 3195 | |
| 3196 | spin_lock_irqsave(&bfqd->lock, flags); |
| 3197 | bfq_exit_bfqq(bfqd, bfqq); |
| 3198 | bic_set_bfqq(bic, NULL, is_sync); |
 | 3199 | spin_unlock_irqrestore(&bfqd->lock, flags); |
| 3200 | } |
| 3201 | } |
| 3202 | |
| 3203 | static void bfq_exit_icq(struct io_cq *icq) |
| 3204 | { |
| 3205 | struct bfq_io_cq *bic = icq_to_bic(icq); |
| 3206 | |
| 3207 | bfq_exit_icq_bfqq(bic, true); |
| 3208 | bfq_exit_icq_bfqq(bic, false); |
| 3209 | } |
| 3210 | |
| 3211 | /* |
| 3212 | * Update the entity prio values; note that the new values will not |
| 3213 | * be used until the next (re)activation. |
| 3214 | */ |
| 3215 | static void |
| 3216 | bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) |
| 3217 | { |
| 3218 | struct task_struct *tsk = current; |
| 3219 | int ioprio_class; |
| 3220 | struct bfq_data *bfqd = bfqq->bfqd; |
| 3221 | |
| 3222 | if (!bfqd) |
| 3223 | return; |
| 3224 | |
| 3225 | ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
| 3226 | switch (ioprio_class) { |
| 3227 | default: |
| 3228 | dev_err(bfqq->bfqd->queue->backing_dev_info->dev, |
| 3229 | "bfq: bad prio class %d\n", ioprio_class); |
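		/* fall through */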
| 3230 | case IOPRIO_CLASS_NONE: |
| 3231 | /* |
| 3232 | * No prio set, inherit CPU scheduling settings. |
| 3233 | */ |
| 3234 | bfqq->new_ioprio = task_nice_ioprio(tsk); |
| 3235 | bfqq->new_ioprio_class = task_nice_ioclass(tsk); |
| 3236 | break; |
| 3237 | case IOPRIO_CLASS_RT: |
| 3238 | bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
| 3239 | bfqq->new_ioprio_class = IOPRIO_CLASS_RT; |
| 3240 | break; |
| 3241 | case IOPRIO_CLASS_BE: |
| 3242 | bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
| 3243 | bfqq->new_ioprio_class = IOPRIO_CLASS_BE; |
| 3244 | break; |
| 3245 | case IOPRIO_CLASS_IDLE: |
| 3246 | bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; |
| 3247 | bfqq->new_ioprio = 7; |
| 3248 | bfq_clear_bfqq_idle_window(bfqq); |
| 3249 | break; |
| 3250 | } |
| 3251 | |
| 3252 | if (bfqq->new_ioprio >= IOPRIO_BE_NR) { |
 | 3253 | pr_crit("bfq_set_next_ioprio_data: new_ioprio %d out of range\n", |
 | 3254 | bfqq->new_ioprio); |
 | 3255 | bfqq->new_ioprio = IOPRIO_BE_NR - 1; |
| 3256 | } |
| 3257 | |
| 3258 | bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); |
| 3259 | bfqq->entity.prio_changed = 1; |
| 3260 | } |
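/*
 * Example of the mapping performed above (illustrative; the exact weight
 * values depend on bfq_ioprio_to_weight()): a process started with
 * "ionice -c 2 -n 0 <cmd>" gets IOPRIO_CLASS_BE at level 0, the highest
 * best-effort priority, and thus the largest best-effort weight for its
 * queues. Only new_ioprio/new_ioprio_class/new_weight are set here, so the
 * change takes effect at the next (re)activation of the queue.
 */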
| 3261 | |
| 3262 | static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) |
| 3263 | { |
| 3264 | struct bfq_data *bfqd = bic_to_bfqd(bic); |
| 3265 | struct bfq_queue *bfqq; |
| 3266 | int ioprio = bic->icq.ioc->ioprio; |
| 3267 | |
 | 3268 | /* |
 | 3269 | * This condition may trigger on a newly created bic, or on a bic |
 | 3270 | * whose ioprio has not changed: in both cases there is nothing to do. |
 | 3271 | */ |
| 3272 | if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) |
| 3273 | return; |
| 3274 | |
| 3275 | bic->ioprio = ioprio; |
| 3276 | |
| 3277 | bfqq = bic_to_bfqq(bic, false); |
| 3278 | if (bfqq) { |
| 3279 | /* release process reference on this queue */ |
| 3280 | bfq_put_queue(bfqq); |
| 3281 | bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); |
| 3282 | bic_set_bfqq(bic, bfqq, false); |
| 3283 | } |
| 3284 | |
| 3285 | bfqq = bic_to_bfqq(bic, true); |
| 3286 | if (bfqq) |
| 3287 | bfq_set_next_ioprio_data(bfqq, bic); |
| 3288 | } |
| 3289 | |
| 3290 | static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 3291 | struct bfq_io_cq *bic, pid_t pid, int is_sync) |
| 3292 | { |
| 3293 | RB_CLEAR_NODE(&bfqq->entity.rb_node); |
| 3294 | INIT_LIST_HEAD(&bfqq->fifo); |
| 3295 | |
| 3296 | bfqq->ref = 0; |
| 3297 | bfqq->bfqd = bfqd; |
| 3298 | |
| 3299 | if (bic) |
| 3300 | bfq_set_next_ioprio_data(bfqq, bic); |
| 3301 | |
| 3302 | if (is_sync) { |
| 3303 | if (!bfq_class_idle(bfqq)) |
| 3304 | bfq_mark_bfqq_idle_window(bfqq); |
| 3305 | bfq_mark_bfqq_sync(bfqq); |
| 3306 | } else |
| 3307 | bfq_clear_bfqq_sync(bfqq); |
| 3308 | |
| 3309 | /* set end request to minus infinity from now */ |
| 3310 | bfqq->ttime.last_end_request = ktime_get_ns() + 1; |
| 3311 | |
| 3312 | bfq_mark_bfqq_IO_bound(bfqq); |
| 3313 | |
| 3314 | bfqq->pid = pid; |
| 3315 | |
 | 3316 | /* Tentative initial value, to trade off between throughput and latency */ |
| 3317 | bfqq->max_budget = bfq_default_budget(bfqd, bfqq); |
| 3318 | bfqq->budget_timeout = bfq_smallest_from_now(); |
| 3320 | |
| 3321 | /* first request is almost certainly seeky */ |
| 3322 | bfqq->seek_history = 1; |
| 3323 | } |
| 3324 | |
| 3325 | static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, |
| 3326 | int ioprio_class, int ioprio) |
| 3327 | { |
| 3328 | switch (ioprio_class) { |
| 3329 | case IOPRIO_CLASS_RT: |
| 3330 | return &async_bfqq[0][ioprio]; |
| 3331 | case IOPRIO_CLASS_NONE: |
| 3332 | ioprio = IOPRIO_NORM; |
| 3333 | /* fall through */ |
| 3334 | case IOPRIO_CLASS_BE: |
| 3335 | return &async_bfqq[1][ioprio]; |
| 3336 | case IOPRIO_CLASS_IDLE: |
| 3337 | return &async_idle_bfqq; |
| 3338 | default: |
| 3339 | return NULL; |
| 3340 | } |
| 3341 | } |
| 3342 | |
| 3343 | static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, |
| 3344 | struct bio *bio, bool is_sync, |
| 3345 | struct bfq_io_cq *bic) |
| 3346 | { |
| 3347 | const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); |
| 3348 | const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); |
| 3349 | struct bfq_queue **async_bfqq = NULL; |
| 3350 | struct bfq_queue *bfqq; |
| 3351 | |
| 3352 | rcu_read_lock(); |
| 3353 | |
| 3354 | if (!is_sync) { |
| 3355 | async_bfqq = bfq_async_queue_prio(bfqd, ioprio_class, |
| 3356 | ioprio); |
| 3357 | bfqq = *async_bfqq; |
| 3358 | if (bfqq) |
| 3359 | goto out; |
| 3360 | } |
| 3361 | |
| 3362 | bfqq = kmem_cache_alloc_node(bfq_pool, |
| 3363 | GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, |
| 3364 | bfqd->queue->node); |
| 3365 | |
| 3366 | if (bfqq) { |
| 3367 | bfq_init_bfqq(bfqd, bfqq, bic, current->pid, |
| 3368 | is_sync); |
| 3369 | bfq_init_entity(&bfqq->entity); |
| 3370 | bfq_log_bfqq(bfqd, bfqq, "allocated"); |
| 3371 | } else { |
| 3372 | bfqq = &bfqd->oom_bfqq; |
| 3373 | bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); |
| 3374 | goto out; |
| 3375 | } |
| 3376 | |
| 3377 | /* |
| 3378 | * Pin the queue now that it's allocated, scheduler exit will |
| 3379 | * prune it. |
| 3380 | */ |
| 3381 | if (async_bfqq) { |
| 3382 | bfqq->ref++; |
| 3383 | bfq_log_bfqq(bfqd, bfqq, |
| 3384 | "get_queue, bfqq not in async: %p, %d", |
| 3385 | bfqq, bfqq->ref); |
| 3386 | *async_bfqq = bfqq; |
| 3387 | } |
| 3388 | |
| 3389 | out: |
| 3390 | bfqq->ref++; /* get a process reference to this queue */ |
| 3391 | bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); |
| 3392 | rcu_read_unlock(); |
| 3393 | return bfqq; |
| 3394 | } |
| 3395 | |
| 3396 | static void bfq_update_io_thinktime(struct bfq_data *bfqd, |
| 3397 | struct bfq_queue *bfqq) |
| 3398 | { |
| 3399 | struct bfq_ttime *ttime = &bfqq->ttime; |
| 3400 | u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; |
| 3401 | |
| 3402 | elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); |
| 3403 | |
| 3404 | ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; |
| 3405 | ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); |
| 3406 | ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, |
| 3407 | ttime->ttime_samples); |
| 3408 | } |
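/*
 * Worked example of the fixed-point averaging above (illustrative
 * arithmetic, not additional behavior). Starting from ttime_samples == 0
 * and ttime_total == 0, a first inter-request gap of 2 ms gives:
 *   samples = (7*0 + 256) / 8             = 32
 *   total   = (7*0 + 256 * 2,000,000) / 8 = 64,000,000 ns
 *   mean    = (64,000,000 + 128) / 32    ~= 2 ms
 * A second gap of 4 ms then gives samples = 60, total = 184,000,000 ns and
 * mean ~= 3.07 ms: recent samples weigh more, and ttime_samples converges
 * to the fixed point 256.
 */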
| 3409 | |
| 3410 | static void |
| 3411 | bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 3412 | struct request *rq) |
| 3413 | { |
| 3414 | sector_t sdist = 0; |
| 3415 | |
| 3416 | if (bfqq->last_request_pos) { |
| 3417 | if (bfqq->last_request_pos < blk_rq_pos(rq)) |
| 3418 | sdist = blk_rq_pos(rq) - bfqq->last_request_pos; |
| 3419 | else |
| 3420 | sdist = bfqq->last_request_pos - blk_rq_pos(rq); |
| 3421 | } |
| 3422 | |
| 3423 | bfqq->seek_history <<= 1; |
| 3424 | bfqq->seek_history |= sdist > BFQQ_SEEK_THR && |
| 3425 | (!blk_queue_nonrot(bfqd->queue) || |
| 3426 | blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); |
| 3427 | } |
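/*
 * In other words (illustrative summary): seek_history acts as a shift
 * register of per-request "seeky" bits. A bit is set when the distance
 * from the previous request exceeds BFQQ_SEEK_THR, except that on
 * non-rotational devices large requests (blk_rq_sectors(rq) >=
 * BFQQ_SECT_THR_NONROT) are never counted as seeky. BFQQ_SEEKY() then
 * presumably looks at how many recent bits are set, so a single random
 * request does not immediately reclassify a mostly sequential queue.
 */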
| 3428 | |
| 3429 | /* |
| 3430 | * Disable idle window if the process thinks too long or seeks so much that |
| 3431 | * it doesn't matter. |
| 3432 | */ |
| 3433 | static void bfq_update_idle_window(struct bfq_data *bfqd, |
| 3434 | struct bfq_queue *bfqq, |
| 3435 | struct bfq_io_cq *bic) |
| 3436 | { |
| 3437 | int enable_idle; |
| 3438 | |
| 3439 | /* Don't idle for async or idle io prio class. */ |
| 3440 | if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) |
| 3441 | return; |
| 3442 | |
| 3443 | enable_idle = bfq_bfqq_idle_window(bfqq); |
| 3444 | |
| 3445 | if (atomic_read(&bic->icq.ioc->active_ref) == 0 || |
| 3446 | bfqd->bfq_slice_idle == 0 || |
| 3447 | (bfqd->hw_tag && BFQQ_SEEKY(bfqq))) |
| 3448 | enable_idle = 0; |
| 3449 | else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) { |
| 3450 | if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle) |
| 3451 | enable_idle = 0; |
| 3452 | else |
| 3453 | enable_idle = 1; |
| 3454 | } |
| 3455 | bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", |
| 3456 | enable_idle); |
| 3457 | |
| 3458 | if (enable_idle) |
| 3459 | bfq_mark_bfqq_idle_window(bfqq); |
| 3460 | else |
| 3461 | bfq_clear_bfqq_idle_window(bfqq); |
| 3462 | } |
| 3463 | |
| 3464 | /* |
| 3465 | * Called when a new fs request (rq) is added to bfqq. Check if there's |
| 3466 | * something we should do about it. |
| 3467 | */ |
| 3468 | static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, |
| 3469 | struct request *rq) |
| 3470 | { |
| 3471 | struct bfq_io_cq *bic = RQ_BIC(rq); |
| 3472 | |
| 3473 | if (rq->cmd_flags & REQ_META) |
| 3474 | bfqq->meta_pending++; |
| 3475 | |
| 3476 | bfq_update_io_thinktime(bfqd, bfqq); |
| 3477 | bfq_update_io_seektime(bfqd, bfqq, rq); |
| 3478 | if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || |
| 3479 | !BFQQ_SEEKY(bfqq)) |
| 3480 | bfq_update_idle_window(bfqd, bfqq, bic); |
| 3481 | |
| 3482 | bfq_log_bfqq(bfqd, bfqq, |
| 3483 | "rq_enqueued: idle_window=%d (seeky %d)", |
| 3484 | bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); |
| 3485 | |
| 3486 | bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); |
| 3487 | |
| 3488 | if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { |
| 3489 | bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && |
| 3490 | blk_rq_sectors(rq) < 32; |
| 3491 | bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); |
| 3492 | |
| 3493 | /* |
| 3494 | * There is just this request queued: if the request |
| 3495 | * is small and the queue is not to be expired, then |
| 3496 | * just exit. |
| 3497 | * |
| 3498 | * In this way, if the device is being idled to wait |
| 3499 | * for a new request from the in-service queue, we |
| 3500 | * avoid unplugging the device and committing the |
| 3501 | * device to serve just a small request. On the |
| 3502 | * contrary, we wait for the block layer to decide |
| 3503 | * when to unplug the device: hopefully, new requests |
| 3504 | * will be merged to this one quickly, then the device |
| 3505 | * will be unplugged and larger requests will be |
| 3506 | * dispatched. |
| 3507 | */ |
| 3508 | if (small_req && !budget_timeout) |
| 3509 | return; |
| 3510 | |
| 3511 | /* |
| 3512 | * A large enough request arrived, or the queue is to |
| 3513 | * be expired: in both cases disk idling is to be |
| 3514 | * stopped, so clear wait_request flag and reset |
| 3515 | * timer. |
| 3516 | */ |
| 3517 | bfq_clear_bfqq_wait_request(bfqq); |
| 3518 | hrtimer_try_to_cancel(&bfqd->idle_slice_timer); |
| 3519 | |
| 3520 | /* |
| 3521 | * The queue is not empty, because a new request just |
| 3522 | * arrived. Hence we can safely expire the queue, in |
| 3523 | * case of budget timeout, without risking that the |
| 3524 | * timestamps of the queue are not updated correctly. |
| 3525 | * See [1] for more details. |
| 3526 | */ |
| 3527 | if (budget_timeout) |
| 3528 | bfq_bfqq_expire(bfqd, bfqq, false, |
| 3529 | BFQQE_BUDGET_TIMEOUT); |
| 3530 | } |
| 3531 | } |
| 3532 | |
| 3533 | static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) |
| 3534 | { |
| 3535 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
| 3536 | |
| 3537 | bfq_add_request(rq); |
| 3538 | |
| 3539 | rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; |
| 3540 | list_add_tail(&rq->queuelist, &bfqq->fifo); |
| 3541 | |
| 3542 | bfq_rq_enqueued(bfqd, bfqq, rq); |
| 3543 | } |
| 3544 | |
| 3545 | static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, |
| 3546 | bool at_head) |
| 3547 | { |
| 3548 | struct request_queue *q = hctx->queue; |
| 3549 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 3550 | |
| 3551 | spin_lock_irq(&bfqd->lock); |
| 3552 | if (blk_mq_sched_try_insert_merge(q, rq)) { |
| 3553 | spin_unlock_irq(&bfqd->lock); |
| 3554 | return; |
| 3555 | } |
| 3556 | |
| 3557 | spin_unlock_irq(&bfqd->lock); |
| 3558 | |
| 3559 | blk_mq_sched_request_inserted(rq); |
| 3560 | |
| 3561 | spin_lock_irq(&bfqd->lock); |
| 3562 | if (at_head || blk_rq_is_passthrough(rq)) { |
| 3563 | if (at_head) |
| 3564 | list_add(&rq->queuelist, &bfqd->dispatch); |
| 3565 | else |
| 3566 | list_add_tail(&rq->queuelist, &bfqd->dispatch); |
| 3567 | } else { |
| 3568 | __bfq_insert_request(bfqd, rq); |
| 3569 | |
| 3570 | if (rq_mergeable(rq)) { |
| 3571 | elv_rqhash_add(q, rq); |
| 3572 | if (!q->last_merge) |
| 3573 | q->last_merge = rq; |
| 3574 | } |
| 3575 | } |
| 3576 | |
| 3577 | spin_unlock_irq(&bfqd->lock); |
| 3578 | } |
| 3579 | |
| 3580 | static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, |
| 3581 | struct list_head *list, bool at_head) |
| 3582 | { |
| 3583 | while (!list_empty(list)) { |
| 3584 | struct request *rq; |
| 3585 | |
| 3586 | rq = list_first_entry(list, struct request, queuelist); |
| 3587 | list_del_init(&rq->queuelist); |
| 3588 | bfq_insert_request(hctx, rq, at_head); |
| 3589 | } |
| 3590 | } |
| 3591 | |
| 3592 | static void bfq_update_hw_tag(struct bfq_data *bfqd) |
| 3593 | { |
| 3594 | bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, |
| 3595 | bfqd->rq_in_driver); |
| 3596 | |
| 3597 | if (bfqd->hw_tag == 1) |
| 3598 | return; |
| 3599 | |
| 3600 | /* |
| 3601 | * This sample is valid if the number of outstanding requests |
| 3602 | * is large enough to allow a queueing behavior. Note that the |
| 3603 | * sum is not exact, as it's not taking into account deactivated |
| 3604 | * requests. |
| 3605 | */ |
| 3606 | if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) |
| 3607 | return; |
| 3608 | |
| 3609 | if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) |
| 3610 | return; |
| 3611 | |
| 3612 | bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; |
| 3613 | bfqd->max_rq_in_driver = 0; |
| 3614 | bfqd->hw_tag_samples = 0; |
| 3615 | } |
| 3616 | |
| 3617 | static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) |
| 3618 | { |
| 3619 | bfq_update_hw_tag(bfqd); |
| 3620 | |
| 3621 | bfqd->rq_in_driver--; |
| 3622 | bfqq->dispatched--; |
| 3623 | |
| 3624 | bfqq->ttime.last_end_request = ktime_get_ns(); |
| 3625 | |
| 3626 | /* |
| 3627 | * If this is the in-service queue, check if it needs to be expired, |
| 3628 | * or if we want to idle in case it has no pending requests. |
| 3629 | */ |
| 3630 | if (bfqd->in_service_queue == bfqq) { |
| 3631 | if (bfq_bfqq_budget_new(bfqq)) |
| 3632 | bfq_set_budget_timeout(bfqd); |
| 3633 | |
| 3634 | if (bfq_bfqq_must_idle(bfqq)) { |
| 3635 | bfq_arm_slice_timer(bfqd); |
| 3636 | return; |
| 3637 | } else if (bfq_may_expire_for_budg_timeout(bfqq)) |
| 3638 | bfq_bfqq_expire(bfqd, bfqq, false, |
| 3639 | BFQQE_BUDGET_TIMEOUT); |
| 3640 | else if (RB_EMPTY_ROOT(&bfqq->sort_list) && |
| 3641 | (bfqq->dispatched == 0 || |
| 3642 | !bfq_bfqq_may_idle(bfqq))) |
| 3643 | bfq_bfqq_expire(bfqd, bfqq, false, |
| 3644 | BFQQE_NO_MORE_REQUESTS); |
| 3645 | } |
| 3646 | } |
| 3647 | |
| 3648 | static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) |
| 3649 | { |
| 3650 | bfqq->allocated--; |
| 3651 | |
| 3652 | bfq_put_queue(bfqq); |
| 3653 | } |
| 3654 | |
| 3655 | static void bfq_put_rq_private(struct request_queue *q, struct request *rq) |
| 3656 | { |
| 3657 | struct bfq_queue *bfqq = RQ_BFQQ(rq); |
| 3658 | struct bfq_data *bfqd = bfqq->bfqd; |
| 3659 | |
| 3660 | |
| 3661 | if (likely(rq->rq_flags & RQF_STARTED)) { |
| 3662 | unsigned long flags; |
| 3663 | |
| 3664 | spin_lock_irqsave(&bfqd->lock, flags); |
| 3665 | |
| 3666 | bfq_completed_request(bfqq, bfqd); |
| 3667 | bfq_put_rq_priv_body(bfqq); |
| 3668 | |
| 3669 | spin_unlock_irqrestore(&bfqd->lock, flags); |
| 3670 | } else { |
| 3671 | /* |
| 3672 | * Request rq may be still/already in the scheduler, |
| 3673 | * in which case we need to remove it. And we cannot |
| 3674 | * defer such a check and removal, to avoid |
| 3675 | * inconsistencies in the time interval from the end |
| 3676 | * of this function to the start of the deferred work. |
| 3677 | * This situation seems to occur only in process |
| 3678 | * context, as a consequence of a merge. In the |
| 3679 | * current version of the code, this implies that the |
| 3680 | * lock is held. |
| 3681 | */ |
| 3682 | |
| 3683 | if (!RB_EMPTY_NODE(&rq->rb_node)) |
| 3684 | bfq_remove_request(q, rq); |
| 3685 | bfq_put_rq_priv_body(bfqq); |
| 3686 | } |
| 3687 | |
| 3688 | rq->elv.priv[0] = NULL; |
| 3689 | rq->elv.priv[1] = NULL; |
| 3690 | } |
| 3691 | |
| 3692 | /* |
| 3693 | * Allocate bfq data structures associated with this request. |
| 3694 | */ |
| 3695 | static int bfq_get_rq_private(struct request_queue *q, struct request *rq, |
| 3696 | struct bio *bio) |
| 3697 | { |
| 3698 | struct bfq_data *bfqd = q->elevator->elevator_data; |
| 3699 | struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); |
| 3700 | const int is_sync = rq_is_sync(rq); |
| 3701 | struct bfq_queue *bfqq; |
| 3702 | |
| 3703 | spin_lock_irq(&bfqd->lock); |
| 3704 | |
 | 3705 | if (!bic) |
 | 3706 | goto queue_fail; |
 | 3707 | |
 | 3708 | bfq_check_ioprio_change(bic, bio); |
| 3709 | |
| 3710 | bfqq = bic_to_bfqq(bic, is_sync); |
| 3711 | if (!bfqq || bfqq == &bfqd->oom_bfqq) { |
| 3712 | if (bfqq) |
| 3713 | bfq_put_queue(bfqq); |
| 3714 | bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); |
| 3715 | bic_set_bfqq(bic, bfqq, is_sync); |
| 3716 | } |
| 3717 | |
| 3718 | bfqq->allocated++; |
| 3719 | bfqq->ref++; |
| 3720 | bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", |
| 3721 | rq, bfqq, bfqq->ref); |
| 3722 | |
| 3723 | rq->elv.priv[0] = bic; |
| 3724 | rq->elv.priv[1] = bfqq; |
| 3725 | |
| 3726 | spin_unlock_irq(&bfqd->lock); |
| 3727 | |
| 3728 | return 0; |
| 3729 | |
| 3730 | queue_fail: |
| 3731 | spin_unlock_irq(&bfqd->lock); |
| 3732 | |
| 3733 | return 1; |
| 3734 | } |
| 3735 | |
| 3736 | static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) |
| 3737 | { |
| 3738 | struct bfq_data *bfqd = bfqq->bfqd; |
| 3739 | enum bfqq_expiration reason; |
| 3740 | unsigned long flags; |
| 3741 | |
| 3742 | spin_lock_irqsave(&bfqd->lock, flags); |
| 3743 | bfq_clear_bfqq_wait_request(bfqq); |
| 3744 | |
| 3745 | if (bfqq != bfqd->in_service_queue) { |
| 3746 | spin_unlock_irqrestore(&bfqd->lock, flags); |
| 3747 | return; |
| 3748 | } |
| 3749 | |
| 3750 | if (bfq_bfqq_budget_timeout(bfqq)) |
| 3751 | /* |
| 3752 | * Also here the queue can be safely expired |
| 3753 | * for budget timeout without wasting |
| 3754 | * guarantees |
| 3755 | */ |
| 3756 | reason = BFQQE_BUDGET_TIMEOUT; |
| 3757 | else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) |
| 3758 | /* |
| 3759 | * The queue may not be empty upon timer expiration, |
| 3760 | * because we may not disable the timer when the |
| 3761 | * first request of the in-service queue arrives |
| 3762 | * during disk idling. |
| 3763 | */ |
| 3764 | reason = BFQQE_TOO_IDLE; |
| 3765 | else |
| 3766 | goto schedule_dispatch; |
| 3767 | |
| 3768 | bfq_bfqq_expire(bfqd, bfqq, true, reason); |
| 3769 | |
| 3770 | schedule_dispatch: |
| 3771 | spin_unlock_irqrestore(&bfqd->lock, flags); |
| 3772 | bfq_schedule_dispatch(bfqd); |
| 3773 | } |
| 3774 | |
| 3775 | /* |
| 3776 | * Handler of the expiration of the timer running if the in-service queue |
| 3777 | * is idling inside its time slice. |
| 3778 | */ |
| 3779 | static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) |
| 3780 | { |
| 3781 | struct bfq_data *bfqd = container_of(timer, struct bfq_data, |
| 3782 | idle_slice_timer); |
| 3783 | struct bfq_queue *bfqq = bfqd->in_service_queue; |
| 3784 | |
| 3785 | /* |
| 3786 | * Theoretical race here: the in-service queue can be NULL or |
| 3787 | * different from the queue that was idling if a new request |
| 3788 | * arrives for the current queue and there is a full dispatch |
| 3789 | * cycle that changes the in-service queue. This can hardly |
| 3790 | * happen, but in the worst case we just expire a queue too |
| 3791 | * early. |
| 3792 | */ |
| 3793 | if (bfqq) |
| 3794 | bfq_idle_slice_timer_body(bfqq); |
| 3795 | |
| 3796 | return HRTIMER_NORESTART; |
| 3797 | } |
| 3798 | |
| 3799 | static void __bfq_put_async_bfqq(struct bfq_data *bfqd, |
| 3800 | struct bfq_queue **bfqq_ptr) |
| 3801 | { |
| 3802 | struct bfq_queue *bfqq = *bfqq_ptr; |
| 3803 | |
| 3804 | bfq_log(bfqd, "put_async_bfqq: %p", bfqq); |
| 3805 | if (bfqq) { |
| 3806 | bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", |
| 3807 | bfqq, bfqq->ref); |
| 3808 | bfq_put_queue(bfqq); |
| 3809 | *bfqq_ptr = NULL; |
| 3810 | } |
| 3811 | } |
| 3812 | |
| 3813 | /* |
| 3814 | * Release the extra reference of the async queues as the device |
| 3815 | * goes away. |
| 3816 | */ |
| 3817 | static void bfq_put_async_queues(struct bfq_data *bfqd) |
| 3818 | { |
| 3819 | int i, j; |
| 3820 | |
| 3821 | for (i = 0; i < 2; i++) |
| 3822 | for (j = 0; j < IOPRIO_BE_NR; j++) |
| 3823 | __bfq_put_async_bfqq(bfqd, &async_bfqq[i][j]); |
| 3824 | |
| 3825 | __bfq_put_async_bfqq(bfqd, &async_idle_bfqq); |
| 3826 | } |
| 3827 | |
| 3828 | static void bfq_exit_queue(struct elevator_queue *e) |
| 3829 | { |
| 3830 | struct bfq_data *bfqd = e->elevator_data; |
| 3831 | struct bfq_queue *bfqq, *n; |
| 3832 | |
| 3833 | hrtimer_cancel(&bfqd->idle_slice_timer); |
| 3834 | |
| 3835 | spin_lock_irq(&bfqd->lock); |
| 3836 | list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) |
| 3837 | bfq_deactivate_bfqq(bfqd, bfqq, false); |
| 3838 | bfq_put_async_queues(bfqd); |
| 3839 | spin_unlock_irq(&bfqd->lock); |
| 3840 | |
| 3841 | hrtimer_cancel(&bfqd->idle_slice_timer); |
| 3842 | |
| 3843 | kfree(bfqd); |
| 3844 | } |
| 3845 | |
| 3846 | static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) |
| 3847 | { |
| 3848 | struct bfq_data *bfqd; |
| 3849 | struct elevator_queue *eq; |
| 3850 | int i; |
| 3851 | |
| 3852 | eq = elevator_alloc(q, e); |
| 3853 | if (!eq) |
| 3854 | return -ENOMEM; |
| 3855 | |
| 3856 | bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); |
| 3857 | if (!bfqd) { |
| 3858 | kobject_put(&eq->kobj); |
| 3859 | return -ENOMEM; |
| 3860 | } |
| 3861 | eq->elevator_data = bfqd; |
| 3862 | |
 | 3863 | * Our fallback bfqq if bfq_get_queue() runs into OOM issues. |
| 3864 | * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. |
| 3865 | * Grab a permanent reference to it, so that the normal code flow |
| 3866 | * will not attempt to free it. |
| 3867 | */ |
| 3868 | bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); |
| 3869 | bfqd->oom_bfqq.ref++; |
| 3870 | bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; |
| 3871 | bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; |
| 3872 | bfqd->oom_bfqq.entity.new_weight = |
| 3873 | bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); |
| 3874 | /* |
| 3875 | * Trigger weight initialization, according to ioprio, at the |
| 3876 | * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio |
| 3877 | * class won't be changed any more. |
| 3878 | */ |
| 3879 | bfqd->oom_bfqq.entity.prio_changed = 1; |
| 3880 | |
| 3881 | bfqd->queue = q; |
| 3882 | |
| 3883 | for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) |
| 3884 | bfqd->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; |
| 3885 | |
| 3886 | hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, |
| 3887 | HRTIMER_MODE_REL); |
| 3888 | bfqd->idle_slice_timer.function = bfq_idle_slice_timer; |
| 3889 | |
| 3890 | INIT_LIST_HEAD(&bfqd->active_list); |
| 3891 | INIT_LIST_HEAD(&bfqd->idle_list); |
| 3892 | |
| 3893 | bfqd->hw_tag = -1; |
| 3894 | |
| 3895 | bfqd->bfq_max_budget = bfq_default_max_budget; |
| 3896 | |
| 3897 | bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; |
| 3898 | bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; |
| 3899 | bfqd->bfq_back_max = bfq_back_max; |
| 3900 | bfqd->bfq_back_penalty = bfq_back_penalty; |
| 3901 | bfqd->bfq_slice_idle = bfq_slice_idle; |
| 3902 | bfqd->bfq_class_idle_last_service = 0; |
| 3903 | bfqd->bfq_timeout = bfq_timeout; |
| 3904 | |
| 3905 | bfqd->bfq_requests_within_timer = 120; |
| 3906 | |
| 3907 | spin_lock_init(&bfqd->lock); |
| 3908 | INIT_LIST_HEAD(&bfqd->dispatch); |
| 3909 | |
| 3910 | q->elevator = eq; |
| 3911 | |
| 3912 | return 0; |
| 3913 | } |
| 3914 | |
| 3915 | static void bfq_slab_kill(void) |
| 3916 | { |
| 3917 | kmem_cache_destroy(bfq_pool); |
| 3918 | } |
| 3919 | |
| 3920 | static int __init bfq_slab_setup(void) |
| 3921 | { |
| 3922 | bfq_pool = KMEM_CACHE(bfq_queue, 0); |
| 3923 | if (!bfq_pool) |
| 3924 | return -ENOMEM; |
| 3925 | return 0; |
| 3926 | } |
| 3927 | |
| 3928 | static ssize_t bfq_var_show(unsigned int var, char *page) |
| 3929 | { |
| 3930 | return sprintf(page, "%u\n", var); |
| 3931 | } |
| 3932 | |
| 3933 | static ssize_t bfq_var_store(unsigned long *var, const char *page, |
| 3934 | size_t count) |
| 3935 | { |
| 3936 | unsigned long new_val; |
| 3937 | int ret = kstrtoul(page, 10, &new_val); |
| 3938 | |
| 3939 | if (ret == 0) |
| 3940 | *var = new_val; |
| 3941 | |
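	/*
	 * Note: if kstrtoul() failed, *var is left untouched, yet count is
	 * still returned, so a write of a malformed value silently appears
	 * to succeed from user space.
	 */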
| 3942 | return count; |
| 3943 | } |
| 3944 | |
| 3945 | #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ |
| 3946 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
| 3947 | { \ |
| 3948 | struct bfq_data *bfqd = e->elevator_data; \ |
| 3949 | u64 __data = __VAR; \ |
| 3950 | if (__CONV == 1) \ |
| 3951 | __data = jiffies_to_msecs(__data); \ |
| 3952 | else if (__CONV == 2) \ |
| 3953 | __data = div_u64(__data, NSEC_PER_MSEC); \ |
| 3954 | return bfq_var_show(__data, (page)); \ |
| 3955 | } |
| 3956 | SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); |
| 3957 | SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); |
| 3958 | SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); |
| 3959 | SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); |
| 3960 | SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); |
| 3961 | SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); |
| 3962 | SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); |
| 3963 | SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); |
| 3964 | #undef SHOW_FUNCTION |
| 3965 | |
| 3966 | #define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ |
| 3967 | static ssize_t __FUNC(struct elevator_queue *e, char *page) \ |
| 3968 | { \ |
| 3969 | struct bfq_data *bfqd = e->elevator_data; \ |
| 3970 | u64 __data = __VAR; \ |
| 3971 | __data = div_u64(__data, NSEC_PER_USEC); \ |
| 3972 | return bfq_var_show(__data, (page)); \ |
| 3973 | } |
| 3974 | USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); |
| 3975 | #undef USEC_SHOW_FUNCTION |
| 3976 | |
| 3977 | #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ |
| 3978 | static ssize_t \ |
| 3979 | __FUNC(struct elevator_queue *e, const char *page, size_t count) \ |
| 3980 | { \ |
| 3981 | struct bfq_data *bfqd = e->elevator_data; \ |
| 3982 | unsigned long uninitialized_var(__data); \ |
| 3983 | int ret = bfq_var_store(&__data, (page), count); \ |
| 3984 | if (__data < (MIN)) \ |
| 3985 | __data = (MIN); \ |
| 3986 | else if (__data > (MAX)) \ |
| 3987 | __data = (MAX); \ |
| 3988 | if (__CONV == 1) \ |
| 3989 | *(__PTR) = msecs_to_jiffies(__data); \ |
| 3990 | else if (__CONV == 2) \ |
| 3991 | *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ |
| 3992 | else \ |
| 3993 | *(__PTR) = __data; \ |
| 3994 | return ret; \ |
| 3995 | } |
| 3996 | STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, |
| 3997 | INT_MAX, 2); |
| 3998 | STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, |
| 3999 | INT_MAX, 2); |
| 4000 | STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); |
| 4001 | STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, |
| 4002 | INT_MAX, 0); |
| 4003 | STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); |
| 4004 | #undef STORE_FUNCTION |
| 4005 | |
| 4006 | #define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ |
| 4007 | static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ |
| 4008 | { \ |
| 4009 | struct bfq_data *bfqd = e->elevator_data; \ |
| 4010 | unsigned long uninitialized_var(__data); \ |
| 4011 | int ret = bfq_var_store(&__data, (page), count); \ |
| 4012 | if (__data < (MIN)) \ |
| 4013 | __data = (MIN); \ |
| 4014 | else if (__data > (MAX)) \ |
| 4015 | __data = (MAX); \ |
| 4016 | *(__PTR) = (u64)__data * NSEC_PER_USEC; \ |
| 4017 | return ret; \ |
| 4018 | } |
| 4019 | USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, |
| 4020 | UINT_MAX); |
| 4021 | #undef USEC_STORE_FUNCTION |
| 4022 | |
| 4023 | static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) |
| 4024 | { |
| 4025 | u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout); |
| 4026 | |
| 4027 | if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) |
| 4028 | return bfq_calc_max_budget(bfqd->peak_rate, timeout); |
| 4029 | else |
| 4030 | return bfq_default_max_budget; |
| 4031 | } |
| 4032 | |
| 4033 | static ssize_t bfq_max_budget_store(struct elevator_queue *e, |
| 4034 | const char *page, size_t count) |
| 4035 | { |
| 4036 | struct bfq_data *bfqd = e->elevator_data; |
| 4037 | unsigned long uninitialized_var(__data); |
| 4038 | int ret = bfq_var_store(&__data, (page), count); |
| 4039 | |
| 4040 | if (__data == 0) |
| 4041 | bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
| 4042 | else { |
| 4043 | if (__data > INT_MAX) |
| 4044 | __data = INT_MAX; |
| 4045 | bfqd->bfq_max_budget = __data; |
| 4046 | } |
| 4047 | |
| 4048 | bfqd->bfq_user_max_budget = __data; |
| 4049 | |
| 4050 | return ret; |
| 4051 | } |
| 4052 | |
| 4053 | /* |
 | 4054 | * Keeping this name for compatibility with cfq's parameter names, |
 | 4055 | * but this timeout is actually used for both sync and async requests. |
| 4056 | */ |
| 4057 | static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, |
| 4058 | const char *page, size_t count) |
| 4059 | { |
| 4060 | struct bfq_data *bfqd = e->elevator_data; |
| 4061 | unsigned long uninitialized_var(__data); |
| 4062 | int ret = bfq_var_store(&__data, (page), count); |
| 4063 | |
| 4064 | if (__data < 1) |
| 4065 | __data = 1; |
| 4066 | else if (__data > INT_MAX) |
| 4067 | __data = INT_MAX; |
| 4068 | |
| 4069 | bfqd->bfq_timeout = msecs_to_jiffies(__data); |
| 4070 | if (bfqd->bfq_user_max_budget == 0) |
| 4071 | bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); |
| 4072 | |
| 4073 | return ret; |
| 4074 | } |
| 4075 | |
| 4076 | static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, |
| 4077 | const char *page, size_t count) |
| 4078 | { |
| 4079 | struct bfq_data *bfqd = e->elevator_data; |
| 4080 | unsigned long uninitialized_var(__data); |
| 4081 | int ret = bfq_var_store(&__data, (page), count); |
| 4082 | |
| 4083 | if (__data > 1) |
| 4084 | __data = 1; |
| 4085 | if (!bfqd->strict_guarantees && __data == 1 |
| 4086 | && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) |
| 4087 | bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; |
| 4088 | |
| 4089 | bfqd->strict_guarantees = __data; |
| 4090 | |
| 4091 | return ret; |
| 4092 | } |
| 4093 | |
| 4094 | #define BFQ_ATTR(name) \ |
| 4095 | __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) |
| 4096 | |
| 4097 | static struct elv_fs_entry bfq_attrs[] = { |
| 4098 | BFQ_ATTR(fifo_expire_sync), |
| 4099 | BFQ_ATTR(fifo_expire_async), |
| 4100 | BFQ_ATTR(back_seek_max), |
| 4101 | BFQ_ATTR(back_seek_penalty), |
| 4102 | BFQ_ATTR(slice_idle), |
| 4103 | BFQ_ATTR(slice_idle_us), |
| 4104 | BFQ_ATTR(max_budget), |
| 4105 | BFQ_ATTR(timeout_sync), |
| 4106 | BFQ_ATTR(strict_guarantees), |
| 4107 | __ATTR_NULL |
| 4108 | }; |
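/*
 * Once bfq is the active scheduler of a request queue, the attributes above
 * are exposed under /sys/block/<dev>/queue/iosched/. A brief illustrative
 * session (device name and values are just examples):
 *
 *   echo bfq > /sys/block/sda/queue/scheduler
 *   cat /sys/block/sda/queue/iosched/slice_idle        # shown in ms
 *   echo 0 > /sys/block/sda/queue/iosched/slice_idle   # disable idling
 *   echo 1 > /sys/block/sda/queue/iosched/strict_guarantees
 *
 * Writing 0 to max_budget makes bfq fall back to the automatic estimate
 * computed by bfq_estimated_max_budget().
 */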
| 4109 | |
| 4110 | static struct elevator_type iosched_bfq_mq = { |
| 4111 | .ops.mq = { |
| 4112 | .get_rq_priv = bfq_get_rq_private, |
| 4113 | .put_rq_priv = bfq_put_rq_private, |
| 4114 | .exit_icq = bfq_exit_icq, |
| 4115 | .insert_requests = bfq_insert_requests, |
| 4116 | .dispatch_request = bfq_dispatch_request, |
| 4117 | .next_request = elv_rb_latter_request, |
| 4118 | .former_request = elv_rb_former_request, |
| 4119 | .allow_merge = bfq_allow_bio_merge, |
| 4120 | .bio_merge = bfq_bio_merge, |
| 4121 | .request_merge = bfq_request_merge, |
| 4122 | .requests_merged = bfq_requests_merged, |
| 4123 | .request_merged = bfq_request_merged, |
| 4124 | .has_work = bfq_has_work, |
| 4125 | .init_sched = bfq_init_queue, |
| 4126 | .exit_sched = bfq_exit_queue, |
| 4127 | }, |
| 4128 | |
| 4129 | .uses_mq = true, |
| 4130 | .icq_size = sizeof(struct bfq_io_cq), |
| 4131 | .icq_align = __alignof__(struct bfq_io_cq), |
| 4132 | .elevator_attrs = bfq_attrs, |
| 4133 | .elevator_name = "bfq", |
| 4134 | .elevator_owner = THIS_MODULE, |
| 4135 | }; |
| 4136 | |
| 4137 | static int __init bfq_init(void) |
| 4138 | { |
| 4139 | int ret; |
| 4140 | |
| 4141 | ret = -ENOMEM; |
| 4142 | if (bfq_slab_setup()) |
| 4143 | goto err_pol_unreg; |
| 4144 | |
| 4145 | ret = elv_register(&iosched_bfq_mq); |
| 4146 | if (ret) |
| 4147 | goto err_pol_unreg; |
| 4148 | |
| 4149 | return 0; |
| 4150 | |
| 4151 | err_pol_unreg: |
| 4152 | return ret; |
| 4153 | } |
| 4154 | |
| 4155 | static void __exit bfq_exit(void) |
| 4156 | { |
| 4157 | elv_unregister(&iosched_bfq_mq); |
| 4158 | bfq_slab_kill(); |
| 4159 | } |
| 4160 | |
| 4161 | module_init(bfq_init); |
| 4162 | module_exit(bfq_exit); |
| 4163 | |
| 4164 | MODULE_AUTHOR("Paolo Valente"); |
| 4165 | MODULE_LICENSE("GPL"); |
| 4166 | MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); |