Blame - block/bfq-iosched.c - SHIFTPHONES/kernel/common

blob: dce273b91015d00e8206702fe146bb51c91ea7e6 [file] [log] [blame]

Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1	/*
				2	* Budget Fair Queueing (BFQ) I/O scheduler.
				3	*
				4	* Based on ideas and code from CFQ:
				5	* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
				6	*
				7	* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
				8	* Paolo Valente <paolo.valente@unimore.it>
				9	*
				10	* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
				11	* Arianna Avanzini <avanzini@google.com>
				12	*
				13	* Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
				14	*
				15	* This program is free software; you can redistribute it and/or
				16	* modify it under the terms of the GNU General Public License as
				17	* published by the Free Software Foundation; either version 2 of the
				18	* License, or (at your option) any later version.
				19	*
				20	* This program is distributed in the hope that it will be useful,
				21	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				22	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				23	* General Public License for more details.
				24	*
				25	* BFQ is a proportional-share I/O scheduler, with some extra
				26	* low-latency capabilities. BFQ also supports full hierarchical
				27	* scheduling through cgroups. Next paragraphs provide an introduction
				28	* on BFQ inner workings. Details on BFQ benefits, usage and
				29	* limitations can be found in Documentation/block/bfq-iosched.txt.
				30	*
				31	* BFQ is a proportional-share storage-I/O scheduling algorithm based
				32	* on the slice-by-slice service scheme of CFQ. But BFQ assigns
				33	* budgets, measured in number of sectors, to processes instead of
				34	* time slices. The device is not granted to the in-service process
				35	* for a given time slice, but until it has exhausted its assigned
				36	* budget. This change from the time to the service domain enables BFQ
				37	* to distribute the device throughput among processes as desired,
				38	* without any distortion due to throughput fluctuations, or to device
				39	* internal queueing. BFQ uses an ad hoc internal scheduler, called
				40	* B-WF2Q+, to schedule processes according to their budgets. More
				41	* precisely, BFQ schedules queues associated with processes. Each
				42	* process/queue is assigned a user-configurable weight, and B-WF2Q+
				43	* guarantees that each queue receives a fraction of the throughput
				44	* proportional to its weight. Thanks to the accurate policy of
				45	* B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
				46	* processes issuing sequential requests (to boost the throughput),
				47	* and yet guarantee a low latency to interactive and soft real-time
				48	* applications.
				49	*
				50	* In particular, to provide these low-latency guarantees, BFQ
				51	* explicitly privileges the I/O of two classes of time-sensitive
				52	* applications: interactive and soft real-time. This feature enables
				53	* BFQ to provide applications in these classes with a very low
				54	* latency. Finally, BFQ also features additional heuristics for
				55	* preserving both a low latency and a high throughput on NCQ-capable,
				56	* rotational or flash-based devices, and to get the job done quickly
				57	* for applications consisting in many I/O-bound processes.
				58	*
				59	* BFQ is described in [1], where also a reference to the initial, more
				60	* theoretical paper on BFQ can be found. The interested reader can find
				61	* in the latter paper full details on the main algorithm, as well as
				62	* formulas of the guarantees and formal proofs of all the properties.
				63	* With respect to the version of BFQ presented in these papers, this
				64	* implementation adds a few more heuristics, such as the one that
				65	* guarantees a low latency to soft real-time applications, and a
				66	* hierarchical extension based on H-WF2Q+.
				67	*
				68	* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
				69	* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
				70	* with O(log N) complexity derives from the one introduced with EEVDF
				71	* in [3].
				72	*
				73	* [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
				74	* Scheduler", Proceedings of the First Workshop on Mobile System
				75	* Technologies (MST-2015), May 2015.
				76	* http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
				77	*
				78	* [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
				79	* Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
				80	* Oct 1997.
				81	*
				82	* http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
				83	*
				84	* [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
				85	* First: A Flexible and Accurate Mechanism for Proportional Share
				86	* Resource Allocation", technical report.
				87	*
				88	* http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
				89	*/
				90	#include <linux/module.h>
				91	#include <linux/slab.h>
				92	#include <linux/blkdev.h>
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	93	#include <linux/cgroup.h>
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	94	#include <linux/elevator.h>
				95	#include <linux/ktime.h>
				96	#include <linux/rbtree.h>
				97	#include <linux/ioprio.h>
				98	#include <linux/sbitmap.h>
				99	#include <linux/delay.h>
				100
				101	#include "blk.h"
				102	#include "blk-mq.h"
				103	#include "blk-mq-tag.h"
				104	#include "blk-mq-sched.h"
				105	#include <linux/blktrace_api.h>
				106	#include <linux/hrtimer.h>
				107	#include <linux/blk-cgroup.h>
				108
				109	#define BFQ_IOPRIO_CLASSES 3
				110	#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
				111
				112	#define BFQ_MIN_WEIGHT 1
				113	#define BFQ_MAX_WEIGHT 1000
				114	#define BFQ_WEIGHT_CONVERSION_COEFF 10
				115
				116	#define BFQ_DEFAULT_QUEUE_IOPRIO 4
				117
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	118	#define BFQ_WEIGHT_LEGACY_DFL 100
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	119	#define BFQ_DEFAULT_GRP_IOPRIO 0
				120	#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
				121
				122	struct bfq_entity;
				123
				124	/**
				125	* struct bfq_service_tree - per ioprio_class service tree.
				126	*
				127	* Each service tree represents a B-WF2Q+ scheduler on its own. Each
				128	* ioprio_class has its own independent scheduler, and so its own
				129	* bfq_service_tree. All the fields are protected by the queue lock
				130	* of the containing bfqd.
				131	*/
				132	struct bfq_service_tree {
				133	/* tree for active entities (i.e., those backlogged) */
				134	struct rb_root active;
				135	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
				136	struct rb_root idle;
				137
				138	/* idle entity with minimum F_i */
				139	struct bfq_entity *first_idle;
				140	/* idle entity with maximum F_i */
				141	struct bfq_entity *last_idle;
				142
				143	/* scheduler virtual time */
				144	u64 vtime;
				145	/* scheduler weight sum; active and idle entities contribute to it */
				146	unsigned long wsum;
				147	};
				148
				149	/**
				150	* struct bfq_sched_data - multi-class scheduler.
				151	*
				152	* bfq_sched_data is the basic scheduler queue. It supports three
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	153	* ioprio_classes, and can be used either as a toplevel queue or as an
				154	* intermediate queue on a hierarchical setup. @next_in_service
				155	* points to the active entity of the sched_data service trees that
				156	* will be scheduled next. It is used to reduce the number of steps
				157	* needed for each hierarchical-schedule update.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	158	*
				159	* The supported ioprio_classes are the same as in CFQ, in descending
				160	* priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
				161	* Requests from higher priority queues are served before all the
				162	* requests from lower priority queues; among requests of the same
				163	* queue requests are served according to B-WF2Q+.
				164	* All the fields are protected by the queue lock of the containing bfqd.
				165	*/
				166	struct bfq_sched_data {
				167	/* entity in service */
				168	struct bfq_entity *in_service_entity;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	169	/* head-of-line entity (see comments above) */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	170	struct bfq_entity *next_in_service;
				171	/* array of service trees, one per ioprio_class */
				172	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	173	/* last time CLASS_IDLE was served */
				174	unsigned long bfq_class_idle_last_service;
				175
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	176	};
				177
				178	/**
				179	* struct bfq_entity - schedulable entity.
				180	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	181	* A bfq_entity is used to represent either a bfq_queue (leaf node in the
				182	* cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
				183	* entity belongs to the sched_data of the parent group in the cgroup
				184	* hierarchy. Non-leaf entities have also their own sched_data, stored
				185	* in @my_sched_data.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	186	*
				187	* Each entity stores independently its priority values; this would
				188	* allow different weights on different devices, but this
				189	* functionality is not exported to userspace by now. Priorities and
				190	* weights are updated lazily, first storing the new values into the
				191	* new_* fields, then setting the @prio_changed flag. As soon as
				192	* there is a transition in the entity state that allows the priority
				193	* update to take place the effective and the requested priority
				194	* values are synchronized.
				195	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	196	* Unless cgroups are used, the weight value is calculated from the
				197	* ioprio to export the same interface as CFQ. When dealing with
				198	* ``well-behaved'' queues (i.e., queues that do not spend too much
				199	* time to consume their budget and have true sequential behavior, and
				200	* when there are no external factors breaking anticipation) the
				201	* relative weights at each level of the cgroups hierarchy should be
				202	* guaranteed. All the fields are protected by the queue lock of the
				203	* containing bfqd.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	204	*/
				205	struct bfq_entity {
				206	/* service_tree member */
				207	struct rb_node rb_node;
				208
				209	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	210	* Flag, true if the entity is on a tree (either the active or
				211	* the idle one of its service_tree) or is in service.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	212	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	213	bool on_st;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	214
				215	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
				216	u64 start, finish;
				217
				218	/* tree the entity is enqueued into; %NULL if not on a tree */
				219	struct rb_root *tree;
				220
				221	/*
				222	* minimum start time of the (active) subtree rooted at this
				223	* entity; used for O(log N) lookups into active trees
				224	*/
				225	u64 min_start;
				226
				227	/* amount of service received during the last service slot */
				228	int service;
				229
				230	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
				231	int budget;
				232
				233	/* weight of the queue */
				234	int weight;
				235	/* next weight if a change is in progress */
				236	int new_weight;
				237
				238	/* original weight, used to implement weight boosting */
				239	int orig_weight;
				240
				241	/* parent entity, for hierarchical scheduling */
				242	struct bfq_entity *parent;
				243
				244	/*
				245	* For non-leaf nodes in the hierarchy, the associated
				246	* scheduler queue, %NULL on leaf nodes.
				247	*/
				248	struct bfq_sched_data *my_sched_data;
				249	/* the scheduler queue this entity belongs to */
				250	struct bfq_sched_data *sched_data;
				251
				252	/* flag, set to request a weight, ioprio or ioprio_class change */
				253	int prio_changed;
				254	};
				255
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	256	struct bfq_group;
				257
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	258	/**
				259	* struct bfq_ttime - per process thinktime stats.
				260	*/
				261	struct bfq_ttime {
				262	/* completion time of the last request */
				263	u64 last_end_request;
				264
				265	/* total process thinktime */
				266	u64 ttime_total;
				267	/* number of thinktime samples */
				268	unsigned long ttime_samples;
				269	/* average process thinktime */
				270	u64 ttime_mean;
				271	};
				272
				273	/**
				274	* struct bfq_queue - leaf schedulable entity.
				275	*
				276	* A bfq_queue is a leaf request queue; it can be associated with an
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	277	* io_context or more, if it is async. @cgroup holds a reference to
				278	* the cgroup, to be sure that it does not disappear while a bfqq
				279	* still references it (mostly to avoid races between request issuing
				280	* and task migration followed by cgroup destruction). All the fields
				281	* are protected by the queue lock of the containing bfqd.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	282	*/
				283	struct bfq_queue {
				284	/* reference counter */
				285	int ref;
				286	/* parent bfq_data */
				287	struct bfq_data *bfqd;
				288
				289	/* current ioprio and ioprio class */
				290	unsigned short ioprio, ioprio_class;
				291	/* next ioprio and ioprio class if a change is in progress */
				292	unsigned short new_ioprio, new_ioprio_class;
				293
				294	/* sorted list of pending requests */
				295	struct rb_root sort_list;
				296	/* if fifo isn't expired, next request to serve */
				297	struct request *next_rq;
				298	/* number of sync and async requests queued */
				299	int queued[2];
				300	/* number of requests currently allocated */
				301	int allocated;
				302	/* number of pending metadata requests */
				303	int meta_pending;
				304	/* fifo list of requests in sort_list */
				305	struct list_head fifo;
				306
				307	/* entity representing this queue in the scheduler */
				308	struct bfq_entity entity;
				309
				310	/* maximum budget allowed from the feedback mechanism */
				311	int max_budget;
				312	/* budget expiration (in jiffies) */
				313	unsigned long budget_timeout;
				314
				315	/* number of requests on the dispatch list or inside driver */
				316	int dispatched;
				317
				318	/* status flags */
				319	unsigned long flags;
				320
				321	/* node for active/idle bfqq list inside parent bfqd */
				322	struct list_head bfqq_list;
				323
				324	/* associated @bfq_ttime struct */
				325	struct bfq_ttime ttime;
				326
				327	/* bit vector: a 1 for each seeky requests in history */
				328	u32 seek_history;
				329	/* position of the last request enqueued */
				330	sector_t last_request_pos;
				331
				332	/* Number of consecutive pairs of request completion and
				333	* arrival, such that the queue becomes idle after the
				334	* completion, but the next request arrives within an idle
				335	* time slice; used only if the queue's IO_bound flag has been
				336	* cleared.
				337	*/
				338	unsigned int requests_within_timer;
				339
				340	/* pid of the process owning the queue, used for logging purposes */
				341	pid_t pid;
				342	};
				343
				344	/**
				345	* struct bfq_io_cq - per (request_queue, io_context) structure.
				346	*/
				347	struct bfq_io_cq {
				348	/* associated io_cq structure */
				349	struct io_cq icq; /* must be the first member */
				350	/* array of two process queues, the sync and the async */
				351	struct bfq_queue *bfqq[2];
				352	/* per (request_queue, blkcg) ioprio */
				353	int ioprio;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	354	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				355	uint64_t blkcg_serial_nr; /* the current blkcg serial */
				356	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	357	};
				358
				359	/**
				360	* struct bfq_data - per-device data structure.
				361	*
				362	* All the fields are protected by @lock.
				363	*/
				364	struct bfq_data {
				365	/* device request queue */
				366	struct request_queue *queue;
				367	/* dispatch queue */
				368	struct list_head dispatch;
				369
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	370	/* root bfq_group for the device */
				371	struct bfq_group *root_group;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	372
				373	/*
				374	* Number of bfq_queues containing requests (including the
				375	* queue in service, even if it is idling).
				376	*/
				377	int busy_queues;
				378	/* number of queued requests */
				379	int queued;
				380	/* number of requests dispatched and waiting for completion */
				381	int rq_in_driver;
				382
				383	/*
				384	* Maximum number of requests in driver in the last
				385	* @hw_tag_samples completed requests.
				386	*/
				387	int max_rq_in_driver;
				388	/* number of samples used to calculate hw_tag */
				389	int hw_tag_samples;
				390	/* flag set to one if the driver is showing a queueing behavior */
				391	int hw_tag;
				392
				393	/* number of budgets assigned */
				394	int budgets_assigned;
				395
				396	/*
				397	* Timer set when idling (waiting) for the next request from
				398	* the queue in service.
				399	*/
				400	struct hrtimer idle_slice_timer;
				401
				402	/* bfq_queue in service */
				403	struct bfq_queue *in_service_queue;
				404	/* bfq_io_cq (bic) associated with the @in_service_queue */
				405	struct bfq_io_cq *in_service_bic;
				406
				407	/* on-disk position of the last served request */
				408	sector_t last_position;
				409
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	410	/* time of last request completion (ns) */
				411	u64 last_completion;
				412
				413	/* time of first rq dispatch in current observation interval (ns) */
				414	u64 first_dispatch;
				415	/* time of last rq dispatch in current observation interval (ns) */
				416	u64 last_dispatch;
				417
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	418	/* beginning of the last budget */
				419	ktime_t last_budget_start;
				420	/* beginning of the last idle slice */
				421	ktime_t last_idling_start;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	422
				423	/* number of samples in current observation interval */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	424	int peak_rate_samples;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	425	/* num of samples of seq dispatches in current observation interval */
				426	u32 sequential_samples;
				427	/* total num of sectors transferred in current observation interval */
				428	u64 tot_sectors_dispatched;
				429	/* max rq size seen during current observation interval (sectors) */
				430	u32 last_rq_max_size;
				431	/* time elapsed from first dispatch in current observ. interval (us) */
				432	u64 delta_from_first;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	433	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	434	* Current estimate of the device peak rate, measured in
				435	* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
				436	* BFQ_RATE_SHIFT is performed to increase precision in
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	437	* fixed-point calculations.
				438	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	439	u32 peak_rate;
				440
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	441	/* maximum budget allotted to a bfq_queue before rescheduling */
				442	int bfq_max_budget;
				443
				444	/* list of all the bfq_queues active on the device */
				445	struct list_head active_list;
				446	/* list of all the bfq_queues idle on the device */
				447	struct list_head idle_list;
				448
				449	/*
				450	* Timeout for async/sync requests; when it fires, requests
				451	* are served in fifo order.
				452	*/
				453	u64 bfq_fifo_expire[2];
				454	/* weight of backward seeks wrt forward ones */
				455	unsigned int bfq_back_penalty;
				456	/* maximum allowed backward seek */
				457	unsigned int bfq_back_max;
				458	/* maximum idling time */
				459	u32 bfq_slice_idle;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	460
				461	/* user-configured max budget value (0 for auto-tuning) */
				462	int bfq_user_max_budget;
				463	/*
				464	* Timeout for bfq_queues to consume their budget; used to
				465	* prevent seeky queues from imposing long latencies to
				466	* sequential or quasi-sequential ones (this also implies that
				467	* seeky queues cannot receive guarantees in the service
				468	* domain; after a timeout they are charged for the time they
				469	* have been in service, to preserve fairness among them, but
				470	* without service-domain guarantees).
				471	*/
				472	unsigned int bfq_timeout;
				473
				474	/*
				475	* Number of consecutive requests that must be issued within
				476	* the idle time slice to set again idling to a queue which
				477	* was marked as non-I/O-bound (see the definition of the
				478	* IO_bound flag for further details).
				479	*/
				480	unsigned int bfq_requests_within_timer;
				481
				482	/*
				483	* Force device idling whenever needed to provide accurate
				484	* service guarantees, without caring about throughput
				485	* issues. CAVEAT: this may even increase latencies, in case
				486	* of useless idling for processes that did stop doing I/O.
				487	*/
				488	bool strict_guarantees;
				489
				490	/* fallback dummy bfqq for extreme OOM conditions */
				491	struct bfq_queue oom_bfqq;
				492
				493	spinlock_t lock;
				494
				495	/*
				496	* bic associated with the task issuing current bio for
				497	* merging. This and the next field are used as a support to
				498	* be able to perform the bic lookup, needed by bio-merge
				499	* functions, before the scheduler lock is taken, and thus
				500	* avoid taking the request-queue lock while the scheduler
				501	* lock is being held.
				502	*/
				503	struct bfq_io_cq *bio_bic;
				504	/* bfqq associated with the task issuing current bio for merging */
				505	struct bfq_queue *bio_bfqq;
				506	};
				507
				508	enum bfqq_state_flags {
				509	BFQQF_busy = 0, /* has requests or is in service */
				510	BFQQF_wait_request, /* waiting for a request */
				511	BFQQF_non_blocking_wait_rq, /*
				512	* waiting for a request
				513	* without idling the device
				514	*/
				515	BFQQF_fifo_expire, /* FIFO checked in this slice */
				516	BFQQF_idle_window, /* slice idling enabled */
				517	BFQQF_sync, /* synchronous queue */
				518	BFQQF_budget_new, /* no completion with this budget */
				519	BFQQF_IO_bound, /*
				520	* bfqq has timed-out at least once
				521	* having consumed at most 2/10 of
				522	* its budget
				523	*/
				524	};
				525
				526	#define BFQ_BFQQ_FNS(name) \
				527	static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
				528	{ \
				529	__set_bit(BFQQF_##name, &(bfqq)->flags); \
				530	} \
				531	static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
				532	{ \
				533	__clear_bit(BFQQF_##name, &(bfqq)->flags); \
				534	} \
				535	static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
				536	{ \
				537	return test_bit(BFQQF_##name, &(bfqq)->flags); \
				538	}
				539
				540	BFQ_BFQQ_FNS(busy);
				541	BFQ_BFQQ_FNS(wait_request);
				542	BFQ_BFQQ_FNS(non_blocking_wait_rq);
				543	BFQ_BFQQ_FNS(fifo_expire);
				544	BFQ_BFQQ_FNS(idle_window);
				545	BFQ_BFQQ_FNS(sync);
				546	BFQ_BFQQ_FNS(budget_new);
				547	BFQ_BFQQ_FNS(IO_bound);
				548	#undef BFQ_BFQQ_FNS
				549
				550	/* Logging facilities. */
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	551	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				552	static struct bfq_group bfqq_group(struct bfq_queue bfqq);
				553	static struct blkcg_gq bfqg_to_blkg(struct bfq_group bfqg);
				554
				555	#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
				556	char __pbuf[128]; \
				557	\
				558	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
				559	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
				560	bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
				561	__pbuf, ##args); \
				562	} while (0)
				563
				564	#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
				565	char __pbuf[128]; \
				566	\
				567	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
				568	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
				569	} while (0)
				570
				571	#else /* CONFIG_BFQ_GROUP_IOSCHED */
				572
				573	#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
				574	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
				575	bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
				576	##args)
				577	#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
				578
				579	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	580
				581	#define bfq_log(bfqd, fmt, args...) \
				582	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
				583
				584	/* Expiration reasons. */
				585	enum bfqq_expiration {
				586	BFQQE_TOO_IDLE = 0, /*
				587	* queue has been idling for
				588	* too long
				589	*/
				590	BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
				591	BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
				592	BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
				593	BFQQE_PREEMPTED /* preemption in progress */
				594	};
				595
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	596	struct bfqg_stats {
				597	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				598	/* number of ios merged */
				599	struct blkg_rwstat merged;
				600	/* total time spent on device in ns, may not be accurate w/ queueing */
				601	struct blkg_rwstat service_time;
				602	/* total time spent waiting in scheduler queue in ns */
				603	struct blkg_rwstat wait_time;
				604	/* number of IOs queued up */
				605	struct blkg_rwstat queued;
				606	/* total disk time and nr sectors dispatched by this group */
				607	struct blkg_stat time;
				608	/* sum of number of ios queued across all samples */
				609	struct blkg_stat avg_queue_size_sum;
				610	/* count of samples taken for average */
				611	struct blkg_stat avg_queue_size_samples;
				612	/* how many times this group has been removed from service tree */
				613	struct blkg_stat dequeue;
				614	/* total time spent waiting for it to be assigned a timeslice. */
				615	struct blkg_stat group_wait_time;
				616	/* time spent idling for this blkcg_gq */
				617	struct blkg_stat idle_time;
				618	/* total time with empty current active q with other requests queued */
				619	struct blkg_stat empty_time;
				620	/* fields after this shouldn't be cleared on stat reset */
				621	uint64_t start_group_wait_time;
				622	uint64_t start_idle_time;
				623	uint64_t start_empty_time;
				624	uint16_t flags;
				625	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
				626	};
				627
				628	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				629
				630	/*
				631	* struct bfq_group_data - per-blkcg storage for the blkio subsystem.
				632	*
				633	* @ps: @blkcg_policy_storage that this structure inherits
				634	* @weight: weight of the bfq_group
				635	*/
				636	struct bfq_group_data {
				637	/* must be the first member */
				638	struct blkcg_policy_data pd;
				639
				640	unsigned short weight;
				641	};
				642
				643	/**
				644	* struct bfq_group - per (device, cgroup) data structure.
				645	* @entity: schedulable entity to insert into the parent group sched_data.
				646	* @sched_data: own sched_data, to contain child entities (they may be
				647	* both bfq_queues and bfq_groups).
				648	* @bfqd: the bfq_data for the device this group acts upon.
				649	* @async_bfqq: array of async queues for all the tasks belonging to
				650	* the group, one queue per ioprio value per ioprio_class,
				651	* except for the idle class that has only one queue.
				652	* @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
				653	* @my_entity: pointer to @entity, %NULL for the toplevel group; used
				654	* to avoid too many special cases during group creation/
				655	* migration.
				656	* @stats: stats for this bfqg.
				657	*
				658	* Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
				659	* there is a set of bfq_groups, each one collecting the lower-level
				660	* entities belonging to the group that are acting on the same device.
				661	*
				662	* Locking works as follows:
				663	* o @bfqd is protected by the queue lock, RCU is used to access it
				664	* from the readers.
				665	* o All the other fields are protected by the @bfqd queue lock.
				666	*/
				667	struct bfq_group {
				668	/* must be the first member */
				669	struct blkg_policy_data pd;
				670
				671	struct bfq_entity entity;
				672	struct bfq_sched_data sched_data;
				673
				674	void *bfqd;
				675
				676	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
				677	struct bfq_queue *async_idle_bfqq;
				678
				679	struct bfq_entity *my_entity;
				680
				681	struct bfqg_stats stats;
				682	};
				683
				684	#else
				685	struct bfq_group {
				686	struct bfq_sched_data sched_data;
				687
				688	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
				689	struct bfq_queue *async_idle_bfqq;
				690
				691	struct rb_root rq_pos_tree;
				692	};
				693	#endif
				694
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	695	static struct bfq_queue bfq_entity_to_bfqq(struct bfq_entity entity);
				696
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	697	static unsigned int bfq_class_idx(struct bfq_entity *entity)
				698	{
				699	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				700
				701	return bfqq ? bfqq->ioprio_class - 1 :
				702	BFQ_DEFAULT_GRP_CLASS - 1;
				703	}
				704
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	705	static struct bfq_service_tree *
				706	bfq_entity_service_tree(struct bfq_entity *entity)
				707	{
				708	struct bfq_sched_data *sched_data = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	709	unsigned int idx = bfq_class_idx(entity);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	710
				711	return sched_data->service_tree + idx;
				712	}
				713
				714	static struct bfq_queue bic_to_bfqq(struct bfq_io_cq bic, bool is_sync)
				715	{
				716	return bic->bfqq[is_sync];
				717	}
				718
				719	static void bic_set_bfqq(struct bfq_io_cq bic, struct bfq_queue bfqq,
				720	bool is_sync)
				721	{
				722	bic->bfqq[is_sync] = bfqq;
				723	}
				724
				725	static struct bfq_data bic_to_bfqd(struct bfq_io_cq bic)
				726	{
				727	return bic->icq.q->elevator->elevator_data;
				728	}
				729
				730	static void bfq_check_ioprio_change(struct bfq_io_cq bic, struct bio bio);
				731	static void bfq_put_queue(struct bfq_queue *bfqq);
				732	static struct bfq_queue bfq_get_queue(struct bfq_data bfqd,
				733	struct bio *bio, bool is_sync,
				734	struct bfq_io_cq *bic);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	735	static void bfq_put_async_queues(struct bfq_data bfqd, struct bfq_group bfqg);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	736	static void bfq_exit_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq);
				737
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	738	/* Expiration time of sync (0) and async (1) requests, in ns. */
				739	static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
				740
				741	/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
				742	static const int bfq_back_max = 16 * 1024;
				743
				744	/* Penalty of a backwards seek, in number of sectors. */
				745	static const int bfq_back_penalty = 2;
				746
				747	/* Idling period duration, in ns. */
				748	static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
				749
				750	/* Minimum number of assigned budgets for which stats are safe to compute. */
				751	static const int bfq_stats_min_budgets = 194;
				752
				753	/* Default maximum budget values, in sectors and number of requests. */
				754	static const int bfq_default_max_budget = 16 * 1024;
				755
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	756	/*
				757	* Async to sync throughput distribution is controlled as follows:
				758	* when an async request is served, the entity is charged the number
				759	* of sectors of the request, multiplied by the factor below
				760	*/
				761	static const int bfq_async_charge_factor = 10;
				762
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	763	/* Default timeout values, in jiffies, approximating CFQ defaults. */
				764	static const int bfq_timeout = HZ / 8;
				765
				766	static struct kmem_cache *bfq_pool;
				767
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	768	/* Below this threshold (in ns), we consider thinktime immediate. */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	769	#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
				770
				771	/* hw_tag detection: parallel requests threshold and min samples needed. */
				772	#define BFQ_HW_QUEUE_THRESHOLD 4
				773	#define BFQ_HW_QUEUE_SAMPLES 32
				774
				775	#define BFQQ_SEEK_THR (sector_t)(8 * 100)
				776	#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
				777	#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
				778	#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
				779
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	780	/* Min number of samples required to perform peak-rate update */
				781	#define BFQ_RATE_MIN_SAMPLES 32
				782	/* Min observation time interval required to perform a peak-rate update (ns) */
				783	#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
				784	/* Target observation time interval for a peak-rate update (ns) */
				785	#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	786
				787	/* Shift used for peak rate fixed precision calculations. */
				788	#define BFQ_RATE_SHIFT 16
				789
				790	#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
				791	{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
				792
				793	#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
				794	#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
				795
				796	/**
				797	* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
				798	* @icq: the iocontext queue.
				799	*/
				800	static struct bfq_io_cq icq_to_bic(struct io_cq icq)
				801	{
				802	/* bic->icq is the first member, %NULL will convert to %NULL */
				803	return container_of(icq, struct bfq_io_cq, icq);
				804	}
				805
				806	/**
				807	* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
				808	* @bfqd: the lookup key.
				809	* @ioc: the io_context of the process doing I/O.
				810	* @q: the request queue.
				811	*/
				812	static struct bfq_io_cq bfq_bic_lookup(struct bfq_data bfqd,
				813	struct io_context *ioc,
				814	struct request_queue *q)
				815	{
				816	if (ioc) {
				817	unsigned long flags;
				818	struct bfq_io_cq *icq;
				819
				820	spin_lock_irqsave(q->queue_lock, flags);
				821	icq = icq_to_bic(ioc_lookup_icq(ioc, q));
				822	spin_unlock_irqrestore(q->queue_lock, flags);
				823
				824	return icq;
				825	}
				826
				827	return NULL;
				828	}
				829
				830	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	831	* Scheduler run of queue, if there are requests pending and no one in the
				832	* driver that will restart queueing.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	833	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	834	static void bfq_schedule_dispatch(struct bfq_data *bfqd)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	835	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	836	if (bfqd->queued != 0) {
				837	bfq_log(bfqd, "schedule dispatch");
				838	blk_mq_run_hw_queues(bfqd->queue, true);
				839	}
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	840	}
				841
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	842	/**
				843	* bfq_gt - compare two timestamps.
				844	* @a: first ts.
				845	* @b: second ts.
				846	*
				847	* Return @a > @b, dealing with wrapping correctly.
				848	*/
				849	static int bfq_gt(u64 a, u64 b)
				850	{
				851	return (s64)(a - b) > 0;
				852	}
				853
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	854	static struct bfq_entity bfq_root_active_entity(struct rb_root tree)
				855	{
				856	struct rb_node *node = tree->rb_node;
				857
				858	return rb_entry(node, struct bfq_entity, rb_node);
				859	}
				860
				861	static struct bfq_entity bfq_lookup_next_entity(struct bfq_sched_data sd);
				862
				863	static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
				864
				865	/**
				866	* bfq_update_next_in_service - update sd->next_in_service
				867	* @sd: sched_data for which to perform the update.
				868	* @new_entity: if not NULL, pointer to the entity whose activation,
				869	* requeueing or repositionig triggered the invocation of
				870	* this function.
				871	*
				872	* This function is called to update sd->next_in_service, which, in
				873	* its turn, may change as a consequence of the insertion or
				874	* extraction of an entity into/from one of the active trees of
				875	* sd. These insertions/extractions occur as a consequence of
				876	* activations/deactivations of entities, with some activations being
				877	* 'true' activations, and other activations being requeueings (i.e.,
				878	* implementing the second, requeueing phase of the mechanism used to
				879	* reposition an entity in its active tree; see comments on
				880	* __bfq_activate_entity and __bfq_requeue_entity for details). In
				881	* both the last two activation sub-cases, new_entity points to the
				882	* just activated or requeued entity.
				883	*
				884	* Returns true if sd->next_in_service changes in such a way that
				885	* entity->parent may become the next_in_service for its parent
				886	* entity.
				887	*/
				888	static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				889	struct bfq_entity *new_entity)
				890	{
				891	struct bfq_entity *next_in_service = sd->next_in_service;
				892	bool parent_sched_may_change = false;
				893
				894	/*
				895	* If this update is triggered by the activation, requeueing
				896	* or repositiong of an entity that does not coincide with
				897	* sd->next_in_service, then a full lookup in the active tree
				898	* can be avoided. In fact, it is enough to check whether the
				899	* just-modified entity has a higher priority than
				900	* sd->next_in_service, or, even if it has the same priority
				901	* as sd->next_in_service, is eligible and has a lower virtual
				902	* finish time than sd->next_in_service. If this compound
				903	* condition holds, then the new entity becomes the new
				904	* next_in_service. Otherwise no change is needed.
				905	*/
				906	if (new_entity && new_entity != sd->next_in_service) {
				907	/*
				908	* Flag used to decide whether to replace
				909	* sd->next_in_service with new_entity. Tentatively
				910	* set to true, and left as true if
				911	* sd->next_in_service is NULL.
				912	*/
				913	bool replace_next = true;
				914
				915	/*
				916	* If there is already a next_in_service candidate
				917	* entity, then compare class priorities or timestamps
				918	* to decide whether to replace sd->service_tree with
				919	* new_entity.
				920	*/
				921	if (next_in_service) {
				922	unsigned int new_entity_class_idx =
				923	bfq_class_idx(new_entity);
				924	struct bfq_service_tree *st =
				925	sd->service_tree + new_entity_class_idx;
				926
				927	/*
				928	* For efficiency, evaluate the most likely
				929	* sub-condition first.
				930	*/
				931	replace_next =
				932	(new_entity_class_idx ==
				933	bfq_class_idx(next_in_service)
				934	&&
				935	!bfq_gt(new_entity->start, st->vtime)
				936	&&
				937	bfq_gt(next_in_service->finish,
				938	new_entity->finish))
				939	\|\|
				940	new_entity_class_idx <
				941	bfq_class_idx(next_in_service);
				942	}
				943
				944	if (replace_next)
				945	next_in_service = new_entity;
				946	} else /* invoked because of a deactivation: lookup needed */
				947	next_in_service = bfq_lookup_next_entity(sd);
				948
				949	if (next_in_service) {
				950	parent_sched_may_change = !sd->next_in_service \|\|
				951	bfq_update_parent_budget(next_in_service);
				952	}
				953
				954	sd->next_in_service = next_in_service;
				955
				956	if (!next_in_service)
				957	return parent_sched_may_change;
				958
				959	return parent_sched_may_change;
				960	}
				961
				962	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				963	/* both next loops stop at one of the child entities of the root group */
				964	#define for_each_entity(entity) \
				965	for (; entity ; entity = entity->parent)
				966
				967	/*
				968	* For each iteration, compute parent in advance, so as to be safe if
				969	* entity is deallocated during the iteration. Such a deallocation may
				970	* happen as a consequence of a bfq_put_queue that frees the bfq_queue
				971	* containing entity.
				972	*/
				973	#define for_each_entity_safe(entity, parent) \
				974	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
				975
				976	/*
				977	* Returns true if this budget changes may let next_in_service->parent
				978	* become the next_in_service entity for its parent entity.
				979	*/
				980	static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
				981	{
				982	struct bfq_entity *bfqg_entity;
				983	struct bfq_group *bfqg;
				984	struct bfq_sched_data *group_sd;
				985	bool ret = false;
				986
				987	group_sd = next_in_service->sched_data;
				988
				989	bfqg = container_of(group_sd, struct bfq_group, sched_data);
				990	/*
				991	* bfq_group's my_entity field is not NULL only if the group
				992	* is not the root group. We must not touch the root entity
				993	* as it must never become an in-service entity.
				994	*/
				995	bfqg_entity = bfqg->my_entity;
				996	if (bfqg_entity) {
				997	if (bfqg_entity->budget > next_in_service->budget)
				998	ret = true;
				999	bfqg_entity->budget = next_in_service->budget;
				1000	}
				1001
				1002	return ret;
				1003	}
				1004
				1005	/*
				1006	* This function tells whether entity stops being a candidate for next
				1007	* service, according to the following logic.
				1008	*
				1009	* This function is invoked for an entity that is about to be set in
				1010	* service. If such an entity is a queue, then the entity is no longer
				1011	* a candidate for next service (i.e, a candidate entity to serve
				1012	* after the in-service entity is expired). The function then returns
				1013	* true.
				1014	*/
				1015	static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
				1016	{
				1017	if (bfq_entity_to_bfqq(entity))
				1018	return true;
				1019
				1020	return false;
				1021	}
				1022
				1023	#else /* CONFIG_BFQ_GROUP_IOSCHED */
				1024	/*
				1025	* Next two macros are fake loops when cgroups support is not
				1026	* enabled. I fact, in such a case, there is only one level to go up
				1027	* (to reach the root group).
				1028	*/
				1029	#define for_each_entity(entity) \
				1030	for (; entity ; entity = NULL)
				1031
				1032	#define for_each_entity_safe(entity, parent) \
				1033	for (parent = NULL; entity ; entity = parent)
				1034
				1035	static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
				1036	{
				1037	return false;
				1038	}
				1039
				1040	static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
				1041	{
				1042	return true;
				1043	}
				1044
				1045	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
				1046
				1047	/*
				1048	* Shift for timestamp calculations. This actually limits the maximum
				1049	* service allowed in one timestamp delta (small shift values increase it),
				1050	* the maximum total weight that can be used for the queues in the system
				1051	* (big shift values increase it), and the period of virtual time
				1052	* wraparounds.
				1053	*/
				1054	#define WFQ_SERVICE_SHIFT 22
				1055
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1056	static struct bfq_queue bfq_entity_to_bfqq(struct bfq_entity entity)
				1057	{
				1058	struct bfq_queue *bfqq = NULL;
				1059
				1060	if (!entity->my_sched_data)
				1061	bfqq = container_of(entity, struct bfq_queue, entity);
				1062
				1063	return bfqq;
				1064	}
				1065
				1066
				1067	/**
				1068	* bfq_delta - map service into the virtual time domain.
				1069	* @service: amount of service.
				1070	* @weight: scale factor (weight of an entity or weight sum).
				1071	*/
				1072	static u64 bfq_delta(unsigned long service, unsigned long weight)
				1073	{
				1074	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
				1075
				1076	do_div(d, weight);
				1077	return d;
				1078	}
				1079
				1080	/**
				1081	* bfq_calc_finish - assign the finish time to an entity.
				1082	* @entity: the entity to act upon.
				1083	* @service: the service to be charged to the entity.
				1084	*/
				1085	static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
				1086	{
				1087	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1088
				1089	entity->finish = entity->start +
				1090	bfq_delta(service, entity->weight);
				1091
				1092	if (bfqq) {
				1093	bfq_log_bfqq(bfqq->bfqd, bfqq,
				1094	"calc_finish: serv %lu, w %d",
				1095	service, entity->weight);
				1096	bfq_log_bfqq(bfqq->bfqd, bfqq,
				1097	"calc_finish: start %llu, finish %llu, delta %llu",
				1098	entity->start, entity->finish,
				1099	bfq_delta(service, entity->weight));
				1100	}
				1101	}
				1102
				1103	/**
				1104	* bfq_entity_of - get an entity from a node.
				1105	* @node: the node field of the entity.
				1106	*
				1107	* Convert a node pointer to the relative entity. This is used only
				1108	* to simplify the logic of some functions and not as the generic
				1109	* conversion mechanism because, e.g., in the tree walking functions,
				1110	* the check for a %NULL value would be redundant.
				1111	*/
				1112	static struct bfq_entity bfq_entity_of(struct rb_node node)
				1113	{
				1114	struct bfq_entity *entity = NULL;
				1115
				1116	if (node)
				1117	entity = rb_entry(node, struct bfq_entity, rb_node);
				1118
				1119	return entity;
				1120	}
				1121
				1122	/**
				1123	* bfq_extract - remove an entity from a tree.
				1124	* @root: the tree root.
				1125	* @entity: the entity to remove.
				1126	*/
				1127	static void bfq_extract(struct rb_root root, struct bfq_entity entity)
				1128	{
				1129	entity->tree = NULL;
				1130	rb_erase(&entity->rb_node, root);
				1131	}
				1132
				1133	/**
				1134	* bfq_idle_extract - extract an entity from the idle tree.
				1135	* @st: the service tree of the owning @entity.
				1136	* @entity: the entity being removed.
				1137	*/
				1138	static void bfq_idle_extract(struct bfq_service_tree *st,
				1139	struct bfq_entity *entity)
				1140	{
				1141	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1142	struct rb_node *next;
				1143
				1144	if (entity == st->first_idle) {
				1145	next = rb_next(&entity->rb_node);
				1146	st->first_idle = bfq_entity_of(next);
				1147	}
				1148
				1149	if (entity == st->last_idle) {
				1150	next = rb_prev(&entity->rb_node);
				1151	st->last_idle = bfq_entity_of(next);
				1152	}
				1153
				1154	bfq_extract(&st->idle, entity);
				1155
				1156	if (bfqq)
				1157	list_del(&bfqq->bfqq_list);
				1158	}
				1159
				1160	/**
				1161	* bfq_insert - generic tree insertion.
				1162	* @root: tree root.
				1163	* @entity: entity to insert.
				1164	*
				1165	* This is used for the idle and the active tree, since they are both
				1166	* ordered by finish time.
				1167	*/
				1168	static void bfq_insert(struct rb_root root, struct bfq_entity entity)
				1169	{
				1170	struct bfq_entity *entry;
				1171	struct rb_node **node = &root->rb_node;
				1172	struct rb_node *parent = NULL;
				1173
				1174	while (*node) {
				1175	parent = *node;
				1176	entry = rb_entry(parent, struct bfq_entity, rb_node);
				1177
				1178	if (bfq_gt(entry->finish, entity->finish))
				1179	node = &parent->rb_left;
				1180	else
				1181	node = &parent->rb_right;
				1182	}
				1183
				1184	rb_link_node(&entity->rb_node, parent, node);
				1185	rb_insert_color(&entity->rb_node, root);
				1186
				1187	entity->tree = root;
				1188	}
				1189
				1190	/**
				1191	* bfq_update_min - update the min_start field of a entity.
				1192	* @entity: the entity to update.
				1193	* @node: one of its children.
				1194	*
				1195	* This function is called when @entity may store an invalid value for
				1196	* min_start due to updates to the active tree. The function assumes
				1197	* that the subtree rooted at @node (which may be its left or its right
				1198	* child) has a valid min_start value.
				1199	*/
				1200	static void bfq_update_min(struct bfq_entity entity, struct rb_node node)
				1201	{
				1202	struct bfq_entity *child;
				1203
				1204	if (node) {
				1205	child = rb_entry(node, struct bfq_entity, rb_node);
				1206	if (bfq_gt(entity->min_start, child->min_start))
				1207	entity->min_start = child->min_start;
				1208	}
				1209	}
				1210
				1211	/**
				1212	* bfq_update_active_node - recalculate min_start.
				1213	* @node: the node to update.
				1214	*
				1215	* @node may have changed position or one of its children may have moved,
				1216	* this function updates its min_start value. The left and right subtrees
				1217	* are assumed to hold a correct min_start value.
				1218	*/
				1219	static void bfq_update_active_node(struct rb_node *node)
				1220	{
				1221	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
				1222
				1223	entity->min_start = entity->start;
				1224	bfq_update_min(entity, node->rb_right);
				1225	bfq_update_min(entity, node->rb_left);
				1226	}
				1227
				1228	/**
				1229	* bfq_update_active_tree - update min_start for the whole active tree.
				1230	* @node: the starting node.
				1231	*
				1232	* @node must be the deepest modified node after an update. This function
				1233	* updates its min_start using the values held by its children, assuming
				1234	* that they did not change, and then updates all the nodes that may have
				1235	* changed in the path to the root. The only nodes that may have changed
				1236	* are the ones in the path or their siblings.
				1237	*/
				1238	static void bfq_update_active_tree(struct rb_node *node)
				1239	{
				1240	struct rb_node *parent;
				1241
				1242	up:
				1243	bfq_update_active_node(node);
				1244
				1245	parent = rb_parent(node);
				1246	if (!parent)
				1247	return;
				1248
				1249	if (node == parent->rb_left && parent->rb_right)
				1250	bfq_update_active_node(parent->rb_right);
				1251	else if (parent->rb_left)
				1252	bfq_update_active_node(parent->rb_left);
				1253
				1254	node = parent;
				1255	goto up;
				1256	}
				1257
				1258	/**
				1259	* bfq_active_insert - insert an entity in the active tree of its
				1260	* group/device.
				1261	* @st: the service tree of the entity.
				1262	* @entity: the entity being inserted.
				1263	*
				1264	* The active tree is ordered by finish time, but an extra key is kept
				1265	* per each node, containing the minimum value for the start times of
				1266	* its children (and the node itself), so it's possible to search for
				1267	* the eligible node with the lowest finish time in logarithmic time.
				1268	*/
				1269	static void bfq_active_insert(struct bfq_service_tree *st,
				1270	struct bfq_entity *entity)
				1271	{
				1272	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1273	struct rb_node *node = &entity->rb_node;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1274	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1275	struct bfq_sched_data *sd = NULL;
				1276	struct bfq_group *bfqg = NULL;
				1277	struct bfq_data *bfqd = NULL;
				1278	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1279
				1280	bfq_insert(&st->active, entity);
				1281
				1282	if (node->rb_left)
				1283	node = node->rb_left;
				1284	else if (node->rb_right)
				1285	node = node->rb_right;
				1286
				1287	bfq_update_active_tree(node);
				1288
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1289	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1290	sd = entity->sched_data;
				1291	bfqg = container_of(sd, struct bfq_group, sched_data);
				1292	bfqd = (struct bfq_data *)bfqg->bfqd;
				1293	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1294	if (bfqq)
				1295	list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
				1296	}
				1297
				1298	/**
				1299	* bfq_ioprio_to_weight - calc a weight from an ioprio.
				1300	* @ioprio: the ioprio value to convert.
				1301	*/
				1302	static unsigned short bfq_ioprio_to_weight(int ioprio)
				1303	{
				1304	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
				1305	}
				1306
				1307	/**
				1308	* bfq_weight_to_ioprio - calc an ioprio from a weight.
				1309	* @weight: the weight value to convert.
				1310	*
				1311	* To preserve as much as possible the old only-ioprio user interface,
				1312	* 0 is used as an escape ioprio value for weights (numerically) equal or
				1313	* larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
				1314	*/
				1315	static unsigned short bfq_weight_to_ioprio(int weight)
				1316	{
				1317	return max_t(int, 0,
				1318	IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
				1319	}
				1320
				1321	static void bfq_get_entity(struct bfq_entity *entity)
				1322	{
				1323	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1324
				1325	if (bfqq) {
				1326	bfqq->ref++;
				1327	bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
				1328	bfqq, bfqq->ref);
				1329	}
				1330	}
				1331
				1332	/**
				1333	* bfq_find_deepest - find the deepest node that an extraction can modify.
				1334	* @node: the node being removed.
				1335	*
				1336	* Do the first step of an extraction in an rb tree, looking for the
				1337	* node that will replace @node, and returning the deepest node that
				1338	* the following modifications to the tree can touch. If @node is the
				1339	* last node in the tree return %NULL.
				1340	*/
				1341	static struct rb_node bfq_find_deepest(struct rb_node node)
				1342	{
				1343	struct rb_node *deepest;
				1344
				1345	if (!node->rb_right && !node->rb_left)
				1346	deepest = rb_parent(node);
				1347	else if (!node->rb_right)
				1348	deepest = node->rb_left;
				1349	else if (!node->rb_left)
				1350	deepest = node->rb_right;
				1351	else {
				1352	deepest = rb_next(node);
				1353	if (deepest->rb_right)
				1354	deepest = deepest->rb_right;
				1355	else if (rb_parent(deepest) != node)
				1356	deepest = rb_parent(deepest);
				1357	}
				1358
				1359	return deepest;
				1360	}
				1361
				1362	/**
				1363	* bfq_active_extract - remove an entity from the active tree.
				1364	* @st: the service_tree containing the tree.
				1365	* @entity: the entity being removed.
				1366	*/
				1367	static void bfq_active_extract(struct bfq_service_tree *st,
				1368	struct bfq_entity *entity)
				1369	{
				1370	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1371	struct rb_node *node;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1372	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1373	struct bfq_sched_data *sd = NULL;
				1374	struct bfq_group *bfqg = NULL;
				1375	struct bfq_data *bfqd = NULL;
				1376	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1377
				1378	node = bfq_find_deepest(&entity->rb_node);
				1379	bfq_extract(&st->active, entity);
				1380
				1381	if (node)
				1382	bfq_update_active_tree(node);
				1383
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1384	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1385	sd = entity->sched_data;
				1386	bfqg = container_of(sd, struct bfq_group, sched_data);
				1387	bfqd = (struct bfq_data *)bfqg->bfqd;
				1388	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1389	if (bfqq)
				1390	list_del(&bfqq->bfqq_list);
				1391	}
				1392
				1393	/**
				1394	* bfq_idle_insert - insert an entity into the idle tree.
				1395	* @st: the service tree containing the tree.
				1396	* @entity: the entity to insert.
				1397	*/
				1398	static void bfq_idle_insert(struct bfq_service_tree *st,
				1399	struct bfq_entity *entity)
				1400	{
				1401	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1402	struct bfq_entity *first_idle = st->first_idle;
				1403	struct bfq_entity *last_idle = st->last_idle;
				1404
				1405	if (!first_idle \|\| bfq_gt(first_idle->finish, entity->finish))
				1406	st->first_idle = entity;
				1407	if (!last_idle \|\| bfq_gt(entity->finish, last_idle->finish))
				1408	st->last_idle = entity;
				1409
				1410	bfq_insert(&st->idle, entity);
				1411
				1412	if (bfqq)
				1413	list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
				1414	}
				1415
				1416	/**
				1417	* bfq_forget_entity - do not consider entity any longer for scheduling
				1418	* @st: the service tree.
				1419	* @entity: the entity being removed.
				1420	* @is_in_service: true if entity is currently the in-service entity.
				1421	*
				1422	* Forget everything about @entity. In addition, if entity represents
				1423	* a queue, and the latter is not in service, then release the service
				1424	* reference to the queue (the one taken through bfq_get_entity). In
				1425	* fact, in this case, there is really no more service reference to
				1426	* the queue, as the latter is also outside any service tree. If,
				1427	* instead, the queue is in service, then __bfq_bfqd_reset_in_service
				1428	* will take care of putting the reference when the queue finally
				1429	* stops being served.
				1430	*/
				1431	static void bfq_forget_entity(struct bfq_service_tree *st,
				1432	struct bfq_entity *entity,
				1433	bool is_in_service)
				1434	{
				1435	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1436
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1437	entity->on_st = false;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1438	st->wsum -= entity->weight;
				1439	if (bfqq && !is_in_service)
				1440	bfq_put_queue(bfqq);
				1441	}
				1442
				1443	/**
				1444	* bfq_put_idle_entity - release the idle tree ref of an entity.
				1445	* @st: service tree for the entity.
				1446	* @entity: the entity being released.
				1447	*/
				1448	static void bfq_put_idle_entity(struct bfq_service_tree *st,
				1449	struct bfq_entity *entity)
				1450	{
				1451	bfq_idle_extract(st, entity);
				1452	bfq_forget_entity(st, entity,
				1453	entity == entity->sched_data->in_service_entity);
				1454	}
				1455
				1456	/**
				1457	* bfq_forget_idle - update the idle tree if necessary.
				1458	* @st: the service tree to act upon.
				1459	*
				1460	* To preserve the global O(log N) complexity we only remove one entry here;
				1461	* as the idle tree will not grow indefinitely this can be done safely.
				1462	*/
				1463	static void bfq_forget_idle(struct bfq_service_tree *st)
				1464	{
				1465	struct bfq_entity *first_idle = st->first_idle;
				1466	struct bfq_entity *last_idle = st->last_idle;
				1467
				1468	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
				1469	!bfq_gt(last_idle->finish, st->vtime)) {
				1470	/*
				1471	* Forget the whole idle tree, increasing the vtime past
				1472	* the last finish time of idle entities.
				1473	*/
				1474	st->vtime = last_idle->finish;
				1475	}
				1476
				1477	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
				1478	bfq_put_idle_entity(st, first_idle);
				1479	}
				1480
				1481	static struct bfq_service_tree *
				1482	__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1483	struct bfq_entity *entity)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1484	{
				1485	struct bfq_service_tree *new_st = old_st;
				1486
				1487	if (entity->prio_changed) {
				1488	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1489	unsigned short prev_weight, new_weight;
				1490	struct bfq_data *bfqd = NULL;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1491	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1492	struct bfq_sched_data *sd;
				1493	struct bfq_group *bfqg;
				1494	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1495
				1496	if (bfqq)
				1497	bfqd = bfqq->bfqd;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1498	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1499	else {
				1500	sd = entity->my_sched_data;
				1501	bfqg = container_of(sd, struct bfq_group, sched_data);
				1502	bfqd = (struct bfq_data *)bfqg->bfqd;
				1503	}
				1504	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1505
				1506	old_st->wsum -= entity->weight;
				1507
				1508	if (entity->new_weight != entity->orig_weight) {
				1509	if (entity->new_weight < BFQ_MIN_WEIGHT \|\|
				1510	entity->new_weight > BFQ_MAX_WEIGHT) {
				1511	pr_crit("update_weight_prio: new_weight %d\n",
				1512	entity->new_weight);
				1513	if (entity->new_weight < BFQ_MIN_WEIGHT)
				1514	entity->new_weight = BFQ_MIN_WEIGHT;
				1515	else
				1516	entity->new_weight = BFQ_MAX_WEIGHT;
				1517	}
				1518	entity->orig_weight = entity->new_weight;
				1519	if (bfqq)
				1520	bfqq->ioprio =
				1521	bfq_weight_to_ioprio(entity->orig_weight);
				1522	}
				1523
				1524	if (bfqq)
				1525	bfqq->ioprio_class = bfqq->new_ioprio_class;
				1526	entity->prio_changed = 0;
				1527
				1528	/*
				1529	* NOTE: here we may be changing the weight too early,
				1530	* this will cause unfairness. The correct approach
				1531	* would have required additional complexity to defer
				1532	* weight changes to the proper time instants (i.e.,
				1533	* when entity->finish <= old_st->vtime).
				1534	*/
				1535	new_st = bfq_entity_service_tree(entity);
				1536
				1537	prev_weight = entity->weight;
				1538	new_weight = entity->orig_weight;
				1539	entity->weight = new_weight;
				1540
				1541	new_st->wsum += entity->weight;
				1542
				1543	if (new_st != old_st)
				1544	entity->start = new_st->vtime;
				1545	}
				1546
				1547	return new_st;
				1548	}
				1549
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1550	static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
				1551	static struct bfq_group bfqq_group(struct bfq_queue bfqq);
				1552
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1553	/**
				1554	* bfq_bfqq_served - update the scheduler status after selection for
				1555	* service.
				1556	* @bfqq: the queue being served.
				1557	* @served: bytes to transfer.
				1558	*
				1559	* NOTE: this can be optimized, as the timestamps of upper level entities
				1560	* are synchronized every time a new bfqq is selected for service. By now,
				1561	* we keep it to better check consistency.
				1562	*/
				1563	static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
				1564	{
				1565	struct bfq_entity *entity = &bfqq->entity;
				1566	struct bfq_service_tree *st;
				1567
				1568	for_each_entity(entity) {
				1569	st = bfq_entity_service_tree(entity);
				1570
				1571	entity->service += served;
				1572
				1573	st->vtime += bfq_delta(served, st->wsum);
				1574	bfq_forget_idle(st);
				1575	}
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1576	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1577	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
				1578	}
				1579
				1580	/**
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1581	* bfq_bfqq_charge_time - charge an amount of service equivalent to the length
				1582	* of the time interval during which bfqq has been in
				1583	* service.
				1584	* @bfqd: the device
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1585	* @bfqq: the queue that needs a service update.
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1586	* @time_ms: the amount of time during which the queue has received service
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1587	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1588	* If a queue does not consume its budget fast enough, then providing
				1589	* the queue with service fairness may impair throughput, more or less
				1590	* severely. For this reason, queues that consume their budget slowly
				1591	* are provided with time fairness instead of service fairness. This
				1592	* goal is achieved through the BFQ scheduling engine, even if such an
				1593	* engine works in the service, and not in the time domain. The trick
				1594	* is charging these queues with an inflated amount of service, equal
				1595	* to the amount of service that they would have received during their
				1596	* service slot if they had been fast, i.e., if their requests had
				1597	* been dispatched at a rate equal to the estimated peak rate.
				1598	*
				1599	* It is worth noting that time fairness can cause important
				1600	* distortions in terms of bandwidth distribution, on devices with
				1601	* internal queueing. The reason is that I/O requests dispatched
				1602	* during the service slot of a queue may be served after that service
				1603	* slot is finished, and may have a total processing time loosely
				1604	* correlated with the duration of the service slot. This is
				1605	* especially true for short service slots.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1606	*/
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1607	static void bfq_bfqq_charge_time(struct bfq_data bfqd, struct bfq_queue bfqq,
				1608	unsigned long time_ms)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1609	{
				1610	struct bfq_entity *entity = &bfqq->entity;
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1611	int tot_serv_to_charge = entity->service;
				1612	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1613
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1614	if (time_ms > 0 && time_ms < timeout_ms)
				1615	tot_serv_to_charge =
				1616	(bfqd->bfq_max_budget * time_ms) / timeout_ms;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1617
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	1618	if (tot_serv_to_charge < entity->service)
				1619	tot_serv_to_charge = entity->service;
				1620
				1621	/* Increase budget to avoid inconsistencies */
				1622	if (tot_serv_to_charge > entity->budget)
				1623	entity->budget = tot_serv_to_charge;
				1624
				1625	bfq_bfqq_served(bfqq,
				1626	max_t(int, 0, tot_serv_to_charge - entity->service));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1627	}
				1628
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1629	static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
				1630	struct bfq_service_tree *st,
				1631	bool backshifted)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1632	{
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1633	st = __bfq_entity_update_weight_prio(st, entity);
				1634	bfq_calc_finish(entity, entity->budget);
				1635
				1636	/*
				1637	* If some queues enjoy backshifting for a while, then their
				1638	* (virtual) finish timestamps may happen to become lower and
				1639	* lower than the system virtual time. In particular, if
				1640	* these queues often happen to be idle for short time
				1641	* periods, and during such time periods other queues with
				1642	* higher timestamps happen to be busy, then the backshifted
				1643	* timestamps of the former queues can become much lower than
				1644	* the system virtual time. In fact, to serve the queues with
				1645	* higher timestamps while the ones with lower timestamps are
				1646	* idle, the system virtual time may be pushed-up to much
				1647	* higher values than the finish timestamps of the idle
				1648	* queues. As a consequence, the finish timestamps of all new
				1649	* or newly activated queues may end up being much larger than
				1650	* those of lucky queues with backshifted timestamps. The
				1651	* latter queues may then monopolize the device for a lot of
				1652	* time. This would simply break service guarantees.
				1653	*
				1654	* To reduce this problem, push up a little bit the
				1655	* backshifted timestamps of the queue associated with this
				1656	* entity (only a queue can happen to have the backshifted
				1657	* flag set): just enough to let the finish timestamp of the
				1658	* queue be equal to the current value of the system virtual
				1659	* time. This may introduce a little unfairness among queues
				1660	* with backshifted timestamps, but it does not break
				1661	* worst-case fairness guarantees.
				1662	*/
				1663	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
				1664	unsigned long delta = st->vtime - entity->finish;
				1665
				1666	entity->start += delta;
				1667	entity->finish += delta;
				1668	}
				1669
				1670	bfq_active_insert(st, entity);
				1671	}
				1672
				1673	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1674	* __bfq_activate_entity - handle activation of entity.
				1675	* @entity: the entity being activated.
				1676	* @non_blocking_wait_rq: true if entity was waiting for a request
				1677	*
				1678	* Called for a 'true' activation, i.e., if entity is not active and
				1679	* one of its children receives a new request.
				1680	*
				1681	* Basically, this function updates the timestamps of entity and
				1682	* inserts entity into its active tree, ater possible extracting it
				1683	* from its idle tree.
				1684	*/
				1685	static void __bfq_activate_entity(struct bfq_entity *entity,
				1686	bool non_blocking_wait_rq)
				1687	{
				1688	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1689	bool backshifted = false;
				1690	unsigned long long min_vstart;
				1691
				1692	/* See comments on bfq_fqq_update_budg_for_activation */
				1693	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
				1694	backshifted = true;
				1695	min_vstart = entity->finish;
				1696	} else
				1697	min_vstart = st->vtime;
				1698
				1699	if (entity->tree == &st->idle) {
				1700	/*
				1701	* Must be on the idle tree, bfq_idle_extract() will
				1702	* check for that.
				1703	*/
				1704	bfq_idle_extract(st, entity);
				1705	entity->start = bfq_gt(min_vstart, entity->finish) ?
				1706	min_vstart : entity->finish;
				1707	} else {
				1708	/*
				1709	* The finish time of the entity may be invalid, and
				1710	* it is in the past for sure, otherwise the queue
				1711	* would have been on the idle tree.
				1712	*/
				1713	entity->start = min_vstart;
				1714	st->wsum += entity->weight;
				1715	/*
				1716	* entity is about to be inserted into a service tree,
				1717	* and then set in service: get a reference to make
				1718	* sure entity does not disappear until it is no
				1719	* longer in service or scheduled for service.
				1720	*/
				1721	bfq_get_entity(entity);
				1722
				1723	entity->on_st = true;
				1724	}
				1725
				1726	bfq_update_fin_time_enqueue(entity, st, backshifted);
				1727	}
				1728
				1729	/**
				1730	* __bfq_requeue_entity - handle requeueing or repositioning of an entity.
				1731	* @entity: the entity being requeued or repositioned.
				1732	*
				1733	* Requeueing is needed if this entity stops being served, which
				1734	* happens if a leaf descendant entity has expired. On the other hand,
				1735	* repositioning is needed if the next_inservice_entity for the child
				1736	* entity has changed. See the comments inside the function for
				1737	* details.
				1738	*
				1739	* Basically, this function: 1) removes entity from its active tree if
				1740	* present there, 2) updates the timestamps of entity and 3) inserts
				1741	* entity back into its active tree (in the new, right position for
				1742	* the new values of the timestamps).
				1743	*/
				1744	static void __bfq_requeue_entity(struct bfq_entity *entity)
				1745	{
				1746	struct bfq_sched_data *sd = entity->sched_data;
				1747	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1748
				1749	if (entity == sd->in_service_entity) {
				1750	/*
				1751	* We are requeueing the current in-service entity,
				1752	* which may have to be done for one of the following
				1753	* reasons:
				1754	* - entity represents the in-service queue, and the
				1755	* in-service queue is being requeued after an
				1756	* expiration;
				1757	* - entity represents a group, and its budget has
				1758	* changed because one of its child entities has
				1759	* just been either activated or requeued for some
				1760	* reason; the timestamps of the entity need then to
				1761	* be updated, and the entity needs to be enqueued
				1762	* or repositioned accordingly.
				1763	*
				1764	* In particular, before requeueing, the start time of
				1765	* the entity must be moved forward to account for the
				1766	* service that the entity has received while in
				1767	* service. This is done by the next instructions. The
				1768	* finish time will then be updated according to this
				1769	* new value of the start time, and to the budget of
				1770	* the entity.
				1771	*/
				1772	bfq_calc_finish(entity, entity->service);
				1773	entity->start = entity->finish;
				1774	/*
				1775	* In addition, if the entity had more than one child
				1776	* when set in service, then was not extracted from
				1777	* the active tree. This implies that the position of
				1778	* the entity in the active tree may need to be
				1779	* changed now, because we have just updated the start
				1780	* time of the entity, and we will update its finish
				1781	* time in a moment (the requeueing is then, more
				1782	* precisely, a repositioning in this case). To
				1783	* implement this repositioning, we: 1) dequeue the
				1784	* entity here, 2) update the finish time and
				1785	* requeue the entity according to the new
				1786	* timestamps below.
				1787	*/
				1788	if (entity->tree)
				1789	bfq_active_extract(st, entity);
				1790	} else { /* The entity is already active, and not in service */
				1791	/*
				1792	* In this case, this function gets called only if the
				1793	* next_in_service entity below this entity has
				1794	* changed, and this change has caused the budget of
				1795	* this entity to change, which, finally implies that
				1796	* the finish time of this entity must be
				1797	* updated. Such an update may cause the scheduling,
				1798	* i.e., the position in the active tree, of this
				1799	* entity to change. We handle this change by: 1)
				1800	* dequeueing the entity here, 2) updating the finish
				1801	* time and requeueing the entity according to the new
				1802	* timestamps below. This is the same approach as the
				1803	* non-extracted-entity sub-case above.
				1804	*/
				1805	bfq_active_extract(st, entity);
				1806	}
				1807
				1808	bfq_update_fin_time_enqueue(entity, st, false);
				1809	}
				1810
				1811	static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
				1812	struct bfq_sched_data *sd,
				1813	bool non_blocking_wait_rq)
				1814	{
				1815	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1816
				1817	if (sd->in_service_entity == entity \|\| entity->tree == &st->active)
				1818	/*
				1819	* in service or already queued on the active tree,
				1820	* requeue or reposition
				1821	*/
				1822	__bfq_requeue_entity(entity);
				1823	else
				1824	/*
				1825	* Not in service and not queued on its active tree:
				1826	* the activity is idle and this is a true activation.
				1827	*/
				1828	__bfq_activate_entity(entity, non_blocking_wait_rq);
				1829	}
				1830
				1831
				1832	/**
				1833	* bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
				1834	* and activate, requeue or reposition all ancestors
				1835	* for which such an update becomes necessary.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1836	* @entity: the entity to activate.
				1837	* @non_blocking_wait_rq: true if this entity was waiting for a request
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1838	* @requeue: true if this is a requeue, which implies that bfqq is
				1839	* being expired; thus ALL its ancestors stop being served and must
				1840	* therefore be requeued
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1841	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1842	static void bfq_activate_requeue_entity(struct bfq_entity *entity,
				1843	bool non_blocking_wait_rq,
				1844	bool requeue)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1845	{
				1846	struct bfq_sched_data *sd;
				1847
				1848	for_each_entity(entity) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1849	sd = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1850	__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
				1851
				1852	if (!bfq_update_next_in_service(sd, entity) && !requeue)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1853	break;
				1854	}
				1855	}
				1856
				1857	/**
				1858	* __bfq_deactivate_entity - deactivate an entity from its service tree.
				1859	* @entity: the entity to deactivate.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1860	* @ins_into_idle_tree: if false, the entity will not be put into the
				1861	* idle tree.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1862	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1863	* Deactivates an entity, independently from its previous state. Must
				1864	* be invoked only if entity is on a service tree. Extracts the entity
				1865	* from that tree, and if necessary and allowed, puts it on the idle
				1866	* tree.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1867	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1868	static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				1869	bool ins_into_idle_tree)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1870	{
				1871	struct bfq_sched_data *sd = entity->sched_data;
				1872	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1873	int is_in_service = entity == sd->in_service_entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1874
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1875	if (!entity->on_st) /* entity never activated, or already inactive */
				1876	return false;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1877
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1878	if (is_in_service)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1879	bfq_calc_finish(entity, entity->service);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1880
				1881	if (entity->tree == &st->active)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1882	bfq_active_extract(st, entity);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1883	else if (!is_in_service && entity->tree == &st->idle)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1884	bfq_idle_extract(st, entity);
				1885
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1886	if (!ins_into_idle_tree \|\| !bfq_gt(entity->finish, st->vtime))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1887	bfq_forget_entity(st, entity, is_in_service);
				1888	else
				1889	bfq_idle_insert(st, entity);
				1890
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1891	return true;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1892	}
				1893
				1894	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1895	* bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1896	* @entity: the entity to deactivate.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1897	* @ins_into_idle_tree: true if the entity can be put on the idle tree
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1898	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1899	static void bfq_deactivate_entity(struct bfq_entity *entity,
				1900	bool ins_into_idle_tree,
				1901	bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1902	{
				1903	struct bfq_sched_data *sd;
				1904	struct bfq_entity *parent = NULL;
				1905
				1906	for_each_entity_safe(entity, parent) {
				1907	sd = entity->sched_data;
				1908
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1909	if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1910	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1911	* entity is not in any tree any more, so
				1912	* this deactivation is a no-op, and there is
				1913	* nothing to change for upper-level entities
				1914	* (in case of expiration, this can never
				1915	* happen).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1916	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1917	return;
				1918	}
				1919
				1920	if (sd->next_in_service == entity)
				1921	/*
				1922	* entity was the next_in_service entity,
				1923	* then, since entity has just been
				1924	* deactivated, a new one must be found.
				1925	*/
				1926	bfq_update_next_in_service(sd, NULL);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1927
				1928	if (sd->next_in_service)
				1929	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1930	* The parent entity is still backlogged,
				1931	* because next_in_service is not NULL. So, no
				1932	* further upwards deactivation must be
				1933	* performed. Yet, next_in_service has
				1934	* changed. Then the schedule does need to be
				1935	* updated upwards.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1936	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1937	break;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1938
				1939	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1940	* If we get here, then the parent is no more
				1941	* backlogged and we need to propagate the
				1942	* deactivation upwards. Thus let the loop go on.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1943	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1944
				1945	/*
				1946	* Also let parent be queued into the idle tree on
				1947	* deactivation, to preserve service guarantees, and
				1948	* assuming that who invoked this function does not
				1949	* need parent entities too to be removed completely.
				1950	*/
				1951	ins_into_idle_tree = true;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1952	}
				1953
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1954	/*
				1955	* If the deactivation loop is fully executed, then there are
				1956	* no more entities to touch and next loop is not executed at
				1957	* all. Otherwise, requeue remaining entities if they are
				1958	* about to stop receiving service, or reposition them if this
				1959	* is not the case.
				1960	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1961	entity = parent;
				1962	for_each_entity(entity) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1963	/*
				1964	* Invoke __bfq_requeue_entity on entity, even if
				1965	* already active, to requeue/reposition it in the
				1966	* active tree (because sd->next_in_service has
				1967	* changed)
				1968	*/
				1969	__bfq_requeue_entity(entity);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1970
				1971	sd = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1972	if (!bfq_update_next_in_service(sd, entity) &&
				1973	!expiration)
				1974	/*
				1975	* next_in_service unchanged or not causing
				1976	* any change in entity->parent->sd, and no
				1977	* requeueing needed for expiration: stop
				1978	* here.
				1979	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1980	break;
				1981	}
				1982	}
				1983
				1984	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1985	* bfq_calc_vtime_jump - compute the value to which the vtime should jump,
				1986	* if needed, to have at least one entity eligible.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1987	* @st: the service tree to act upon.
				1988	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1989	* Assumes that st is not empty.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1990	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1991	static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1992	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1993	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1994
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1995	if (bfq_gt(root_entity->min_start, st->vtime))
				1996	return root_entity->min_start;
				1997
				1998	return st->vtime;
				1999	}
				2000
				2001	static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
				2002	{
				2003	if (new_value > st->vtime) {
				2004	st->vtime = new_value;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2005	bfq_forget_idle(st);
				2006	}
				2007	}
				2008
				2009	/**
				2010	* bfq_first_active_entity - find the eligible entity with
				2011	* the smallest finish time
				2012	* @st: the service tree to select from.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2013	* @vtime: the system virtual to use as a reference for eligibility
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2014	*
				2015	* This function searches the first schedulable entity, starting from the
				2016	* root of the tree and going on the left every time on this side there is
				2017	* a subtree with at least one eligible (start >= vtime) entity. The path on
				2018	* the right is followed only if a) the left subtree contains no eligible
				2019	* entities and b) no eligible entity has been found yet.
				2020	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2021	static struct bfq_entity bfq_first_active_entity(struct bfq_service_tree st,
				2022	u64 vtime)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2023	{
				2024	struct bfq_entity entry, first = NULL;
				2025	struct rb_node *node = st->active.rb_node;
				2026
				2027	while (node) {
				2028	entry = rb_entry(node, struct bfq_entity, rb_node);
				2029	left:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2030	if (!bfq_gt(entry->start, vtime))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2031	first = entry;
				2032
				2033	if (node->rb_left) {
				2034	entry = rb_entry(node->rb_left,
				2035	struct bfq_entity, rb_node);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2036	if (!bfq_gt(entry->min_start, vtime)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2037	node = node->rb_left;
				2038	goto left;
				2039	}
				2040	}
				2041	if (first)
				2042	break;
				2043	node = node->rb_right;
				2044	}
				2045
				2046	return first;
				2047	}
				2048
				2049	/**
				2050	* __bfq_lookup_next_entity - return the first eligible entity in @st.
				2051	* @st: the service tree.
				2052	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2053	* If there is no in-service entity for the sched_data st belongs to,
				2054	* then return the entity that will be set in service if:
				2055	* 1) the parent entity this st belongs to is set in service;
				2056	* 2) no entity belonging to such parent entity undergoes a state change
				2057	* that would influence the timestamps of the entity (e.g., becomes idle,
				2058	* becomes backlogged, changes its budget, ...).
				2059	*
				2060	* In this first case, update the virtual time in @st too (see the
				2061	* comments on this update inside the function).
				2062	*
				2063	* In constrast, if there is an in-service entity, then return the
				2064	* entity that would be set in service if not only the above
				2065	* conditions, but also the next one held true: the currently
				2066	* in-service entity, on expiration,
				2067	* 1) gets a finish time equal to the current one, or
				2068	* 2) is not eligible any more, or
				2069	* 3) is idle.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2070	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2071	static struct bfq_entity *
				2072	__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2073	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2074	struct bfq_entity *entity;
				2075	u64 new_vtime;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2076
				2077	if (RB_EMPTY_ROOT(&st->active))
				2078	return NULL;
				2079
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2080	/*
				2081	* Get the value of the system virtual time for which at
				2082	* least one entity is eligible.
				2083	*/
				2084	new_vtime = bfq_calc_vtime_jump(st);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2085
				2086	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2087	* If there is no in-service entity for the sched_data this
				2088	* active tree belongs to, then push the system virtual time
				2089	* up to the value that guarantees that at least one entity is
				2090	* eligible. If, instead, there is an in-service entity, then
				2091	* do not make any such update, because there is already an
				2092	* eligible entity, namely the in-service one (even if the
				2093	* entity is not on st, because it was extracted when set in
				2094	* service).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2095	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2096	if (!in_service)
				2097	bfq_update_vtime(st, new_vtime);
				2098
				2099	entity = bfq_first_active_entity(st, new_vtime);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2100
				2101	return entity;
				2102	}
				2103
				2104	/**
				2105	* bfq_lookup_next_entity - return the first eligible entity in @sd.
				2106	* @sd: the sched_data.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2107	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2108	* This function is invoked when there has been a change in the trees
				2109	* for sd, and we need know what is the new next entity after this
				2110	* change.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2111	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2112	static struct bfq_entity bfq_lookup_next_entity(struct bfq_sched_data sd)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2113	{
				2114	struct bfq_service_tree *st = sd->service_tree;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2115	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
				2116	struct bfq_entity *entity = NULL;
				2117	int class_idx = 0;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2118
				2119	/*
				2120	* Choose from idle class, if needed to guarantee a minimum
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2121	* bandwidth to this class (and if there is some active entity
				2122	* in idle class). This should also mitigate
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2123	* priority-inversion problems in case a low priority task is
				2124	* holding file system resources.
				2125	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2126	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				2127	BFQ_CL_IDLE_TIMEOUT)) {
				2128	if (!RB_EMPTY_ROOT(&idle_class_st->active))
				2129	class_idx = BFQ_IOPRIO_CLASSES - 1;
				2130	/* About to be served if backlogged, or not yet backlogged */
				2131	sd->bfq_class_idle_last_service = jiffies;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2132	}
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2133
				2134	/*
				2135	* Find the next entity to serve for the highest-priority
				2136	* class, unless the idle class needs to be served.
				2137	*/
				2138	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
				2139	entity = __bfq_lookup_next_entity(st + class_idx,
				2140	sd->in_service_entity);
				2141
				2142	if (entity)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2143	break;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2144	}
				2145
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2146	if (!entity)
				2147	return NULL;
				2148
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2149	return entity;
				2150	}
				2151
				2152	static bool next_queue_may_preempt(struct bfq_data *bfqd)
				2153	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2154	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2155
				2156	return sd->next_in_service != sd->in_service_entity;
				2157	}
				2158
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2159	/*
				2160	* Get next queue for service.
				2161	*/
				2162	static struct bfq_queue bfq_get_next_queue(struct bfq_data bfqd)
				2163	{
				2164	struct bfq_entity *entity = NULL;
				2165	struct bfq_sched_data *sd;
				2166	struct bfq_queue *bfqq;
				2167
				2168	if (bfqd->busy_queues == 0)
				2169	return NULL;
				2170
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2171	/*
				2172	* Traverse the path from the root to the leaf entity to
				2173	* serve. Set in service all the entities visited along the
				2174	* way.
				2175	*/
				2176	sd = &bfqd->root_group->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2177	for (; sd ; sd = entity->my_sched_data) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2178	/*
				2179	* WARNING. We are about to set the in-service entity
				2180	* to sd->next_in_service, i.e., to the (cached) value
				2181	* returned by bfq_lookup_next_entity(sd) the last
				2182	* time it was invoked, i.e., the last time when the
				2183	* service order in sd changed as a consequence of the
				2184	* activation or deactivation of an entity. In this
				2185	* respect, if we execute bfq_lookup_next_entity(sd)
				2186	* in this very moment, it may, although with low
				2187	* probability, yield a different entity than that
				2188	* pointed to by sd->next_in_service. This rare event
				2189	* happens in case there was no CLASS_IDLE entity to
				2190	* serve for sd when bfq_lookup_next_entity(sd) was
				2191	* invoked for the last time, while there is now one
				2192	* such entity.
				2193	*
				2194	* If the above event happens, then the scheduling of
				2195	* such entity in CLASS_IDLE is postponed until the
				2196	* service of the sd->next_in_service entity
				2197	* finishes. In fact, when the latter is expired,
				2198	* bfq_lookup_next_entity(sd) gets called again,
				2199	* exactly to update sd->next_in_service.
				2200	*/
				2201
				2202	/* Make next_in_service entity become in_service_entity */
				2203	entity = sd->next_in_service;
				2204	sd->in_service_entity = entity;
				2205
				2206	/*
				2207	* Reset the accumulator of the amount of service that
				2208	* the entity is about to receive.
				2209	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2210	entity->service = 0;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2211
				2212	/*
				2213	* If entity is no longer a candidate for next
				2214	* service, then we extract it from its active tree,
				2215	* for the following reason. To further boost the
				2216	* throughput in some special case, BFQ needs to know
				2217	* which is the next candidate entity to serve, while
				2218	* there is already an entity in service. In this
				2219	* respect, to make it easy to compute/update the next
				2220	* candidate entity to serve after the current
				2221	* candidate has been set in service, there is a case
				2222	* where it is necessary to extract the current
				2223	* candidate from its service tree. Such a case is
				2224	* when the entity just set in service cannot be also
				2225	* a candidate for next service. Details about when
				2226	* this conditions holds are reported in the comments
				2227	* on the function bfq_no_longer_next_in_service()
				2228	* invoked below.
				2229	*/
				2230	if (bfq_no_longer_next_in_service(entity))
				2231	bfq_active_extract(bfq_entity_service_tree(entity),
				2232	entity);
				2233
				2234	/*
				2235	* For the same reason why we may have just extracted
				2236	* entity from its active tree, we may need to update
				2237	* next_in_service for the sched_data of entity too,
				2238	* regardless of whether entity has been extracted.
				2239	* In fact, even if entity has not been extracted, a
				2240	* descendant entity may get extracted. Such an event
				2241	* would cause a change in next_in_service for the
				2242	* level of the descendant entity, and thus possibly
				2243	* back to upper levels.
				2244	*
				2245	* We cannot perform the resulting needed update
				2246	* before the end of this loop, because, to know which
				2247	* is the correct next-to-serve candidate entity for
				2248	* each level, we need first to find the leaf entity
				2249	* to set in service. In fact, only after we know
				2250	* which is the next-to-serve leaf entity, we can
				2251	* discover whether the parent entity of the leaf
				2252	* entity becomes the next-to-serve, and so on.
				2253	*/
				2254
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2255	}
				2256
				2257	bfqq = bfq_entity_to_bfqq(entity);
				2258
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2259	/*
				2260	* We can finally update all next-to-serve entities along the
				2261	* path from the leaf entity just set in service to the root.
				2262	*/
				2263	for_each_entity(entity) {
				2264	struct bfq_sched_data *sd = entity->sched_data;
				2265
				2266	if (!bfq_update_next_in_service(sd, NULL))
				2267	break;
				2268	}
				2269
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2270	return bfqq;
				2271	}
				2272
				2273	static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
				2274	{
				2275	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
				2276	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2277	struct bfq_entity *entity = in_serv_entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2278
				2279	if (bfqd->in_service_bic) {
				2280	put_io_context(bfqd->in_service_bic->icq.ioc);
				2281	bfqd->in_service_bic = NULL;
				2282	}
				2283
				2284	bfq_clear_bfqq_wait_request(in_serv_bfqq);
				2285	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				2286	bfqd->in_service_queue = NULL;
				2287
				2288	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2289	* When this function is called, all in-service entities have
				2290	* been properly deactivated or requeued, so we can safely
				2291	* execute the final step: reset in_service_entity along the
				2292	* path from entity to the root.
				2293	*/
				2294	for_each_entity(entity)
				2295	entity->sched_data->in_service_entity = NULL;
				2296
				2297	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2298	* in_serv_entity is no longer in service, so, if it is in no
				2299	* service tree either, then release the service reference to
				2300	* the queue it represents (taken with bfq_get_entity).
				2301	*/
				2302	if (!in_serv_entity->on_st)
				2303	bfq_put_queue(in_serv_bfqq);
				2304	}
				2305
				2306	static void bfq_deactivate_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2307	bool ins_into_idle_tree, bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2308	{
				2309	struct bfq_entity *entity = &bfqq->entity;
				2310
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2311	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2312	}
				2313
				2314	static void bfq_activate_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				2315	{
				2316	struct bfq_entity *entity = &bfqq->entity;
				2317
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2318	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				2319	false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2320	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
				2321	}
				2322
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2323	static void bfq_requeue_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				2324	{
				2325	struct bfq_entity *entity = &bfqq->entity;
				2326
				2327	bfq_activate_requeue_entity(entity, false,
				2328	bfqq == bfqd->in_service_queue);
				2329	}
				2330
				2331	static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
				2332
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2333	/*
				2334	* Called when the bfqq no longer has requests pending, remove it from
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2335	* the service tree. As a special case, it can be invoked during an
				2336	* expiration.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2337	*/
				2338	static void bfq_del_bfqq_busy(struct bfq_data bfqd, struct bfq_queue bfqq,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2339	bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2340	{
				2341	bfq_log_bfqq(bfqd, bfqq, "del from busy");
				2342
				2343	bfq_clear_bfqq_busy(bfqq);
				2344
				2345	bfqd->busy_queues--;
				2346
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2347	bfqg_stats_update_dequeue(bfqq_group(bfqq));
				2348
				2349	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2350	}
				2351
				2352	/*
				2353	* Called when an inactive queue receives a new request.
				2354	*/
				2355	static void bfq_add_bfqq_busy(struct bfq_data bfqd, struct bfq_queue bfqq)
				2356	{
				2357	bfq_log_bfqq(bfqd, bfqq, "add to busy");
				2358
				2359	bfq_activate_bfqq(bfqd, bfqq);
				2360
				2361	bfq_mark_bfqq_busy(bfqq);
				2362	bfqd->busy_queues++;
				2363	}
				2364
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2365	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				2366
				2367	/* bfqg stats flags */
				2368	enum bfqg_stats_flags {
				2369	BFQG_stats_waiting = 0,
				2370	BFQG_stats_idling,
				2371	BFQG_stats_empty,
				2372	};
				2373
				2374	#define BFQG_FLAG_FNS(name) \
				2375	static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
				2376	{ \
				2377	stats->flags \|= (1 << BFQG_stats_##name); \
				2378	} \
				2379	static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
				2380	{ \
				2381	stats->flags &= ~(1 << BFQG_stats_##name); \
				2382	} \
				2383	static int bfqg_stats_##name(struct bfqg_stats *stats) \
				2384	{ \
				2385	return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
				2386	} \
				2387
				2388	BFQG_FLAG_FNS(waiting)
				2389	BFQG_FLAG_FNS(idling)
				2390	BFQG_FLAG_FNS(empty)
				2391	#undef BFQG_FLAG_FNS
				2392
				2393	/* This should be called with the queue_lock held. */
				2394	static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
				2395	{
				2396	unsigned long long now;
				2397
				2398	if (!bfqg_stats_waiting(stats))
				2399	return;
				2400
				2401	now = sched_clock();
				2402	if (time_after64(now, stats->start_group_wait_time))
				2403	blkg_stat_add(&stats->group_wait_time,
				2404	now - stats->start_group_wait_time);
				2405	bfqg_stats_clear_waiting(stats);
				2406	}
				2407
				2408	/* This should be called with the queue_lock held. */
				2409	static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				2410	struct bfq_group *curr_bfqg)
				2411	{
				2412	struct bfqg_stats *stats = &bfqg->stats;
				2413
				2414	if (bfqg_stats_waiting(stats))
				2415	return;
				2416	if (bfqg == curr_bfqg)
				2417	return;
				2418	stats->start_group_wait_time = sched_clock();
				2419	bfqg_stats_mark_waiting(stats);
				2420	}
				2421
				2422	/* This should be called with the queue_lock held. */
				2423	static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
				2424	{
				2425	unsigned long long now;
				2426
				2427	if (!bfqg_stats_empty(stats))
				2428	return;
				2429
				2430	now = sched_clock();
				2431	if (time_after64(now, stats->start_empty_time))
				2432	blkg_stat_add(&stats->empty_time,
				2433	now - stats->start_empty_time);
				2434	bfqg_stats_clear_empty(stats);
				2435	}
				2436
				2437	static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
				2438	{
				2439	blkg_stat_add(&bfqg->stats.dequeue, 1);
				2440	}
				2441
				2442	static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
				2443	{
				2444	struct bfqg_stats *stats = &bfqg->stats;
				2445
				2446	if (blkg_rwstat_total(&stats->queued))
				2447	return;
				2448
				2449	/*
				2450	* group is already marked empty. This can happen if bfqq got new
				2451	* request in parent group and moved to this group while being added
				2452	* to service tree. Just ignore the event and move on.
				2453	*/
				2454	if (bfqg_stats_empty(stats))
				2455	return;
				2456
				2457	stats->start_empty_time = sched_clock();
				2458	bfqg_stats_mark_empty(stats);
				2459	}
				2460
				2461	static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
				2462	{
				2463	struct bfqg_stats *stats = &bfqg->stats;
				2464
				2465	if (bfqg_stats_idling(stats)) {
				2466	unsigned long long now = sched_clock();
				2467
				2468	if (time_after64(now, stats->start_idle_time))
				2469	blkg_stat_add(&stats->idle_time,
				2470	now - stats->start_idle_time);
				2471	bfqg_stats_clear_idling(stats);
				2472	}
				2473	}
				2474
				2475	static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
				2476	{
				2477	struct bfqg_stats *stats = &bfqg->stats;
				2478
				2479	stats->start_idle_time = sched_clock();
				2480	bfqg_stats_mark_idling(stats);
				2481	}
				2482
				2483	static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
				2484	{
				2485	struct bfqg_stats *stats = &bfqg->stats;
				2486
				2487	blkg_stat_add(&stats->avg_queue_size_sum,
				2488	blkg_rwstat_total(&stats->queued));
				2489	blkg_stat_add(&stats->avg_queue_size_samples, 1);
				2490	bfqg_stats_update_group_wait_time(stats);
				2491	}
				2492
				2493	/*
				2494	* blk-cgroup policy-related handlers
				2495	* The following functions help in converting between blk-cgroup
				2496	* internal structures and BFQ-specific structures.
				2497	*/
				2498
				2499	static struct bfq_group pd_to_bfqg(struct blkg_policy_data pd)
				2500	{
				2501	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
				2502	}
				2503
				2504	static struct blkcg_gq bfqg_to_blkg(struct bfq_group bfqg)
				2505	{
				2506	return pd_to_blkg(&bfqg->pd);
				2507	}
				2508
				2509	static struct blkcg_policy blkcg_policy_bfq;
				2510
				2511	static struct bfq_group blkg_to_bfqg(struct blkcg_gq blkg)
				2512	{
				2513	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
				2514	}
				2515
				2516	/*
				2517	* bfq_group handlers
				2518	* The following functions help in navigating the bfq_group hierarchy
				2519	* by allowing to find the parent of a bfq_group or the bfq_group
				2520	* associated to a bfq_queue.
				2521	*/
				2522
				2523	static struct bfq_group bfqg_parent(struct bfq_group bfqg)
				2524	{
				2525	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
				2526
				2527	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
				2528	}
				2529
				2530	static struct bfq_group bfqq_group(struct bfq_queue bfqq)
				2531	{
				2532	struct bfq_entity *group_entity = bfqq->entity.parent;
				2533
				2534	return group_entity ? container_of(group_entity, struct bfq_group,
				2535	entity) :
				2536	bfqq->bfqd->root_group;
				2537	}
				2538
				2539	/*
				2540	* The following two functions handle get and put of a bfq_group by
				2541	* wrapping the related blk-cgroup hooks.
				2542	*/
				2543
				2544	static void bfqg_get(struct bfq_group *bfqg)
				2545	{
				2546	return blkg_get(bfqg_to_blkg(bfqg));
				2547	}
				2548
				2549	static void bfqg_put(struct bfq_group *bfqg)
				2550	{
				2551	return blkg_put(bfqg_to_blkg(bfqg));
				2552	}
				2553
				2554	static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				2555	struct bfq_queue *bfqq,
				2556	unsigned int op)
				2557	{
				2558	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
				2559	bfqg_stats_end_empty_time(&bfqg->stats);
				2560	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
				2561	bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
				2562	}
				2563
				2564	static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
				2565	{
				2566	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
				2567	}
				2568
				2569	static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
				2570	{
				2571	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
				2572	}
				2573
				2574	static void bfqg_stats_update_completion(struct bfq_group *bfqg,
				2575	uint64_t start_time, uint64_t io_start_time,
				2576	unsigned int op)
				2577	{
				2578	struct bfqg_stats *stats = &bfqg->stats;
				2579	unsigned long long now = sched_clock();
				2580
				2581	if (time_after64(now, io_start_time))
				2582	blkg_rwstat_add(&stats->service_time, op,
				2583	now - io_start_time);
				2584	if (time_after64(io_start_time, start_time))
				2585	blkg_rwstat_add(&stats->wait_time, op,
				2586	io_start_time - start_time);
				2587	}
				2588
				2589	/* @stats = 0 */
				2590	static void bfqg_stats_reset(struct bfqg_stats *stats)
				2591	{
				2592	/* queued stats shouldn't be cleared */
				2593	blkg_rwstat_reset(&stats->merged);
				2594	blkg_rwstat_reset(&stats->service_time);
				2595	blkg_rwstat_reset(&stats->wait_time);
				2596	blkg_stat_reset(&stats->time);
				2597	blkg_stat_reset(&stats->avg_queue_size_sum);
				2598	blkg_stat_reset(&stats->avg_queue_size_samples);
				2599	blkg_stat_reset(&stats->dequeue);
				2600	blkg_stat_reset(&stats->group_wait_time);
				2601	blkg_stat_reset(&stats->idle_time);
				2602	blkg_stat_reset(&stats->empty_time);
				2603	}
				2604
				2605	/* @to += @from */
				2606	static void bfqg_stats_add_aux(struct bfqg_stats to, struct bfqg_stats from)
				2607	{
				2608	if (!to \|\| !from)
				2609	return;
				2610
				2611	/* queued stats shouldn't be cleared */
				2612	blkg_rwstat_add_aux(&to->merged, &from->merged);
				2613	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
				2614	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
				2615	blkg_stat_add_aux(&from->time, &from->time);
				2616	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
				2617	blkg_stat_add_aux(&to->avg_queue_size_samples,
				2618	&from->avg_queue_size_samples);
				2619	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
				2620	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
				2621	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
				2622	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
				2623	}
				2624
				2625	/*
				2626	* Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
				2627	* recursive stats can still account for the amount used by this bfqg after
				2628	* it's gone.
				2629	*/
				2630	static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
				2631	{
				2632	struct bfq_group *parent;
				2633
				2634	if (!bfqg) /* root_group */
				2635	return;
				2636
				2637	parent = bfqg_parent(bfqg);
				2638
				2639	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
				2640
				2641	if (unlikely(!parent))
				2642	return;
				2643
				2644	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
				2645	bfqg_stats_reset(&bfqg->stats);
				2646	}
				2647
				2648	static void bfq_init_entity(struct bfq_entity *entity,
				2649	struct bfq_group *bfqg)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2650	{
				2651	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				2652
				2653	entity->weight = entity->new_weight;
				2654	entity->orig_weight = entity->new_weight;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2655	if (bfqq) {
				2656	bfqq->ioprio = bfqq->new_ioprio;
				2657	bfqq->ioprio_class = bfqq->new_ioprio_class;
				2658	bfqg_get(bfqg);
				2659	}
				2660	entity->parent = bfqg->my_entity; /* NULL for root group */
				2661	entity->sched_data = &bfqg->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2662	}
				2663
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2664	static void bfqg_stats_exit(struct bfqg_stats *stats)
				2665	{
				2666	blkg_rwstat_exit(&stats->merged);
				2667	blkg_rwstat_exit(&stats->service_time);
				2668	blkg_rwstat_exit(&stats->wait_time);
				2669	blkg_rwstat_exit(&stats->queued);
				2670	blkg_stat_exit(&stats->time);
				2671	blkg_stat_exit(&stats->avg_queue_size_sum);
				2672	blkg_stat_exit(&stats->avg_queue_size_samples);
				2673	blkg_stat_exit(&stats->dequeue);
				2674	blkg_stat_exit(&stats->group_wait_time);
				2675	blkg_stat_exit(&stats->idle_time);
				2676	blkg_stat_exit(&stats->empty_time);
				2677	}
				2678
				2679	static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
				2680	{
				2681	if (blkg_rwstat_init(&stats->merged, gfp) \|\|
				2682	blkg_rwstat_init(&stats->service_time, gfp) \|\|
				2683	blkg_rwstat_init(&stats->wait_time, gfp) \|\|
				2684	blkg_rwstat_init(&stats->queued, gfp) \|\|
				2685	blkg_stat_init(&stats->time, gfp) \|\|
				2686	blkg_stat_init(&stats->avg_queue_size_sum, gfp) \|\|
				2687	blkg_stat_init(&stats->avg_queue_size_samples, gfp) \|\|
				2688	blkg_stat_init(&stats->dequeue, gfp) \|\|
				2689	blkg_stat_init(&stats->group_wait_time, gfp) \|\|
				2690	blkg_stat_init(&stats->idle_time, gfp) \|\|
				2691	blkg_stat_init(&stats->empty_time, gfp)) {
				2692	bfqg_stats_exit(stats);
				2693	return -ENOMEM;
				2694	}
				2695
				2696	return 0;
				2697	}
				2698
				2699	static struct bfq_group_data cpd_to_bfqgd(struct blkcg_policy_data cpd)
				2700	{
				2701	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
				2702	}
				2703
				2704	static struct bfq_group_data blkcg_to_bfqgd(struct blkcg blkcg)
				2705	{
				2706	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
				2707	}
				2708
				2709	static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
				2710	{
				2711	struct bfq_group_data *bgd;
				2712
				2713	bgd = kzalloc(sizeof(*bgd), gfp);
				2714	if (!bgd)
				2715	return NULL;
				2716	return &bgd->pd;
				2717	}
				2718
				2719	static void bfq_cpd_init(struct blkcg_policy_data *cpd)
				2720	{
				2721	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
				2722
				2723	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
				2724	CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
				2725	}
				2726
				2727	static void bfq_cpd_free(struct blkcg_policy_data *cpd)
				2728	{
				2729	kfree(cpd_to_bfqgd(cpd));
				2730	}
				2731
				2732	static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
				2733	{
				2734	struct bfq_group *bfqg;
				2735
				2736	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
				2737	if (!bfqg)
				2738	return NULL;
				2739
				2740	if (bfqg_stats_init(&bfqg->stats, gfp)) {
				2741	kfree(bfqg);
				2742	return NULL;
				2743	}
				2744
				2745	return &bfqg->pd;
				2746	}
				2747
				2748	static void bfq_pd_init(struct blkg_policy_data *pd)
				2749	{
				2750	struct blkcg_gq *blkg = pd_to_blkg(pd);
				2751	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
				2752	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
				2753	struct bfq_entity *entity = &bfqg->entity;
				2754	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
				2755
				2756	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
				2757	entity->my_sched_data = &bfqg->sched_data;
				2758	bfqg->my_entity = entity; /*
				2759	* the root_group's will be set to NULL
				2760	* in bfq_init_queue()
				2761	*/
				2762	bfqg->bfqd = bfqd;
				2763	}
				2764
				2765	static void bfq_pd_free(struct blkg_policy_data *pd)
				2766	{
				2767	struct bfq_group *bfqg = pd_to_bfqg(pd);
				2768
				2769	bfqg_stats_exit(&bfqg->stats);
				2770	return kfree(bfqg);
				2771	}
				2772
				2773	static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
				2774	{
				2775	struct bfq_group *bfqg = pd_to_bfqg(pd);
				2776
				2777	bfqg_stats_reset(&bfqg->stats);
				2778	}
				2779
				2780	static void bfq_group_set_parent(struct bfq_group *bfqg,
				2781	struct bfq_group *parent)
				2782	{
				2783	struct bfq_entity *entity;
				2784
				2785	entity = &bfqg->entity;
				2786	entity->parent = parent->my_entity;
				2787	entity->sched_data = &parent->sched_data;
				2788	}
				2789
				2790	static struct bfq_group bfq_lookup_bfqg(struct bfq_data bfqd,
				2791	struct blkcg *blkcg)
				2792	{
				2793	struct blkcg_gq *blkg;
				2794
				2795	blkg = blkg_lookup(blkcg, bfqd->queue);
				2796	if (likely(blkg))
				2797	return blkg_to_bfqg(blkg);
				2798	return NULL;
				2799	}
				2800
				2801	static struct bfq_group bfq_find_set_group(struct bfq_data bfqd,
				2802	struct blkcg *blkcg)
				2803	{
				2804	struct bfq_group bfqg, parent;
				2805	struct bfq_entity *entity;
				2806
				2807	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
				2808
				2809	if (unlikely(!bfqg))
				2810	return NULL;
				2811
				2812	/*
				2813	* Update chain of bfq_groups as we might be handling a leaf group
				2814	* which, along with some of its relatives, has not been hooked yet
				2815	* to the private hierarchy of BFQ.
				2816	*/
				2817	entity = &bfqg->entity;
				2818	for_each_entity(entity) {
				2819	bfqg = container_of(entity, struct bfq_group, entity);
				2820	if (bfqg != bfqd->root_group) {
				2821	parent = bfqg_parent(bfqg);
				2822	if (!parent)
				2823	parent = bfqd->root_group;
				2824	bfq_group_set_parent(bfqg, parent);
				2825	}
				2826	}
				2827
				2828	return bfqg;
				2829	}
				2830
				2831	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				2832	struct bfq_queue *bfqq,
				2833	bool compensate,
				2834	enum bfqq_expiration reason);
				2835
				2836	/**
				2837	* bfq_bfqq_move - migrate @bfqq to @bfqg.
				2838	* @bfqd: queue descriptor.
				2839	* @bfqq: the queue to move.
				2840	* @bfqg: the group to move to.
				2841	*
				2842	* Move @bfqq to @bfqg, deactivating it from its old group and reactivating
				2843	* it on the new one. Avoid putting the entity on the old group idle tree.
				2844	*
				2845	* Must be called under the queue lock; the cgroup owning @bfqg must
				2846	* not disappear (by now this just means that we are called under
				2847	* rcu_read_lock()).
				2848	*/
				2849	static void bfq_bfqq_move(struct bfq_data bfqd, struct bfq_queue bfqq,
				2850	struct bfq_group *bfqg)
				2851	{
				2852	struct bfq_entity *entity = &bfqq->entity;
				2853
				2854	/* If bfqq is empty, then bfq_bfqq_expire also invokes
				2855	* bfq_del_bfqq_busy, thereby removing bfqq and its entity
				2856	* from data structures related to current group. Otherwise we
				2857	* need to remove bfqq explicitly with bfq_deactivate_bfqq, as
				2858	* we do below.
				2859	*/
				2860	if (bfqq == bfqd->in_service_queue)
				2861	bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				2862	false, BFQQE_PREEMPTED);
				2863
				2864	if (bfq_bfqq_busy(bfqq))
				2865	bfq_deactivate_bfqq(bfqd, bfqq, false, false);
				2866	else if (entity->on_st)
				2867	bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
				2868	bfqg_put(bfqq_group(bfqq));
				2869
				2870	/*
				2871	* Here we use a reference to bfqg. We don't need a refcounter
				2872	* as the cgroup reference will not be dropped, so that its
				2873	* destroy() callback will not be invoked.
				2874	*/
				2875	entity->parent = bfqg->my_entity;
				2876	entity->sched_data = &bfqg->sched_data;
				2877	bfqg_get(bfqg);
				2878
				2879	if (bfq_bfqq_busy(bfqq))
				2880	bfq_activate_bfqq(bfqd, bfqq);
				2881
				2882	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
				2883	bfq_schedule_dispatch(bfqd);
				2884	}
				2885
				2886	/**
				2887	* __bfq_bic_change_cgroup - move @bic to @cgroup.
				2888	* @bfqd: the queue descriptor.
				2889	* @bic: the bic to move.
				2890	* @blkcg: the blk-cgroup to move to.
				2891	*
				2892	* Move bic to blkcg, assuming that bfqd->queue is locked; the caller
				2893	* has to make sure that the reference to cgroup is valid across the call.
				2894	*
				2895	* NOTE: an alternative approach might have been to store the current
				2896	* cgroup in bfqq and getting a reference to it, reducing the lookup
				2897	* time here, at the price of slightly more complex code.
				2898	*/
				2899	static struct bfq_group __bfq_bic_change_cgroup(struct bfq_data bfqd,
				2900	struct bfq_io_cq *bic,
				2901	struct blkcg *blkcg)
				2902	{
				2903	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
				2904	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
				2905	struct bfq_group *bfqg;
				2906	struct bfq_entity *entity;
				2907
				2908	bfqg = bfq_find_set_group(bfqd, blkcg);
				2909
				2910	if (unlikely(!bfqg))
				2911	bfqg = bfqd->root_group;
				2912
				2913	if (async_bfqq) {
				2914	entity = &async_bfqq->entity;
				2915
				2916	if (entity->sched_data != &bfqg->sched_data) {
				2917	bic_set_bfqq(bic, NULL, 0);
				2918	bfq_log_bfqq(bfqd, async_bfqq,
				2919	"bic_change_group: %p %d",
				2920	async_bfqq,
				2921	async_bfqq->ref);
				2922	bfq_put_queue(async_bfqq);
				2923	}
				2924	}
				2925
				2926	if (sync_bfqq) {
				2927	entity = &sync_bfqq->entity;
				2928	if (entity->sched_data != &bfqg->sched_data)
				2929	bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
				2930	}
				2931
				2932	return bfqg;
				2933	}
				2934
				2935	static void bfq_bic_update_cgroup(struct bfq_io_cq bic, struct bio bio)
				2936	{
				2937	struct bfq_data *bfqd = bic_to_bfqd(bic);
				2938	struct bfq_group *bfqg = NULL;
				2939	uint64_t serial_nr;
				2940
				2941	rcu_read_lock();
				2942	serial_nr = bio_blkcg(bio)->css.serial_nr;
				2943
				2944	/*
				2945	* Check whether blkcg has changed. The condition may trigger
				2946	* spuriously on a newly created cic but there's no harm.
				2947	*/
				2948	if (unlikely(!bfqd) \|\| likely(bic->blkcg_serial_nr == serial_nr))
				2949	goto out;
				2950
				2951	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
				2952	bic->blkcg_serial_nr = serial_nr;
				2953	out:
				2954	rcu_read_unlock();
				2955	}
				2956
				2957	/**
				2958	* bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
				2959	* @st: the service tree being flushed.
				2960	*/
				2961	static void bfq_flush_idle_tree(struct bfq_service_tree *st)
				2962	{
				2963	struct bfq_entity *entity = st->first_idle;
				2964
				2965	for (; entity ; entity = st->first_idle)
				2966	__bfq_deactivate_entity(entity, false);
				2967	}
				2968
				2969	/**
				2970	* bfq_reparent_leaf_entity - move leaf entity to the root_group.
				2971	* @bfqd: the device data structure with the root group.
				2972	* @entity: the entity to move.
				2973	*/
				2974	static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				2975	struct bfq_entity *entity)
				2976	{
				2977	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				2978
				2979	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
				2980	}
				2981
				2982	/**
				2983	* bfq_reparent_active_entities - move to the root group all active
				2984	* entities.
				2985	* @bfqd: the device data structure with the root group.
				2986	* @bfqg: the group to move from.
				2987	* @st: the service tree with the entities.
				2988	*
				2989	* Needs queue_lock to be taken and reference to be valid over the call.
				2990	*/
				2991	static void bfq_reparent_active_entities(struct bfq_data *bfqd,
				2992	struct bfq_group *bfqg,
				2993	struct bfq_service_tree *st)
				2994	{
				2995	struct rb_root *active = &st->active;
				2996	struct bfq_entity *entity = NULL;
				2997
				2998	if (!RB_EMPTY_ROOT(&st->active))
				2999	entity = bfq_entity_of(rb_first(active));
				3000
				3001	for (; entity ; entity = bfq_entity_of(rb_first(active)))
				3002	bfq_reparent_leaf_entity(bfqd, entity);
				3003
				3004	if (bfqg->sched_data.in_service_entity)
				3005	bfq_reparent_leaf_entity(bfqd,
				3006	bfqg->sched_data.in_service_entity);
				3007	}
				3008
				3009	/**
				3010	* bfq_pd_offline - deactivate the entity associated with @pd,
				3011	* and reparent its children entities.
				3012	* @pd: descriptor of the policy going offline.
				3013	*
				3014	* blkio already grabs the queue_lock for us, so no need to use
				3015	* RCU-based magic
				3016	*/
				3017	static void bfq_pd_offline(struct blkg_policy_data *pd)
				3018	{
				3019	struct bfq_service_tree *st;
				3020	struct bfq_group *bfqg = pd_to_bfqg(pd);
				3021	struct bfq_data *bfqd = bfqg->bfqd;
				3022	struct bfq_entity *entity = bfqg->my_entity;
				3023	unsigned long flags;
				3024	int i;
				3025
				3026	if (!entity) /* root group */
				3027	return;
				3028
				3029	spin_lock_irqsave(&bfqd->lock, flags);
				3030	/*
				3031	* Empty all service_trees belonging to this group before
				3032	* deactivating the group itself.
				3033	*/
				3034	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
				3035	st = bfqg->sched_data.service_tree + i;
				3036
				3037	/*
				3038	* The idle tree may still contain bfq_queues belonging
				3039	* to exited task because they never migrated to a different
				3040	* cgroup from the one being destroyed now. No one else
				3041	* can access them so it's safe to act without any lock.
				3042	*/
				3043	bfq_flush_idle_tree(st);
				3044
				3045	/*
				3046	* It may happen that some queues are still active
				3047	* (busy) upon group destruction (if the corresponding
				3048	* processes have been forced to terminate). We move
				3049	* all the leaf entities corresponding to these queues
				3050	* to the root_group.
				3051	* Also, it may happen that the group has an entity
				3052	* in service, which is disconnected from the active
				3053	* tree: it must be moved, too.
				3054	* There is no need to put the sync queues, as the
				3055	* scheduler has taken no reference.
				3056	*/
				3057	bfq_reparent_active_entities(bfqd, bfqg, st);
				3058	}
				3059
				3060	__bfq_deactivate_entity(entity, false);
				3061	bfq_put_async_queues(bfqd, bfqg);
				3062
				3063	spin_unlock_irqrestore(&bfqd->lock, flags);
				3064	/*
				3065	* @blkg is going offline and will be ignored by
				3066	* blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
				3067	* that they don't get lost. If IOs complete after this point, the
				3068	* stats for them will be lost. Oh well...
				3069	*/
				3070	bfqg_stats_xfer_dead(bfqg);
				3071	}
				3072
				3073	static int bfq_io_show_weight(struct seq_file sf, void v)
				3074	{
				3075	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				3076	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
				3077	unsigned int val = 0;
				3078
				3079	if (bfqgd)
				3080	val = bfqgd->weight;
				3081
				3082	seq_printf(sf, "%u\n", val);
				3083
				3084	return 0;
				3085	}
				3086
				3087	static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				3088	struct cftype *cftype,
				3089	u64 val)
				3090	{
				3091	struct blkcg *blkcg = css_to_blkcg(css);
				3092	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
				3093	struct blkcg_gq *blkg;
				3094	int ret = -ERANGE;
				3095
				3096	if (val < BFQ_MIN_WEIGHT \|\| val > BFQ_MAX_WEIGHT)
				3097	return ret;
				3098
				3099	ret = 0;
				3100	spin_lock_irq(&blkcg->lock);
				3101	bfqgd->weight = (unsigned short)val;
				3102	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				3103	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
				3104
				3105	if (!bfqg)
				3106	continue;
				3107	/*
				3108	* Setting the prio_changed flag of the entity
				3109	* to 1 with new_weight == weight would re-set
				3110	* the value of the weight to its ioprio mapping.
				3111	* Set the flag only if necessary.
				3112	*/
				3113	if ((unsigned short)val != bfqg->entity.new_weight) {
				3114	bfqg->entity.new_weight = (unsigned short)val;
				3115	/*
				3116	* Make sure that the above new value has been
				3117	* stored in bfqg->entity.new_weight before
				3118	* setting the prio_changed flag. In fact,
				3119	* this flag may be read asynchronously (in
				3120	* critical sections protected by a different
				3121	* lock than that held here), and finding this
				3122	* flag set may cause the execution of the code
				3123	* for updating parameters whose value may
				3124	* depend also on bfqg->entity.new_weight (in
				3125	* __bfq_entity_update_weight_prio).
				3126	* This barrier makes sure that the new value
				3127	* of bfqg->entity.new_weight is correctly
				3128	* seen in that code.
				3129	*/
				3130	smp_wmb();
				3131	bfqg->entity.prio_changed = 1;
				3132	}
				3133	}
				3134	spin_unlock_irq(&blkcg->lock);
				3135
				3136	return ret;
				3137	}
				3138
				3139	static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				3140	char *buf, size_t nbytes,
				3141	loff_t off)
				3142	{
				3143	u64 weight;
				3144	/* First unsigned long found in the file is used */
				3145	int ret = kstrtoull(strim(buf), 0, &weight);
				3146
				3147	if (ret)
				3148	return ret;
				3149
				3150	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
				3151	}
				3152
				3153	static int bfqg_print_stat(struct seq_file sf, void v)
				3154	{
				3155	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
				3156	&blkcg_policy_bfq, seq_cft(sf)->private, false);
				3157	return 0;
				3158	}
				3159
				3160	static int bfqg_print_rwstat(struct seq_file sf, void v)
				3161	{
				3162	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
				3163	&blkcg_policy_bfq, seq_cft(sf)->private, true);
				3164	return 0;
				3165	}
				3166
				3167	static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				3168	struct blkg_policy_data *pd, int off)
				3169	{
				3170	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
				3171	&blkcg_policy_bfq, off);
				3172	return __blkg_prfill_u64(sf, pd, sum);
				3173	}
				3174
				3175	static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
				3176	struct blkg_policy_data *pd, int off)
				3177	{
				3178	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
				3179	&blkcg_policy_bfq,
				3180	off);
				3181	return __blkg_prfill_rwstat(sf, pd, &sum);
				3182	}
				3183
				3184	static int bfqg_print_stat_recursive(struct seq_file sf, void v)
				3185	{
				3186	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3187	bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
				3188	seq_cft(sf)->private, false);
				3189	return 0;
				3190	}
				3191
				3192	static int bfqg_print_rwstat_recursive(struct seq_file sf, void v)
				3193	{
				3194	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3195	bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
				3196	seq_cft(sf)->private, true);
				3197	return 0;
				3198	}
				3199
				3200	static u64 bfqg_prfill_sectors(struct seq_file sf, struct blkg_policy_data pd,
				3201	int off)
				3202	{
				3203	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
				3204
				3205	return __blkg_prfill_u64(sf, pd, sum >> 9);
				3206	}
				3207
				3208	static int bfqg_print_stat_sectors(struct seq_file sf, void v)
				3209	{
				3210	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3211	bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
				3212	return 0;
				3213	}
				3214
				3215	static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
				3216	struct blkg_policy_data *pd, int off)
				3217	{
				3218	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
				3219	offsetof(struct blkcg_gq, stat_bytes));
				3220	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
				3221	atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
				3222
				3223	return __blkg_prfill_u64(sf, pd, sum >> 9);
				3224	}
				3225
				3226	static int bfqg_print_stat_sectors_recursive(struct seq_file sf, void v)
				3227	{
				3228	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3229	bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
				3230	false);
				3231	return 0;
				3232	}
				3233
				3234	static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				3235	struct blkg_policy_data *pd, int off)
				3236	{
				3237	struct bfq_group *bfqg = pd_to_bfqg(pd);
				3238	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
				3239	u64 v = 0;
				3240
				3241	if (samples) {
				3242	v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
				3243	v = div64_u64(v, samples);
				3244	}
				3245	__blkg_prfill_u64(sf, pd, v);
				3246	return 0;
				3247	}
				3248
				3249	/* print avg_queue_size */
				3250	static int bfqg_print_avg_queue_size(struct seq_file sf, void v)
				3251	{
				3252	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3253	bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
				3254	0, false);
				3255	return 0;
				3256	}
				3257
				3258	static struct bfq_group *
				3259	bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
				3260	{
				3261	int ret;
				3262
				3263	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
				3264	if (ret)
				3265	return NULL;
				3266
				3267	return blkg_to_bfqg(bfqd->queue->root_blkg);
				3268	}
				3269
				3270	static struct cftype bfq_blkcg_legacy_files[] = {
				3271	{
				3272	.name = "bfq.weight",
				3273	.flags = CFTYPE_NOT_ON_ROOT,
				3274	.seq_show = bfq_io_show_weight,
				3275	.write_u64 = bfq_io_set_weight_legacy,
				3276	},
				3277
				3278	/* statistics, covers only the tasks in the bfqg */
				3279	{
				3280	.name = "bfq.time",
				3281	.private = offsetof(struct bfq_group, stats.time),
				3282	.seq_show = bfqg_print_stat,
				3283	},
				3284	{
				3285	.name = "bfq.sectors",
				3286	.seq_show = bfqg_print_stat_sectors,
				3287	},
				3288	{
				3289	.name = "bfq.io_service_bytes",
				3290	.private = (unsigned long)&blkcg_policy_bfq,
				3291	.seq_show = blkg_print_stat_bytes,
				3292	},
				3293	{
				3294	.name = "bfq.io_serviced",
				3295	.private = (unsigned long)&blkcg_policy_bfq,
				3296	.seq_show = blkg_print_stat_ios,
				3297	},
				3298	{
				3299	.name = "bfq.io_service_time",
				3300	.private = offsetof(struct bfq_group, stats.service_time),
				3301	.seq_show = bfqg_print_rwstat,
				3302	},
				3303	{
				3304	.name = "bfq.io_wait_time",
				3305	.private = offsetof(struct bfq_group, stats.wait_time),
				3306	.seq_show = bfqg_print_rwstat,
				3307	},
				3308	{
				3309	.name = "bfq.io_merged",
				3310	.private = offsetof(struct bfq_group, stats.merged),
				3311	.seq_show = bfqg_print_rwstat,
				3312	},
				3313	{
				3314	.name = "bfq.io_queued",
				3315	.private = offsetof(struct bfq_group, stats.queued),
				3316	.seq_show = bfqg_print_rwstat,
				3317	},
				3318
				3319	/* the same statictics which cover the bfqg and its descendants */
				3320	{
				3321	.name = "bfq.time_recursive",
				3322	.private = offsetof(struct bfq_group, stats.time),
				3323	.seq_show = bfqg_print_stat_recursive,
				3324	},
				3325	{
				3326	.name = "bfq.sectors_recursive",
				3327	.seq_show = bfqg_print_stat_sectors_recursive,
				3328	},
				3329	{
				3330	.name = "bfq.io_service_bytes_recursive",
				3331	.private = (unsigned long)&blkcg_policy_bfq,
				3332	.seq_show = blkg_print_stat_bytes_recursive,
				3333	},
				3334	{
				3335	.name = "bfq.io_serviced_recursive",
				3336	.private = (unsigned long)&blkcg_policy_bfq,
				3337	.seq_show = blkg_print_stat_ios_recursive,
				3338	},
				3339	{
				3340	.name = "bfq.io_service_time_recursive",
				3341	.private = offsetof(struct bfq_group, stats.service_time),
				3342	.seq_show = bfqg_print_rwstat_recursive,
				3343	},
				3344	{
				3345	.name = "bfq.io_wait_time_recursive",
				3346	.private = offsetof(struct bfq_group, stats.wait_time),
				3347	.seq_show = bfqg_print_rwstat_recursive,
				3348	},
				3349	{
				3350	.name = "bfq.io_merged_recursive",
				3351	.private = offsetof(struct bfq_group, stats.merged),
				3352	.seq_show = bfqg_print_rwstat_recursive,
				3353	},
				3354	{
				3355	.name = "bfq.io_queued_recursive",
				3356	.private = offsetof(struct bfq_group, stats.queued),
				3357	.seq_show = bfqg_print_rwstat_recursive,
				3358	},
				3359	{
				3360	.name = "bfq.avg_queue_size",
				3361	.seq_show = bfqg_print_avg_queue_size,
				3362	},
				3363	{
				3364	.name = "bfq.group_wait_time",
				3365	.private = offsetof(struct bfq_group, stats.group_wait_time),
				3366	.seq_show = bfqg_print_stat,
				3367	},
				3368	{
				3369	.name = "bfq.idle_time",
				3370	.private = offsetof(struct bfq_group, stats.idle_time),
				3371	.seq_show = bfqg_print_stat,
				3372	},
				3373	{
				3374	.name = "bfq.empty_time",
				3375	.private = offsetof(struct bfq_group, stats.empty_time),
				3376	.seq_show = bfqg_print_stat,
				3377	},
				3378	{
				3379	.name = "bfq.dequeue",
				3380	.private = offsetof(struct bfq_group, stats.dequeue),
				3381	.seq_show = bfqg_print_stat,
				3382	},
				3383	{ } /* terminate */
				3384	};
				3385
				3386	static struct cftype bfq_blkg_files[] = {
				3387	{
				3388	.name = "bfq.weight",
				3389	.flags = CFTYPE_NOT_ON_ROOT,
				3390	.seq_show = bfq_io_show_weight,
				3391	.write = bfq_io_set_weight,
				3392	},
				3393	{} /* terminate */
				3394	};
				3395
				3396	#else /* CONFIG_BFQ_GROUP_IOSCHED */
				3397
				3398	static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				3399	struct bfq_queue *bfqq, unsigned int op) { }
				3400	static inline void
				3401	bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
				3402	static inline void
				3403	bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
				3404	static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
				3405	uint64_t start_time, uint64_t io_start_time,
				3406	unsigned int op) { }
				3407	static inline void
				3408	bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				3409	struct bfq_group *curr_bfqg) { }
				3410	static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
				3411	static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
				3412	static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
				3413	static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
				3414	static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
				3415	static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
				3416
				3417	static void bfq_bfqq_move(struct bfq_data bfqd, struct bfq_queue bfqq,
				3418	struct bfq_group *bfqg) {}
				3419
				3420	static void bfq_init_entity(struct bfq_entity *entity,
				3421	struct bfq_group *bfqg)
				3422	{
				3423	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				3424
				3425	entity->weight = entity->new_weight;
				3426	entity->orig_weight = entity->new_weight;
				3427	if (bfqq) {
				3428	bfqq->ioprio = bfqq->new_ioprio;
				3429	bfqq->ioprio_class = bfqq->new_ioprio_class;
				3430	}
				3431	entity->sched_data = &bfqg->sched_data;
				3432	}
				3433
				3434	static void bfq_bic_update_cgroup(struct bfq_io_cq bic, struct bio bio) {}
				3435
				3436	static struct bfq_group bfq_find_set_group(struct bfq_data bfqd,
				3437	struct blkcg *blkcg)
				3438	{
				3439	return bfqd->root_group;
				3440	}
				3441
				3442	static struct bfq_group bfqq_group(struct bfq_queue bfqq)
				3443	{
				3444	return bfqq->bfqd->root_group;
				3445	}
				3446
				3447	static struct bfq_group bfq_create_group_hierarchy(struct bfq_data bfqd,
				3448	int node)
				3449	{
				3450	struct bfq_group *bfqg;
				3451	int i;
				3452
				3453	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL \| __GFP_ZERO, node);
				3454	if (!bfqg)
				3455	return NULL;
				3456
				3457	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
				3458	bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
				3459
				3460	return bfqg;
				3461	}
				3462	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
				3463
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3464	#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
				3465	#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
				3466
				3467	#define bfq_sample_valid(samples) ((samples) > 80)
				3468
				3469	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3470	* Lifted from AS - choose which of rq1 and rq2 that is best served now.
				3471	* We choose the request that is closesr to the head right now. Distance
				3472	* behind the head is penalized and only allowed to a certain extent.
				3473	*/
				3474	static struct request bfq_choose_req(struct bfq_data bfqd,
				3475	struct request *rq1,
				3476	struct request *rq2,
				3477	sector_t last)
				3478	{
				3479	sector_t s1, s2, d1 = 0, d2 = 0;
				3480	unsigned long back_max;
				3481	#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
				3482	#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
				3483	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
				3484
				3485	if (!rq1 \|\| rq1 == rq2)
				3486	return rq2;
				3487	if (!rq2)
				3488	return rq1;
				3489
				3490	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
				3491	return rq1;
				3492	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
				3493	return rq2;
				3494	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
				3495	return rq1;
				3496	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
				3497	return rq2;
				3498
				3499	s1 = blk_rq_pos(rq1);
				3500	s2 = blk_rq_pos(rq2);
				3501
				3502	/*
				3503	* By definition, 1KiB is 2 sectors.
				3504	*/
				3505	back_max = bfqd->bfq_back_max * 2;
				3506
				3507	/*
				3508	* Strict one way elevator _except_ in the case where we allow
				3509	* short backward seeks which are biased as twice the cost of a
				3510	* similar forward seek.
				3511	*/
				3512	if (s1 >= last)
				3513	d1 = s1 - last;
				3514	else if (s1 + back_max >= last)
				3515	d1 = (last - s1) * bfqd->bfq_back_penalty;
				3516	else
				3517	wrap \|= BFQ_RQ1_WRAP;
				3518
				3519	if (s2 >= last)
				3520	d2 = s2 - last;
				3521	else if (s2 + back_max >= last)
				3522	d2 = (last - s2) * bfqd->bfq_back_penalty;
				3523	else
				3524	wrap \|= BFQ_RQ2_WRAP;
				3525
				3526	/* Found required data */
				3527
				3528	/*
				3529	* By doing switch() on the bit mask "wrap" we avoid having to
				3530	* check two variables for all permutations: --> faster!
				3531	*/
				3532	switch (wrap) {
				3533	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
				3534	if (d1 < d2)
				3535	return rq1;
				3536	else if (d2 < d1)
				3537	return rq2;
				3538
				3539	if (s1 >= s2)
				3540	return rq1;
				3541	else
				3542	return rq2;
				3543
				3544	case BFQ_RQ2_WRAP:
				3545	return rq1;
				3546	case BFQ_RQ1_WRAP:
				3547	return rq2;
				3548	case BFQ_RQ1_WRAP\|BFQ_RQ2_WRAP: /* both rqs wrapped */
				3549	default:
				3550	/*
				3551	* Since both rqs are wrapped,
				3552	* start with the one that's further behind head
				3553	* (--> only one back seek required),
				3554	* since back seek takes more time than forward.
				3555	*/
				3556	if (s1 <= s2)
				3557	return rq1;
				3558	else
				3559	return rq2;
				3560	}
				3561	}
				3562
				3563	/*
				3564	* Return expired entry, or NULL to just start from scratch in rbtree.
				3565	*/
				3566	static struct request bfq_check_fifo(struct bfq_queue bfqq,
				3567	struct request *last)
				3568	{
				3569	struct request *rq;
				3570
				3571	if (bfq_bfqq_fifo_expire(bfqq))
				3572	return NULL;
				3573
				3574	bfq_mark_bfqq_fifo_expire(bfqq);
				3575
				3576	rq = rq_entry_fifo(bfqq->fifo.next);
				3577
				3578	if (rq == last \|\| ktime_get_ns() < rq->fifo_time)
				3579	return NULL;
				3580
				3581	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
				3582	return rq;
				3583	}
				3584
				3585	static struct request bfq_find_next_rq(struct bfq_data bfqd,
				3586	struct bfq_queue *bfqq,
				3587	struct request *last)
				3588	{
				3589	struct rb_node *rbnext = rb_next(&last->rb_node);
				3590	struct rb_node *rbprev = rb_prev(&last->rb_node);
				3591	struct request next, prev = NULL;
				3592
				3593	/* Follow expired path, else get first next available. */
				3594	next = bfq_check_fifo(bfqq, last);
				3595	if (next)
				3596	return next;
				3597
				3598	if (rbprev)
				3599	prev = rb_entry_rq(rbprev);
				3600
				3601	if (rbnext)
				3602	next = rb_entry_rq(rbnext);
				3603	else {
				3604	rbnext = rb_first(&bfqq->sort_list);
				3605	if (rbnext && rbnext != &last->rb_node)
				3606	next = rb_entry_rq(rbnext);
				3607	}
				3608
				3609	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
				3610	}
				3611
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	3612	/* see the definition of bfq_async_charge_factor for details */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3613	static unsigned long bfq_serv_to_charge(struct request *rq,
				3614	struct bfq_queue *bfqq)
				3615	{
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	3616	if (bfq_bfqq_sync(bfqq))
				3617	return blk_rq_sectors(rq);
				3618
				3619	return blk_rq_sectors(rq) * bfq_async_charge_factor;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3620	}
				3621
				3622	/**
				3623	* bfq_updated_next_req - update the queue after a new next_rq selection.
				3624	* @bfqd: the device data the queue belongs to.
				3625	* @bfqq: the queue to update.
				3626	*
				3627	* If the first request of a queue changes we make sure that the queue
				3628	* has enough budget to serve at least its first request (if the
				3629	* request has grown). We do this because if the queue has not enough
				3630	* budget for its first request, it has to go through two dispatch
				3631	* rounds to actually get it dispatched.
				3632	*/
				3633	static void bfq_updated_next_req(struct bfq_data *bfqd,
				3634	struct bfq_queue *bfqq)
				3635	{
				3636	struct bfq_entity *entity = &bfqq->entity;
				3637	struct request *next_rq = bfqq->next_rq;
				3638	unsigned long new_budget;
				3639
				3640	if (!next_rq)
				3641	return;
				3642
				3643	if (bfqq == bfqd->in_service_queue)
				3644	/*
				3645	* In order not to break guarantees, budgets cannot be
				3646	* changed after an entity has been selected.
				3647	*/
				3648	return;
				3649
				3650	new_budget = max_t(unsigned long, bfqq->max_budget,
				3651	bfq_serv_to_charge(next_rq, bfqq));
				3652	if (entity->budget != new_budget) {
				3653	entity->budget = new_budget;
				3654	bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
				3655	new_budget);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3656	bfq_requeue_bfqq(bfqd, bfqq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3657	}
				3658	}
				3659
				3660	static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
				3661	{
				3662	struct bfq_entity *entity = &bfqq->entity;
				3663
				3664	return entity->budget - entity->service;
				3665	}
				3666
				3667	/*
				3668	* If enough samples have been computed, return the current max budget
				3669	* stored in bfqd, which is dynamically updated according to the
				3670	* estimated disk peak rate; otherwise return the default max budget
				3671	*/
				3672	static int bfq_max_budget(struct bfq_data *bfqd)
				3673	{
				3674	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
				3675	return bfq_default_max_budget;
				3676	else
				3677	return bfqd->bfq_max_budget;
				3678	}
				3679
				3680	/*
				3681	* Return min budget, which is a fraction of the current or default
				3682	* max budget (trying with 1/32)
				3683	*/
				3684	static int bfq_min_budget(struct bfq_data *bfqd)
				3685	{
				3686	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
				3687	return bfq_default_max_budget / 32;
				3688	else
				3689	return bfqd->bfq_max_budget / 32;
				3690	}
				3691
				3692	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				3693	struct bfq_queue *bfqq,
				3694	bool compensate,
				3695	enum bfqq_expiration reason);
				3696
				3697	/*
				3698	* The next function, invoked after the input queue bfqq switches from
				3699	* idle to busy, updates the budget of bfqq. The function also tells
				3700	* whether the in-service queue should be expired, by returning
				3701	* true. The purpose of expiring the in-service queue is to give bfqq
				3702	* the chance to possibly preempt the in-service queue, and the reason
				3703	* for preempting the in-service queue is to achieve the following
				3704	* goal: guarantee to bfqq its reserved bandwidth even if bfqq has
				3705	* expired because it has remained idle.
				3706	*
				3707	* In particular, bfqq may have expired for one of the following two
				3708	* reasons:
				3709	*
				3710	* - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
				3711	* and did not make it to issue a new request before its last
				3712	* request was served;
				3713	*
				3714	* - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
				3715	* a new request before the expiration of the idling-time.
				3716	*
				3717	* Even if bfqq has expired for one of the above reasons, the process
				3718	* associated with the queue may be however issuing requests greedily,
				3719	* and thus be sensitive to the bandwidth it receives (bfqq may have
				3720	* remained idle for other reasons: CPU high load, bfqq not enjoying
				3721	* idling, I/O throttling somewhere in the path from the process to
				3722	* the I/O scheduler, ...). But if, after every expiration for one of
				3723	* the above two reasons, bfqq has to wait for the service of at least
				3724	* one full budget of another queue before being served again, then
				3725	* bfqq is likely to get a much lower bandwidth or resource time than
				3726	* its reserved ones. To address this issue, two countermeasures need
				3727	* to be taken.
				3728	*
				3729	* First, the budget and the timestamps of bfqq need to be updated in
				3730	* a special way on bfqq reactivation: they need to be updated as if
				3731	* bfqq did not remain idle and did not expire. In fact, if they are
				3732	* computed as if bfqq expired and remained idle until reactivation,
				3733	* then the process associated with bfqq is treated as if, instead of
				3734	* being greedy, it stopped issuing requests when bfqq remained idle,
				3735	* and restarts issuing requests only on this reactivation. In other
				3736	* words, the scheduler does not help the process recover the "service
				3737	* hole" between bfqq expiration and reactivation. As a consequence,
				3738	* the process receives a lower bandwidth than its reserved one. In
				3739	* contrast, to recover this hole, the budget must be updated as if
				3740	* bfqq was not expired at all before this reactivation, i.e., it must
				3741	* be set to the value of the remaining budget when bfqq was
				3742	* expired. Along the same line, timestamps need to be assigned the
				3743	* value they had the last time bfqq was selected for service, i.e.,
				3744	* before last expiration. Thus timestamps need to be back-shifted
				3745	* with respect to their normal computation (see [1] for more details
				3746	* on this tricky aspect).
				3747	*
				3748	* Secondly, to allow the process to recover the hole, the in-service
				3749	* queue must be expired too, to give bfqq the chance to preempt it
				3750	* immediately. In fact, if bfqq has to wait for a full budget of the
				3751	* in-service queue to be completed, then it may become impossible to
				3752	* let the process recover the hole, even if the back-shifted
				3753	* timestamps of bfqq are lower than those of the in-service queue. If
				3754	* this happens for most or all of the holes, then the process may not
				3755	* receive its reserved bandwidth. In this respect, it is worth noting
				3756	* that, being the service of outstanding requests unpreemptible, a
				3757	* little fraction of the holes may however be unrecoverable, thereby
				3758	* causing a little loss of bandwidth.
				3759	*
				3760	* The last important point is detecting whether bfqq does need this
				3761	* bandwidth recovery. In this respect, the next function deems the
				3762	* process associated with bfqq greedy, and thus allows it to recover
				3763	* the hole, if: 1) the process is waiting for the arrival of a new
				3764	* request (which implies that bfqq expired for one of the above two
				3765	* reasons), and 2) such a request has arrived soon. The first
				3766	* condition is controlled through the flag non_blocking_wait_rq,
				3767	* while the second through the flag arrived_in_time. If both
				3768	* conditions hold, then the function computes the budget in the
				3769	* above-described special way, and signals that the in-service queue
				3770	* should be expired. Timestamp back-shifting is done later in
				3771	* __bfq_activate_entity.
				3772	*/
				3773	static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
				3774	struct bfq_queue *bfqq,
				3775	bool arrived_in_time)
				3776	{
				3777	struct bfq_entity *entity = &bfqq->entity;
				3778
				3779	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
				3780	/*
				3781	* We do not clear the flag non_blocking_wait_rq here, as
				3782	* the latter is used in bfq_activate_bfqq to signal
				3783	* that timestamps need to be back-shifted (and is
				3784	* cleared right after).
				3785	*/
				3786
				3787	/*
				3788	* In next assignment we rely on that either
				3789	* entity->service or entity->budget are not updated
				3790	* on expiration if bfqq is empty (see
				3791	* __bfq_bfqq_recalc_budget). Thus both quantities
				3792	* remain unchanged after such an expiration, and the
				3793	* following statement therefore assigns to
				3794	* entity->budget the remaining budget on such an
				3795	* expiration. For clarity, entity->service is not
				3796	* updated on expiration in any case, and, in normal
				3797	* operation, is reset only when bfqq is selected for
				3798	* service (see bfq_get_next_queue).
				3799	*/
				3800	entity->budget = min_t(unsigned long,
				3801	bfq_bfqq_budget_left(bfqq),
				3802	bfqq->max_budget);
				3803
				3804	return true;
				3805	}
				3806
				3807	entity->budget = max_t(unsigned long, bfqq->max_budget,
				3808	bfq_serv_to_charge(bfqq->next_rq, bfqq));
				3809	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
				3810	return false;
				3811	}
				3812
				3813	static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
				3814	struct bfq_queue *bfqq,
				3815	struct request *rq)
				3816	{
				3817	bool bfqq_wants_to_preempt,
				3818	/*
				3819	* See the comments on
				3820	* bfq_bfqq_update_budg_for_activation for
				3821	* details on the usage of the next variable.
				3822	*/
				3823	arrived_in_time = ktime_get_ns() <=
				3824	bfqq->ttime.last_end_request +
				3825	bfqd->bfq_slice_idle * 3;
				3826
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3827	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
				3828
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3829	/*
				3830	* Update budget and check whether bfqq may want to preempt
				3831	* the in-service queue.
				3832	*/
				3833	bfqq_wants_to_preempt =
				3834	bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
				3835	arrived_in_time);
				3836
				3837	if (!bfq_bfqq_IO_bound(bfqq)) {
				3838	if (arrived_in_time) {
				3839	bfqq->requests_within_timer++;
				3840	if (bfqq->requests_within_timer >=
				3841	bfqd->bfq_requests_within_timer)
				3842	bfq_mark_bfqq_IO_bound(bfqq);
				3843	} else
				3844	bfqq->requests_within_timer = 0;
				3845	}
				3846
				3847	bfq_add_bfqq_busy(bfqd, bfqq);
				3848
				3849	/*
				3850	* Expire in-service queue only if preemption may be needed
				3851	* for guarantees. In this respect, the function
				3852	* next_queue_may_preempt just checks a simple, necessary
				3853	* condition, and not a sufficient condition based on
				3854	* timestamps. In fact, for the latter condition to be
				3855	* evaluated, timestamps would need first to be updated, and
				3856	* this operation is quite costly (see the comments on the
				3857	* function bfq_bfqq_update_budg_for_activation).
				3858	*/
				3859	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
				3860	next_queue_may_preempt(bfqd))
				3861	bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				3862	false, BFQQE_PREEMPTED);
				3863	}
				3864
				3865	static void bfq_add_request(struct request *rq)
				3866	{
				3867	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				3868	struct bfq_data *bfqd = bfqq->bfqd;
				3869	struct request next_rq, prev;
				3870
				3871	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
				3872	bfqq->queued[rq_is_sync(rq)]++;
				3873	bfqd->queued++;
				3874
				3875	elv_rb_add(&bfqq->sort_list, rq);
				3876
				3877	/*
				3878	* Check if this request is a better next-serve candidate.
				3879	*/
				3880	prev = bfqq->next_rq;
				3881	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
				3882	bfqq->next_rq = next_rq;
				3883
				3884	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
				3885	bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, rq);
				3886	else if (prev != bfqq->next_rq)
				3887	bfq_updated_next_req(bfqd, bfqq);
				3888	}
				3889
				3890	static struct request bfq_find_rq_fmerge(struct bfq_data bfqd,
				3891	struct bio *bio,
				3892	struct request_queue *q)
				3893	{
				3894	struct bfq_queue *bfqq = bfqd->bio_bfqq;
				3895
				3896
				3897	if (bfqq)
				3898	return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
				3899
				3900	return NULL;
				3901	}
				3902
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	3903	static sector_t get_sdist(sector_t last_pos, struct request *rq)
				3904	{
				3905	if (last_pos)
				3906	return abs(blk_rq_pos(rq) - last_pos);
				3907
				3908	return 0;
				3909	}
				3910
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3911	#if 0 /* Still not clear if we can do without next two functions */
				3912	static void bfq_activate_request(struct request_queue q, struct request rq)
				3913	{
				3914	struct bfq_data *bfqd = q->elevator->elevator_data;
				3915
				3916	bfqd->rq_in_driver++;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3917	}
				3918
				3919	static void bfq_deactivate_request(struct request_queue q, struct request rq)
				3920	{
				3921	struct bfq_data *bfqd = q->elevator->elevator_data;
				3922
				3923	bfqd->rq_in_driver--;
				3924	}
				3925	#endif
				3926
				3927	static void bfq_remove_request(struct request_queue *q,
				3928	struct request *rq)
				3929	{
				3930	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				3931	struct bfq_data *bfqd = bfqq->bfqd;
				3932	const int sync = rq_is_sync(rq);
				3933
				3934	if (bfqq->next_rq == rq) {
				3935	bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
				3936	bfq_updated_next_req(bfqd, bfqq);
				3937	}
				3938
				3939	if (rq->queuelist.prev != &rq->queuelist)
				3940	list_del_init(&rq->queuelist);
				3941	bfqq->queued[sync]--;
				3942	bfqd->queued--;
				3943	elv_rb_del(&bfqq->sort_list, rq);
				3944
				3945	elv_rqhash_del(q, rq);
				3946	if (q->last_merge == rq)
				3947	q->last_merge = NULL;
				3948
				3949	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
				3950	bfqq->next_rq = NULL;
				3951
				3952	if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3953	bfq_del_bfqq_busy(bfqd, bfqq, false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3954	/*
				3955	* bfqq emptied. In normal operation, when
				3956	* bfqq is empty, bfqq->entity.service and
				3957	* bfqq->entity.budget must contain,
				3958	* respectively, the service received and the
				3959	* budget used last time bfqq emptied. These
				3960	* facts do not hold in this case, as at least
				3961	* this last removal occurred while bfqq is
				3962	* not in service. To avoid inconsistencies,
				3963	* reset both bfqq->entity.service and
				3964	* bfqq->entity.budget, if bfqq has still a
				3965	* process that may issue I/O requests to it.
				3966	*/
				3967	bfqq->entity.budget = bfqq->entity.service = 0;
				3968	}
				3969	}
				3970
				3971	if (rq->cmd_flags & REQ_META)
				3972	bfqq->meta_pending--;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3973
				3974	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3975	}
				3976
				3977	static bool bfq_bio_merge(struct blk_mq_hw_ctx hctx, struct bio bio)
				3978	{
				3979	struct request_queue *q = hctx->queue;
				3980	struct bfq_data *bfqd = q->elevator->elevator_data;
				3981	struct request *free = NULL;
				3982	/*
				3983	* bfq_bic_lookup grabs the queue_lock: invoke it now and
				3984	* store its return value for later use, to avoid nesting
				3985	* queue_lock inside the bfqd->lock. We assume that the bic
				3986	* returned by bfq_bic_lookup does not go away before
				3987	* bfqd->lock is taken.
				3988	*/
				3989	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
				3990	bool ret;
				3991
				3992	spin_lock_irq(&bfqd->lock);
				3993
				3994	if (bic)
				3995	bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
				3996	else
				3997	bfqd->bio_bfqq = NULL;
				3998	bfqd->bio_bic = bic;
				3999
				4000	ret = blk_mq_sched_try_merge(q, bio, &free);
				4001
				4002	if (free)
				4003	blk_mq_free_request(free);
				4004	spin_unlock_irq(&bfqd->lock);
				4005
				4006	return ret;
				4007	}
				4008
				4009	static int bfq_request_merge(struct request_queue q, struct request *req,
				4010	struct bio *bio)
				4011	{
				4012	struct bfq_data *bfqd = q->elevator->elevator_data;
				4013	struct request *__rq;
				4014
				4015	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
				4016	if (__rq && elv_bio_merge_ok(__rq, bio)) {
				4017	*req = __rq;
				4018	return ELEVATOR_FRONT_MERGE;
				4019	}
				4020
				4021	return ELEVATOR_NO_MERGE;
				4022	}
				4023
				4024	static void bfq_request_merged(struct request_queue q, struct request req,
				4025	enum elv_merge type)
				4026	{
				4027	if (type == ELEVATOR_FRONT_MERGE &&
				4028	rb_prev(&req->rb_node) &&
				4029	blk_rq_pos(req) <
				4030	blk_rq_pos(container_of(rb_prev(&req->rb_node),
				4031	struct request, rb_node))) {
				4032	struct bfq_queue *bfqq = RQ_BFQQ(req);
				4033	struct bfq_data *bfqd = bfqq->bfqd;
				4034	struct request prev, next_rq;
				4035
				4036	/* Reposition request in its sort_list */
				4037	elv_rb_del(&bfqq->sort_list, req);
				4038	elv_rb_add(&bfqq->sort_list, req);
				4039
				4040	/* Choose next request to be served for bfqq */
				4041	prev = bfqq->next_rq;
				4042	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
				4043	bfqd->last_position);
				4044	bfqq->next_rq = next_rq;
				4045	/*
				4046	* If next_rq changes, update the queue's budget to fit
				4047	* the new request.
				4048	*/
				4049	if (prev != bfqq->next_rq)
				4050	bfq_updated_next_req(bfqd, bfqq);
				4051	}
				4052	}
				4053
				4054	static void bfq_requests_merged(struct request_queue q, struct request rq,
				4055	struct request *next)
				4056	{
				4057	struct bfq_queue bfqq = RQ_BFQQ(rq), next_bfqq = RQ_BFQQ(next);
				4058
				4059	if (!RB_EMPTY_NODE(&rq->rb_node))
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4060	goto end;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4061	spin_lock_irq(&bfqq->bfqd->lock);
				4062
				4063	/*
				4064	* If next and rq belong to the same bfq_queue and next is older
				4065	* than rq, then reposition rq in the fifo (by substituting next
				4066	* with rq). Otherwise, if next and rq belong to different
				4067	* bfq_queues, never reposition rq: in fact, we would have to
				4068	* reposition it with respect to next's position in its own fifo,
				4069	* which would most certainly be too expensive with respect to
				4070	* the benefits.
				4071	*/
				4072	if (bfqq == next_bfqq &&
				4073	!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
				4074	next->fifo_time < rq->fifo_time) {
				4075	list_del_init(&rq->queuelist);
				4076	list_replace_init(&next->queuelist, &rq->queuelist);
				4077	rq->fifo_time = next->fifo_time;
				4078	}
				4079
				4080	if (bfqq->next_rq == next)
				4081	bfqq->next_rq = rq;
				4082
				4083	bfq_remove_request(q, next);
				4084
				4085	spin_unlock_irq(&bfqq->bfqd->lock);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4086	end:
				4087	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4088	}
				4089
				4090	static bool bfq_allow_bio_merge(struct request_queue q, struct request rq,
				4091	struct bio *bio)
				4092	{
				4093	struct bfq_data *bfqd = q->elevator->elevator_data;
				4094	bool is_sync = op_is_sync(bio->bi_opf);
				4095	struct bfq_queue *bfqq = bfqd->bio_bfqq;
				4096
				4097	/*
				4098	* Disallow merge of a sync bio into an async request.
				4099	*/
				4100	if (is_sync && !rq_is_sync(rq))
				4101	return false;
				4102
				4103	/*
				4104	* Lookup the bfqq that this bio will be queued with. Allow
				4105	* merge only if rq is queued there.
				4106	*/
				4107	if (!bfqq)
				4108	return false;
				4109
				4110	return bfqq == RQ_BFQQ(rq);
				4111	}
				4112
				4113	static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				4114	struct bfq_queue *bfqq)
				4115	{
				4116	if (bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4117	bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4118	bfq_mark_bfqq_budget_new(bfqq);
				4119	bfq_clear_bfqq_fifo_expire(bfqq);
				4120
				4121	bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
				4122
				4123	bfq_log_bfqq(bfqd, bfqq,
				4124	"set_in_service_queue, cur-budget = %d",
				4125	bfqq->entity.budget);
				4126	}
				4127
				4128	bfqd->in_service_queue = bfqq;
				4129	}
				4130
				4131	/*
				4132	* Get and set a new queue for service.
				4133	*/
				4134	static struct bfq_queue bfq_set_in_service_queue(struct bfq_data bfqd)
				4135	{
				4136	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
				4137
				4138	__bfq_set_in_service_queue(bfqd, bfqq);
				4139	return bfqq;
				4140	}
				4141
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4142	static void bfq_arm_slice_timer(struct bfq_data *bfqd)
				4143	{
				4144	struct bfq_queue *bfqq = bfqd->in_service_queue;
				4145	struct bfq_io_cq *bic;
				4146	u32 sl;
				4147
				4148	/* Processes have exited, don't wait. */
				4149	bic = bfqd->in_service_bic;
				4150	if (!bic \|\| atomic_read(&bic->icq.ioc->active_ref) == 0)
				4151	return;
				4152
				4153	bfq_mark_bfqq_wait_request(bfqq);
				4154
				4155	/*
				4156	* We don't want to idle for seeks, but we do want to allow
				4157	* fair distribution of slice time for a process doing back-to-back
				4158	* seeks. So allow a little bit of time for him to submit a new rq.
				4159	*/
				4160	sl = bfqd->bfq_slice_idle;
				4161	/*
				4162	* Grant only minimum idle time if the queue is seeky.
				4163	*/
				4164	if (BFQQ_SEEKY(bfqq))
				4165	sl = min_t(u64, sl, BFQ_MIN_TT);
				4166
				4167	bfqd->last_idling_start = ktime_get();
				4168	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
				4169	HRTIMER_MODE_REL);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4170	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4171	}
				4172
				4173	/*
				4174	* Set the maximum time for the in-service queue to consume its
				4175	* budget. This prevents seeky processes from lowering the disk
				4176	* throughput (always guaranteed with a time slice scheme as in CFQ).
				4177	*/
				4178	static void bfq_set_budget_timeout(struct bfq_data *bfqd)
				4179	{
				4180	struct bfq_queue *bfqq = bfqd->in_service_queue;
				4181	unsigned int timeout_coeff = bfqq->entity.weight /
				4182	bfqq->entity.orig_weight;
				4183
				4184	bfqd->last_budget_start = ktime_get();
				4185
				4186	bfq_clear_bfqq_budget_new(bfqq);
				4187	bfqq->budget_timeout = jiffies +
				4188	bfqd->bfq_timeout * timeout_coeff;
				4189
				4190	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
				4191	jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
				4192	}
				4193
				4194	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4195	* In autotuning mode, max_budget is dynamically recomputed as the
				4196	* amount of sectors transferred in timeout at the estimated peak
				4197	* rate. This enables BFQ to utilize a full timeslice with a full
				4198	* budget, even if the in-service queue is served at peak rate. And
				4199	* this maximises throughput with sequential workloads.
				4200	*/
				4201	static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
				4202	{
				4203	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
				4204	jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
				4205	}
				4206
				4207	static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				4208	struct request *rq)
				4209	{
				4210	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
				4211	bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
				4212	bfqd->peak_rate_samples = 1;
				4213	bfqd->sequential_samples = 0;
				4214	bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
				4215	blk_rq_sectors(rq);
				4216	} else /* no new rq dispatched, just reset the number of samples */
				4217	bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
				4218
				4219	bfq_log(bfqd,
				4220	"reset_rate_computation at end, sample %u/%u tot_sects %llu",
				4221	bfqd->peak_rate_samples, bfqd->sequential_samples,
				4222	bfqd->tot_sectors_dispatched);
				4223	}
				4224
				4225	static void bfq_update_rate_reset(struct bfq_data bfqd, struct request rq)
				4226	{
				4227	u32 rate, weight, divisor;
				4228
				4229	/*
				4230	* For the convergence property to hold (see comments on
				4231	* bfq_update_peak_rate()) and for the assessment to be
				4232	* reliable, a minimum number of samples must be present, and
				4233	* a minimum amount of time must have elapsed. If not so, do
				4234	* not compute new rate. Just reset parameters, to get ready
				4235	* for a new evaluation attempt.
				4236	*/
				4237	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES \|\|
				4238	bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
				4239	goto reset_computation;
				4240
				4241	/*
				4242	* If a new request completion has occurred after last
				4243	* dispatch, then, to approximate the rate at which requests
				4244	* have been served by the device, it is more precise to
				4245	* extend the observation interval to the last completion.
				4246	*/
				4247	bfqd->delta_from_first =
				4248	max_t(u64, bfqd->delta_from_first,
				4249	bfqd->last_completion - bfqd->first_dispatch);
				4250
				4251	/*
				4252	* Rate computed in sects/usec, and not sects/nsec, for
				4253	* precision issues.
				4254	*/
				4255	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
				4256	div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
				4257
				4258	/*
				4259	* Peak rate not updated if:
				4260	* - the percentage of sequential dispatches is below 3/4 of the
				4261	* total, and rate is below the current estimated peak rate
				4262	* - rate is unreasonably high (> 20M sectors/sec)
				4263	*/
				4264	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
				4265	rate <= bfqd->peak_rate) \|\|
				4266	rate > 20<<BFQ_RATE_SHIFT)
				4267	goto reset_computation;
				4268
				4269	/*
				4270	* We have to update the peak rate, at last! To this purpose,
				4271	* we use a low-pass filter. We compute the smoothing constant
				4272	* of the filter as a function of the 'weight' of the new
				4273	* measured rate.
				4274	*
				4275	* As can be seen in next formulas, we define this weight as a
				4276	* quantity proportional to how sequential the workload is,
				4277	* and to how long the observation time interval is.
				4278	*
				4279	* The weight runs from 0 to 8. The maximum value of the
				4280	* weight, 8, yields the minimum value for the smoothing
				4281	* constant. At this minimum value for the smoothing constant,
				4282	* the measured rate contributes for half of the next value of
				4283	* the estimated peak rate.
				4284	*
				4285	* So, the first step is to compute the weight as a function
				4286	* of how sequential the workload is. Note that the weight
				4287	* cannot reach 9, because bfqd->sequential_samples cannot
				4288	* become equal to bfqd->peak_rate_samples, which, in its
				4289	* turn, holds true because bfqd->sequential_samples is not
				4290	* incremented for the first sample.
				4291	*/
				4292	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
				4293
				4294	/*
				4295	* Second step: further refine the weight as a function of the
				4296	* duration of the observation interval.
				4297	*/
				4298	weight = min_t(u32, 8,
				4299	div_u64(weight * bfqd->delta_from_first,
				4300	BFQ_RATE_REF_INTERVAL));
				4301
				4302	/*
				4303	* Divisor ranging from 10, for minimum weight, to 2, for
				4304	* maximum weight.
				4305	*/
				4306	divisor = 10 - weight;
				4307
				4308	/*
				4309	* Finally, update peak rate:
				4310	*
				4311	* peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
				4312	*/
				4313	bfqd->peak_rate *= divisor-1;
				4314	bfqd->peak_rate /= divisor;
				4315	rate /= divisor; /* smoothing constant alpha = 1/divisor */
				4316
				4317	bfqd->peak_rate += rate;
				4318	if (bfqd->bfq_user_max_budget == 0)
				4319	bfqd->bfq_max_budget =
				4320	bfq_calc_max_budget(bfqd);
				4321
				4322	reset_computation:
				4323	bfq_reset_rate_computation(bfqd, rq);
				4324	}
				4325
				4326	/*
				4327	* Update the read/write peak rate (the main quantity used for
				4328	* auto-tuning, see update_thr_responsiveness_params()).
				4329	*
				4330	* It is not trivial to estimate the peak rate (correctly): because of
				4331	* the presence of sw and hw queues between the scheduler and the
				4332	* device components that finally serve I/O requests, it is hard to
				4333	* say exactly when a given dispatched request is served inside the
				4334	* device, and for how long. As a consequence, it is hard to know
				4335	* precisely at what rate a given set of requests is actually served
				4336	* by the device.
				4337	*
				4338	* On the opposite end, the dispatch time of any request is trivially
				4339	* available, and, from this piece of information, the "dispatch rate"
				4340	* of requests can be immediately computed. So, the idea in the next
				4341	* function is to use what is known, namely request dispatch times
				4342	* (plus, when useful, request completion times), to estimate what is
				4343	* unknown, namely in-device request service rate.
				4344	*
				4345	* The main issue is that, because of the above facts, the rate at
				4346	* which a certain set of requests is dispatched over a certain time
				4347	* interval can vary greatly with respect to the rate at which the
				4348	* same requests are then served. But, since the size of any
				4349	* intermediate queue is limited, and the service scheme is lossless
				4350	* (no request is silently dropped), the following obvious convergence
				4351	* property holds: the number of requests dispatched MUST become
				4352	* closer and closer to the number of requests completed as the
				4353	* observation interval grows. This is the key property used in
				4354	* the next function to estimate the peak service rate as a function
				4355	* of the observed dispatch rate. The function assumes to be invoked
				4356	* on every request dispatch.
				4357	*/
				4358	static void bfq_update_peak_rate(struct bfq_data bfqd, struct request rq)
				4359	{
				4360	u64 now_ns = ktime_get_ns();
				4361
				4362	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
				4363	bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
				4364	bfqd->peak_rate_samples);
				4365	bfq_reset_rate_computation(bfqd, rq);
				4366	goto update_last_values; /* will add one sample */
				4367	}
				4368
				4369	/*
				4370	* Device idle for very long: the observation interval lasting
				4371	* up to this dispatch cannot be a valid observation interval
				4372	* for computing a new peak rate (similarly to the late-
				4373	* completion event in bfq_completed_request()). Go to
				4374	* update_rate_and_reset to have the following three steps
				4375	* taken:
				4376	* - close the observation interval at the last (previous)
				4377	* request dispatch or completion
				4378	* - compute rate, if possible, for that observation interval
				4379	* - start a new observation interval with this dispatch
				4380	*/
				4381	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
				4382	bfqd->rq_in_driver == 0)
				4383	goto update_rate_and_reset;
				4384
				4385	/* Update sampling information */
				4386	bfqd->peak_rate_samples++;
				4387
				4388	if ((bfqd->rq_in_driver > 0 \|\|
				4389	now_ns - bfqd->last_completion < BFQ_MIN_TT)
				4390	&& get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
				4391	bfqd->sequential_samples++;
				4392
				4393	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
				4394
				4395	/* Reset max observed rq size every 32 dispatches */
				4396	if (likely(bfqd->peak_rate_samples % 32))
				4397	bfqd->last_rq_max_size =
				4398	max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
				4399	else
				4400	bfqd->last_rq_max_size = blk_rq_sectors(rq);
				4401
				4402	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
				4403
				4404	/* Target observation interval not yet reached, go on sampling */
				4405	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
				4406	goto update_last_values;
				4407
				4408	update_rate_and_reset:
				4409	bfq_update_rate_reset(bfqd, rq);
				4410	update_last_values:
				4411	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
				4412	bfqd->last_dispatch = now_ns;
				4413	}
				4414
				4415	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4416	* Remove request from internal lists.
				4417	*/
				4418	static void bfq_dispatch_remove(struct request_queue q, struct request rq)
				4419	{
				4420	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				4421
				4422	/*
				4423	* For consistency, the next instruction should have been
				4424	* executed after removing the request from the queue and
				4425	* dispatching it. We execute instead this instruction before
				4426	* bfq_remove_request() (and hence introduce a temporary
				4427	* inconsistency), for efficiency. In fact, should this
				4428	* dispatch occur for a non in-service bfqq, this anticipated
				4429	* increment prevents two counters related to bfqq->dispatched
				4430	* from risking to be, first, uselessly decremented, and then
				4431	* incremented again when the (new) value of bfqq->dispatched
				4432	* happens to be taken into account.
				4433	*/
				4434	bfqq->dispatched++;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4435	bfq_update_peak_rate(q->elevator->elevator_data, rq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4436
				4437	bfq_remove_request(q, rq);
				4438	}
				4439
				4440	static void __bfq_bfqq_expire(struct bfq_data bfqd, struct bfq_queue bfqq)
				4441	{
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4442	if (RB_EMPTY_ROOT(&bfqq->sort_list))
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4443	bfq_del_bfqq_busy(bfqd, bfqq, true);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4444	else
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4445	bfq_requeue_bfqq(bfqd, bfqq);
				4446
				4447	/*
				4448	* All in-service entities must have been properly deactivated
				4449	* or requeued before executing the next function, which
				4450	* resets all in-service entites as no more in service.
				4451	*/
				4452	__bfq_bfqd_reset_in_service(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4453	}
				4454
				4455	/**
				4456	* __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
				4457	* @bfqd: device data.
				4458	* @bfqq: queue to update.
				4459	* @reason: reason for expiration.
				4460	*
				4461	* Handle the feedback on @bfqq budget at queue expiration.
				4462	* See the body for detailed comments.
				4463	*/
				4464	static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				4465	struct bfq_queue *bfqq,
				4466	enum bfqq_expiration reason)
				4467	{
				4468	struct request *next_rq;
				4469	int budget, min_budget;
				4470
				4471	budget = bfqq->max_budget;
				4472	min_budget = bfq_min_budget(bfqd);
				4473
				4474	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
				4475	bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
				4476	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
				4477	budget, bfq_min_budget(bfqd));
				4478	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
				4479	bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
				4480
				4481	if (bfq_bfqq_sync(bfqq)) {
				4482	switch (reason) {
				4483	/*
				4484	* Caveat: in all the following cases we trade latency
				4485	* for throughput.
				4486	*/
				4487	case BFQQE_TOO_IDLE:
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4488	/*
				4489	* This is the only case where we may reduce
				4490	* the budget: if there is no request of the
				4491	* process still waiting for completion, then
				4492	* we assume (tentatively) that the timer has
				4493	* expired because the batch of requests of
				4494	* the process could have been served with a
				4495	* smaller budget. Hence, betting that
				4496	* process will behave in the same way when it
				4497	* becomes backlogged again, we reduce its
				4498	* next budget. As long as we guess right,
				4499	* this budget cut reduces the latency
				4500	* experienced by the process.
				4501	*
				4502	* However, if there are still outstanding
				4503	* requests, then the process may have not yet
				4504	* issued its next request just because it is
				4505	* still waiting for the completion of some of
				4506	* the still outstanding ones. So in this
				4507	* subcase we do not reduce its budget, on the
				4508	* contrary we increase it to possibly boost
				4509	* the throughput, as discussed in the
				4510	* comments to the BUDGET_TIMEOUT case.
				4511	*/
				4512	if (bfqq->dispatched > 0) /* still outstanding reqs */
				4513	budget = min(budget * 2, bfqd->bfq_max_budget);
				4514	else {
				4515	if (budget > 5 * min_budget)
				4516	budget -= 4 * min_budget;
				4517	else
				4518	budget = min_budget;
				4519	}
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4520	break;
				4521	case BFQQE_BUDGET_TIMEOUT:
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4522	/*
				4523	* We double the budget here because it gives
				4524	* the chance to boost the throughput if this
				4525	* is not a seeky process (and has bumped into
				4526	* this timeout because of, e.g., ZBR).
				4527	*/
				4528	budget = min(budget * 2, bfqd->bfq_max_budget);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4529	break;
				4530	case BFQQE_BUDGET_EXHAUSTED:
				4531	/*
				4532	* The process still has backlog, and did not
				4533	* let either the budget timeout or the disk
				4534	* idling timeout expire. Hence it is not
				4535	* seeky, has a short thinktime and may be
				4536	* happy with a higher budget too. So
				4537	* definitely increase the budget of this good
				4538	* candidate to boost the disk throughput.
				4539	*/
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4540	budget = min(budget * 4, bfqd->bfq_max_budget);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4541	break;
				4542	case BFQQE_NO_MORE_REQUESTS:
				4543	/*
				4544	* For queues that expire for this reason, it
				4545	* is particularly important to keep the
				4546	* budget close to the actual service they
				4547	* need. Doing so reduces the timestamp
				4548	* misalignment problem described in the
				4549	* comments in the body of
				4550	* __bfq_activate_entity. In fact, suppose
				4551	* that a queue systematically expires for
				4552	* BFQQE_NO_MORE_REQUESTS and presents a
				4553	* new request in time to enjoy timestamp
				4554	* back-shifting. The larger the budget of the
				4555	* queue is with respect to the service the
				4556	* queue actually requests in each service
				4557	* slot, the more times the queue can be
				4558	* reactivated with the same virtual finish
				4559	* time. It follows that, even if this finish
				4560	* time is pushed to the system virtual time
				4561	* to reduce the consequent timestamp
				4562	* misalignment, the queue unjustly enjoys for
				4563	* many re-activations a lower finish time
				4564	* than all newly activated queues.
				4565	*
				4566	* The service needed by bfqq is measured
				4567	* quite precisely by bfqq->entity.service.
				4568	* Since bfqq does not enjoy device idling,
				4569	* bfqq->entity.service is equal to the number
				4570	* of sectors that the process associated with
				4571	* bfqq requested to read/write before waiting
				4572	* for request completions, or blocking for
				4573	* other reasons.
				4574	*/
				4575	budget = max_t(int, bfqq->entity.service, min_budget);
				4576	break;
				4577	default:
				4578	return;
				4579	}
				4580	} else {
				4581	/*
				4582	* Async queues get always the maximum possible
				4583	* budget, as for them we do not care about latency
				4584	* (in addition, their ability to dispatch is limited
				4585	* by the charging factor).
				4586	*/
				4587	budget = bfqd->bfq_max_budget;
				4588	}
				4589
				4590	bfqq->max_budget = budget;
				4591
				4592	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
				4593	!bfqd->bfq_user_max_budget)
				4594	bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
				4595
				4596	/*
				4597	* If there is still backlog, then assign a new budget, making
				4598	* sure that it is large enough for the next request. Since
				4599	* the finish time of bfqq must be kept in sync with the
				4600	* budget, be sure to call __bfq_bfqq_expire() after this
				4601	* update.
				4602	*
				4603	* If there is no backlog, then no need to update the budget;
				4604	* it will be updated on the arrival of a new request.
				4605	*/
				4606	next_rq = bfqq->next_rq;
				4607	if (next_rq)
				4608	bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
				4609	bfq_serv_to_charge(next_rq, bfqq));
				4610
				4611	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
				4612	next_rq ? blk_rq_sectors(next_rq) : 0,
				4613	bfqq->entity.budget);
				4614	}
				4615
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4616	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4617	* Return true if the process associated with bfqq is "slow". The slow
				4618	* flag is used, in addition to the budget timeout, to reduce the
				4619	* amount of service provided to seeky processes, and thus reduce
				4620	* their chances to lower the throughput. More details in the comments
				4621	* on the function bfq_bfqq_expire().
				4622	*
				4623	* An important observation is in order: as discussed in the comments
				4624	* on the function bfq_update_peak_rate(), with devices with internal
				4625	* queues, it is hard if ever possible to know when and for how long
				4626	* an I/O request is processed by the device (apart from the trivial
				4627	* I/O pattern where a new request is dispatched only after the
				4628	* previous one has been completed). This makes it hard to evaluate
				4629	* the real rate at which the I/O requests of each bfq_queue are
				4630	* served. In fact, for an I/O scheduler like BFQ, serving a
				4631	* bfq_queue means just dispatching its requests during its service
				4632	* slot (i.e., until the budget of the queue is exhausted, or the
				4633	* queue remains idle, or, finally, a timeout fires). But, during the
				4634	* service slot of a bfq_queue, around 100 ms at most, the device may
				4635	* be even still processing requests of bfq_queues served in previous
				4636	* service slots. On the opposite end, the requests of the in-service
				4637	* bfq_queue may be completed after the service slot of the queue
				4638	* finishes.
				4639	*
				4640	* Anyway, unless more sophisticated solutions are used
				4641	* (where possible), the sum of the sizes of the requests dispatched
				4642	* during the service slot of a bfq_queue is probably the only
				4643	* approximation available for the service received by the bfq_queue
				4644	* during its service slot. And this sum is the quantity used in this
				4645	* function to evaluate the I/O speed of a process.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4646	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4647	static bool bfq_bfqq_is_slow(struct bfq_data bfqd, struct bfq_queue bfqq,
				4648	bool compensate, enum bfqq_expiration reason,
				4649	unsigned long *delta_ms)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4650	{
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4651	ktime_t delta_ktime;
				4652	u32 delta_usecs;
				4653	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4654
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4655	if (!bfq_bfqq_sync(bfqq))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4656	return false;
				4657
				4658	if (compensate)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4659	delta_ktime = bfqd->last_idling_start;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4660	else
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4661	delta_ktime = ktime_get();
				4662	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
				4663	delta_usecs = ktime_to_us(delta_ktime);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4664
				4665	/* don't use too short time intervals */
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4666	if (delta_usecs < 1000) {
				4667	if (blk_queue_nonrot(bfqd->queue))
				4668	/*
				4669	* give same worst-case guarantees as idling
				4670	* for seeky
				4671	*/
				4672	*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
				4673	else /* charge at least one seek */
				4674	*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4675
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4676	return slow;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4677	}
				4678
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4679	*delta_ms = delta_usecs / USEC_PER_MSEC;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4680
				4681	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4682	* Use only long (> 20ms) intervals to filter out excessive
				4683	* spikes in service rate estimation.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4684	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4685	if (delta_usecs > 20000) {
				4686	/*
				4687	* Caveat for rotational devices: processes doing I/O
				4688	* in the slower disk zones tend to be slow(er) even
				4689	* if not seeky. In this respect, the estimated peak
				4690	* rate is likely to be an average over the disk
				4691	* surface. Accordingly, to not be too harsh with
				4692	* unlucky processes, a process is deemed slow only if
				4693	* its rate has been lower than half of the estimated
				4694	* peak rate.
				4695	*/
				4696	slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
				4697	}
				4698
				4699	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
				4700
				4701	return slow;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4702	}
				4703
				4704	/*
				4705	* Return the farthest past time instant according to jiffies
				4706	* macros.
				4707	*/
				4708	static unsigned long bfq_smallest_from_now(void)
				4709	{
				4710	return jiffies - MAX_JIFFY_OFFSET;
				4711	}
				4712
				4713	/**
				4714	* bfq_bfqq_expire - expire a queue.
				4715	* @bfqd: device owning the queue.
				4716	* @bfqq: the queue to expire.
				4717	* @compensate: if true, compensate for the time spent idling.
				4718	* @reason: the reason causing the expiration.
				4719	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	4720	* If the process associated with bfqq does slow I/O (e.g., because it
				4721	* issues random requests), we charge bfqq with the time it has been
				4722	* in service instead of the service it has received (see
				4723	* bfq_bfqq_charge_time for details on how this goal is achieved). As
				4724	* a consequence, bfqq will typically get higher timestamps upon
				4725	* reactivation, and hence it will be rescheduled as if it had
				4726	* received more service than what it has actually received. In the
				4727	* end, bfqq receives less service in proportion to how slowly its
				4728	* associated process consumes its budgets (and hence how seriously it
				4729	* tends to lower the throughput). In addition, this time-charging
				4730	* strategy guarantees time fairness among slow processes. In
				4731	* contrast, if the process associated with bfqq is not slow, we
				4732	* charge bfqq exactly with the service it has received.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4733	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	4734	* Charging time to the first type of queues and the exact service to
				4735	* the other has the effect of using the WF2Q+ policy to schedule the
				4736	* former on a timeslice basis, without violating service domain
				4737	* guarantees among the latter.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4738	*/
				4739	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				4740	struct bfq_queue *bfqq,
				4741	bool compensate,
				4742	enum bfqq_expiration reason)
				4743	{
				4744	bool slow;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4745	unsigned long delta = 0;
				4746	struct bfq_entity *entity = &bfqq->entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4747	int ref;
				4748
				4749	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4750	* Check whether the process is slow (see bfq_bfqq_is_slow).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4751	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4752	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4753
				4754	/*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	4755	* As above explained, charge slow (typically seeky) and
				4756	* timed-out queues with the time and not the service
				4757	* received, to favor sequential workloads.
				4758	*
				4759	* Processes doing I/O in the slower disk zones will tend to
				4760	* be slow(er) even if not seeky. Therefore, since the
				4761	* estimated peak rate is actually an average over the disk
				4762	* surface, these processes may timeout just for bad luck. To
				4763	* avoid punishing them, do not charge time to processes that
				4764	* succeeded in consuming at least 2/3 of their budget. This
				4765	* allows BFQ to preserve enough elasticity to still perform
				4766	* bandwidth, and not time, distribution with little unlucky
				4767	* or quasi-sequential processes.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4768	*/
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame^]	4769	if (slow \|\|
				4770	(reason == BFQQE_BUDGET_TIMEOUT &&
				4771	bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))
				4772	bfq_bfqq_charge_time(bfqd, bfqq, delta);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4773
				4774	if (reason == BFQQE_TOO_IDLE &&
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4775	entity->service <= 2 * entity->budget / 10)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4776	bfq_clear_bfqq_IO_bound(bfqq);
				4777
				4778	bfq_log_bfqq(bfqd, bfqq,
				4779	"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
				4780	slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
				4781
				4782	/*
				4783	* Increase, decrease or leave budget unchanged according to
				4784	* reason.
				4785	*/
				4786	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
				4787	ref = bfqq->ref;
				4788	__bfq_bfqq_expire(bfqd, bfqq);
				4789
				4790	/* mark bfqq as waiting a request only if a bic still points to it */
				4791	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
				4792	reason != BFQQE_BUDGET_TIMEOUT &&
				4793	reason != BFQQE_BUDGET_EXHAUSTED)
				4794	bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
				4795	}
				4796
				4797	/*
				4798	* Budget timeout is not implemented through a dedicated timer, but
				4799	* just checked on request arrivals and completions, as well as on
				4800	* idle timer expirations.
				4801	*/
				4802	static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
				4803	{
				4804	if (bfq_bfqq_budget_new(bfqq) \|\|
				4805	time_is_after_jiffies(bfqq->budget_timeout))
				4806	return false;
				4807	return true;
				4808	}
				4809
				4810	/*
				4811	* If we expire a queue that is actively waiting (i.e., with the
				4812	* device idled) for the arrival of a new request, then we may incur
				4813	* the timestamp misalignment problem described in the body of the
				4814	* function __bfq_activate_entity. Hence we return true only if this
				4815	* condition does not hold, or if the queue is slow enough to deserve
				4816	* only to be kicked off for preserving a high throughput.
				4817	*/
				4818	static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
				4819	{
				4820	bfq_log_bfqq(bfqq->bfqd, bfqq,
				4821	"may_budget_timeout: wait_request %d left %d timeout %d",
				4822	bfq_bfqq_wait_request(bfqq),
				4823	bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
				4824	bfq_bfqq_budget_timeout(bfqq));
				4825
				4826	return (!bfq_bfqq_wait_request(bfqq) \|\|
				4827	bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
				4828	&&
				4829	bfq_bfqq_budget_timeout(bfqq);
				4830	}
				4831
				4832	/*
				4833	* For a queue that becomes empty, device idling is allowed only if
				4834	* this function returns true for the queue. And this function returns
				4835	* true only if idling is beneficial for throughput.
				4836	*/
				4837	static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
				4838	{
				4839	struct bfq_data *bfqd = bfqq->bfqd;
				4840	bool idling_boosts_thr;
				4841
				4842	if (bfqd->strict_guarantees)
				4843	return true;
				4844
				4845	/*
				4846	* The value of the next variable is computed considering that
				4847	* idling is usually beneficial for the throughput if:
				4848	* (a) the device is not NCQ-capable, or
				4849	* (b) regardless of the presence of NCQ, the request pattern
				4850	* for bfqq is I/O-bound (possible throughput losses
				4851	* caused by granting idling to seeky queues are mitigated
				4852	* by the fact that, in all scenarios where boosting
				4853	* throughput is the best thing to do, i.e., in all
				4854	* symmetric scenarios, only a minimal idle time is
				4855	* allowed to seeky queues).
				4856	*/
				4857	idling_boosts_thr = !bfqd->hw_tag \|\| bfq_bfqq_IO_bound(bfqq);
				4858
				4859	/*
				4860	* We have now the components we need to compute the return
				4861	* value of the function, which is true only if both the
				4862	* following conditions hold:
				4863	* 1) bfqq is sync, because idling make sense only for sync queues;
				4864	* 2) idling boosts the throughput.
				4865	*/
				4866	return bfq_bfqq_sync(bfqq) && idling_boosts_thr;
				4867	}
				4868
				4869	/*
				4870	* If the in-service queue is empty but the function bfq_bfqq_may_idle
				4871	* returns true, then:
				4872	* 1) the queue must remain in service and cannot be expired, and
				4873	* 2) the device must be idled to wait for the possible arrival of a new
				4874	* request for the queue.
				4875	* See the comments on the function bfq_bfqq_may_idle for the reasons
				4876	* why performing device idling is the best choice to boost the throughput
				4877	* and preserve service guarantees when bfq_bfqq_may_idle itself
				4878	* returns true.
				4879	*/
				4880	static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
				4881	{
				4882	struct bfq_data *bfqd = bfqq->bfqd;
				4883
				4884	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
				4885	bfq_bfqq_may_idle(bfqq);
				4886	}
				4887
				4888	/*
				4889	* Select a queue for service. If we have a current queue in service,
				4890	* check whether to continue servicing it, or retrieve and set a new one.
				4891	*/
				4892	static struct bfq_queue bfq_select_queue(struct bfq_data bfqd)
				4893	{
				4894	struct bfq_queue *bfqq;
				4895	struct request *next_rq;
				4896	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
				4897
				4898	bfqq = bfqd->in_service_queue;
				4899	if (!bfqq)
				4900	goto new_queue;
				4901
				4902	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
				4903
				4904	if (bfq_may_expire_for_budg_timeout(bfqq) &&
				4905	!bfq_bfqq_wait_request(bfqq) &&
				4906	!bfq_bfqq_must_idle(bfqq))
				4907	goto expire;
				4908
				4909	check_queue:
				4910	/*
				4911	* This loop is rarely executed more than once. Even when it
				4912	* happens, it is much more convenient to re-execute this loop
				4913	* than to return NULL and trigger a new dispatch to get a
				4914	* request served.
				4915	*/
				4916	next_rq = bfqq->next_rq;
				4917	/*
				4918	* If bfqq has requests queued and it has enough budget left to
				4919	* serve them, keep the queue, otherwise expire it.
				4920	*/
				4921	if (next_rq) {
				4922	if (bfq_serv_to_charge(next_rq, bfqq) >
				4923	bfq_bfqq_budget_left(bfqq)) {
				4924	/*
				4925	* Expire the queue for budget exhaustion,
				4926	* which makes sure that the next budget is
				4927	* enough to serve the next request, even if
				4928	* it comes from the fifo expired path.
				4929	*/
				4930	reason = BFQQE_BUDGET_EXHAUSTED;
				4931	goto expire;
				4932	} else {
				4933	/*
				4934	* The idle timer may be pending because we may
				4935	* not disable disk idling even when a new request
				4936	* arrives.
				4937	*/
				4938	if (bfq_bfqq_wait_request(bfqq)) {
				4939	/*
				4940	* If we get here: 1) at least a new request
				4941	* has arrived but we have not disabled the
				4942	* timer because the request was too small,
				4943	* 2) then the block layer has unplugged
				4944	* the device, causing the dispatch to be
				4945	* invoked.
				4946	*
				4947	* Since the device is unplugged, now the
				4948	* requests are probably large enough to
				4949	* provide a reasonable throughput.
				4950	* So we disable idling.
				4951	*/
				4952	bfq_clear_bfqq_wait_request(bfqq);
				4953	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4954	bfqg_stats_update_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4955	}
				4956	goto keep_queue;
				4957	}
				4958	}
				4959
				4960	/*
				4961	* No requests pending. However, if the in-service queue is idling
				4962	* for a new request, or has requests waiting for a completion and
				4963	* may idle after their completion, then keep it anyway.
				4964	*/
				4965	if (bfq_bfqq_wait_request(bfqq) \|\|
				4966	(bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
				4967	bfqq = NULL;
				4968	goto keep_queue;
				4969	}
				4970
				4971	reason = BFQQE_NO_MORE_REQUESTS;
				4972	expire:
				4973	bfq_bfqq_expire(bfqd, bfqq, false, reason);
				4974	new_queue:
				4975	bfqq = bfq_set_in_service_queue(bfqd);
				4976	if (bfqq) {
				4977	bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
				4978	goto check_queue;
				4979	}
				4980	keep_queue:
				4981	if (bfqq)
				4982	bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
				4983	else
				4984	bfq_log(bfqd, "select_queue: no queue returned");
				4985
				4986	return bfqq;
				4987	}
				4988
				4989	/*
				4990	* Dispatch next request from bfqq.
				4991	*/
				4992	static struct request bfq_dispatch_rq_from_bfqq(struct bfq_data bfqd,
				4993	struct bfq_queue *bfqq)
				4994	{
				4995	struct request *rq = bfqq->next_rq;
				4996	unsigned long service_to_charge;
				4997
				4998	service_to_charge = bfq_serv_to_charge(rq, bfqq);
				4999
				5000	bfq_bfqq_served(bfqq, service_to_charge);
				5001
				5002	bfq_dispatch_remove(bfqd->queue, rq);
				5003
				5004	if (!bfqd->in_service_bic) {
				5005	atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
				5006	bfqd->in_service_bic = RQ_BIC(rq);
				5007	}
				5008
				5009	/*
				5010	* Expire bfqq, pretending that its budget expired, if bfqq
				5011	* belongs to CLASS_IDLE and other queues are waiting for
				5012	* service.
				5013	*/
				5014	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
				5015	goto expire;
				5016
				5017	return rq;
				5018
				5019	expire:
				5020	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
				5021	return rq;
				5022	}
				5023
				5024	static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
				5025	{
				5026	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5027
				5028	/*
				5029	* Avoiding lock: a race on bfqd->busy_queues should cause at
				5030	* most a call to dispatch for nothing
				5031	*/
				5032	return !list_empty_careful(&bfqd->dispatch) \|\|
				5033	bfqd->busy_queues > 0;
				5034	}
				5035
				5036	static struct request __bfq_dispatch_request(struct blk_mq_hw_ctx hctx)
				5037	{
				5038	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5039	struct request *rq = NULL;
				5040	struct bfq_queue *bfqq = NULL;
				5041
				5042	if (!list_empty(&bfqd->dispatch)) {
				5043	rq = list_first_entry(&bfqd->dispatch, struct request,
				5044	queuelist);
				5045	list_del_init(&rq->queuelist);
				5046
				5047	bfqq = RQ_BFQQ(rq);
				5048
				5049	if (bfqq) {
				5050	/*
				5051	* Increment counters here, because this
				5052	* dispatch does not follow the standard
				5053	* dispatch flow (where counters are
				5054	* incremented)
				5055	*/
				5056	bfqq->dispatched++;
				5057
				5058	goto inc_in_driver_start_rq;
				5059	}
				5060
				5061	/*
				5062	* We exploit the put_rq_private hook to decrement
				5063	* rq_in_driver, but put_rq_private will not be
				5064	* invoked on this request. So, to avoid unbalance,
				5065	* just start this request, without incrementing
				5066	* rq_in_driver. As a negative consequence,
				5067	* rq_in_driver is deceptively lower than it should be
				5068	* while this request is in service. This may cause
				5069	* bfq_schedule_dispatch to be invoked uselessly.
				5070	*
				5071	* As for implementing an exact solution, the
				5072	* put_request hook, if defined, is probably invoked
				5073	* also on this request. So, by exploiting this hook,
				5074	* we could 1) increment rq_in_driver here, and 2)
				5075	* decrement it in put_request. Such a solution would
				5076	* let the value of the counter be always accurate,
				5077	* but it would entail using an extra interface
				5078	* function. This cost seems higher than the benefit,
				5079	* being the frequency of non-elevator-private
				5080	* requests very low.
				5081	*/
				5082	goto start_rq;
				5083	}
				5084
				5085	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
				5086
				5087	if (bfqd->busy_queues == 0)
				5088	goto exit;
				5089
				5090	/*
				5091	* Force device to serve one request at a time if
				5092	* strict_guarantees is true. Forcing this service scheme is
				5093	* currently the ONLY way to guarantee that the request
				5094	* service order enforced by the scheduler is respected by a
				5095	* queueing device. Otherwise the device is free even to make
				5096	* some unlucky request wait for as long as the device
				5097	* wishes.
				5098	*
				5099	* Of course, serving one request at at time may cause loss of
				5100	* throughput.
				5101	*/
				5102	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
				5103	goto exit;
				5104
				5105	bfqq = bfq_select_queue(bfqd);
				5106	if (!bfqq)
				5107	goto exit;
				5108
				5109	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
				5110
				5111	if (rq) {
				5112	inc_in_driver_start_rq:
				5113	bfqd->rq_in_driver++;
				5114	start_rq:
				5115	rq->rq_flags \|= RQF_STARTED;
				5116	}
				5117	exit:
				5118	return rq;
				5119	}
				5120
				5121	static struct request bfq_dispatch_request(struct blk_mq_hw_ctx hctx)
				5122	{
				5123	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5124	struct request *rq;
				5125
				5126	spin_lock_irq(&bfqd->lock);
				5127	rq = __bfq_dispatch_request(hctx);
				5128	spin_unlock_irq(&bfqd->lock);
				5129
				5130	return rq;
				5131	}
				5132
				5133	/*
				5134	* Task holds one reference to the queue, dropped when task exits. Each rq
				5135	* in-flight on this queue also holds a reference, dropped when rq is freed.
				5136	*
				5137	* Scheduler lock must be held here. Recall not to use bfqq after calling
				5138	* this function on it.
				5139	*/
				5140	static void bfq_put_queue(struct bfq_queue *bfqq)
				5141	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5142	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5143	struct bfq_group *bfqg = bfqq_group(bfqq);
				5144	#endif
				5145
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5146	if (bfqq->bfqd)
				5147	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
				5148	bfqq, bfqq->ref);
				5149
				5150	bfqq->ref--;
				5151	if (bfqq->ref)
				5152	return;
				5153
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5154	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
				5155
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5156	kmem_cache_free(bfq_pool, bfqq);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5157	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5158	bfqg_put(bfqg);
				5159	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5160	}
				5161
				5162	static void bfq_exit_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				5163	{
				5164	if (bfqq == bfqd->in_service_queue) {
				5165	__bfq_bfqq_expire(bfqd, bfqq);
				5166	bfq_schedule_dispatch(bfqd);
				5167	}
				5168
				5169	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
				5170
				5171	bfq_put_queue(bfqq); /* release process reference */
				5172	}
				5173
				5174	static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
				5175	{
				5176	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
				5177	struct bfq_data *bfqd;
				5178
				5179	if (bfqq)
				5180	bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
				5181
				5182	if (bfqq && bfqd) {
				5183	unsigned long flags;
				5184
				5185	spin_lock_irqsave(&bfqd->lock, flags);
				5186	bfq_exit_bfqq(bfqd, bfqq);
				5187	bic_set_bfqq(bic, NULL, is_sync);
				5188	spin_unlock_irq(&bfqd->lock);
				5189	}
				5190	}
				5191
				5192	static void bfq_exit_icq(struct io_cq *icq)
				5193	{
				5194	struct bfq_io_cq *bic = icq_to_bic(icq);
				5195
				5196	bfq_exit_icq_bfqq(bic, true);
				5197	bfq_exit_icq_bfqq(bic, false);
				5198	}
				5199
				5200	/*
				5201	* Update the entity prio values; note that the new values will not
				5202	* be used until the next (re)activation.
				5203	*/
				5204	static void
				5205	bfq_set_next_ioprio_data(struct bfq_queue bfqq, struct bfq_io_cq bic)
				5206	{
				5207	struct task_struct *tsk = current;
				5208	int ioprio_class;
				5209	struct bfq_data *bfqd = bfqq->bfqd;
				5210
				5211	if (!bfqd)
				5212	return;
				5213
				5214	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
				5215	switch (ioprio_class) {
				5216	default:
				5217	dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
				5218	"bfq: bad prio class %d\n", ioprio_class);
				5219	case IOPRIO_CLASS_NONE:
				5220	/*
				5221	* No prio set, inherit CPU scheduling settings.
				5222	*/
				5223	bfqq->new_ioprio = task_nice_ioprio(tsk);
				5224	bfqq->new_ioprio_class = task_nice_ioclass(tsk);
				5225	break;
				5226	case IOPRIO_CLASS_RT:
				5227	bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5228	bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
				5229	break;
				5230	case IOPRIO_CLASS_BE:
				5231	bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5232	bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
				5233	break;
				5234	case IOPRIO_CLASS_IDLE:
				5235	bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
				5236	bfqq->new_ioprio = 7;
				5237	bfq_clear_bfqq_idle_window(bfqq);
				5238	break;
				5239	}
				5240
				5241	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
				5242	pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
				5243	bfqq->new_ioprio);
				5244	bfqq->new_ioprio = IOPRIO_BE_NR;
				5245	}
				5246
				5247	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
				5248	bfqq->entity.prio_changed = 1;
				5249	}
				5250
				5251	static void bfq_check_ioprio_change(struct bfq_io_cq bic, struct bio bio)
				5252	{
				5253	struct bfq_data *bfqd = bic_to_bfqd(bic);
				5254	struct bfq_queue *bfqq;
				5255	int ioprio = bic->icq.ioc->ioprio;
				5256
				5257	/*
				5258	* This condition may trigger on a newly created bic, be sure to
				5259	* drop the lock before returning.
				5260	*/
				5261	if (unlikely(!bfqd) \|\| likely(bic->ioprio == ioprio))
				5262	return;
				5263
				5264	bic->ioprio = ioprio;
				5265
				5266	bfqq = bic_to_bfqq(bic, false);
				5267	if (bfqq) {
				5268	/* release process reference on this queue */
				5269	bfq_put_queue(bfqq);
				5270	bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
				5271	bic_set_bfqq(bic, bfqq, false);
				5272	}
				5273
				5274	bfqq = bic_to_bfqq(bic, true);
				5275	if (bfqq)
				5276	bfq_set_next_ioprio_data(bfqq, bic);
				5277	}
				5278
				5279	static void bfq_init_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq,
				5280	struct bfq_io_cq *bic, pid_t pid, int is_sync)
				5281	{
				5282	RB_CLEAR_NODE(&bfqq->entity.rb_node);
				5283	INIT_LIST_HEAD(&bfqq->fifo);
				5284
				5285	bfqq->ref = 0;
				5286	bfqq->bfqd = bfqd;
				5287
				5288	if (bic)
				5289	bfq_set_next_ioprio_data(bfqq, bic);
				5290
				5291	if (is_sync) {
				5292	if (!bfq_class_idle(bfqq))
				5293	bfq_mark_bfqq_idle_window(bfqq);
				5294	bfq_mark_bfqq_sync(bfqq);
				5295	} else
				5296	bfq_clear_bfqq_sync(bfqq);
				5297
				5298	/* set end request to minus infinity from now */
				5299	bfqq->ttime.last_end_request = ktime_get_ns() + 1;
				5300
				5301	bfq_mark_bfqq_IO_bound(bfqq);
				5302
				5303	bfqq->pid = pid;
				5304
				5305	/* Tentative initial value to trade off between thr and lat */
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	5306	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5307	bfqq->budget_timeout = bfq_smallest_from_now();
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5308
				5309	/* first request is almost certainly seeky */
				5310	bfqq->seek_history = 1;
				5311	}
				5312
				5313	static struct bfq_queue *bfq_async_queue_prio(struct bfq_data bfqd,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5314	struct bfq_group *bfqg,
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5315	int ioprio_class, int ioprio)
				5316	{
				5317	switch (ioprio_class) {
				5318	case IOPRIO_CLASS_RT:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5319	return &bfqg->async_bfqq[0][ioprio];
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5320	case IOPRIO_CLASS_NONE:
				5321	ioprio = IOPRIO_NORM;
				5322	/* fall through */
				5323	case IOPRIO_CLASS_BE:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5324	return &bfqg->async_bfqq[1][ioprio];
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5325	case IOPRIO_CLASS_IDLE:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5326	return &bfqg->async_idle_bfqq;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5327	default:
				5328	return NULL;
				5329	}
				5330	}
				5331
				5332	static struct bfq_queue bfq_get_queue(struct bfq_data bfqd,
				5333	struct bio *bio, bool is_sync,
				5334	struct bfq_io_cq *bic)
				5335	{
				5336	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5337	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
				5338	struct bfq_queue **async_bfqq = NULL;
				5339	struct bfq_queue *bfqq;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5340	struct bfq_group *bfqg;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5341
				5342	rcu_read_lock();
				5343
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5344	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
				5345	if (!bfqg) {
				5346	bfqq = &bfqd->oom_bfqq;
				5347	goto out;
				5348	}
				5349
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5350	if (!is_sync) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5351	async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5352	ioprio);
				5353	bfqq = *async_bfqq;
				5354	if (bfqq)
				5355	goto out;
				5356	}
				5357
				5358	bfqq = kmem_cache_alloc_node(bfq_pool,
				5359	GFP_NOWAIT \| __GFP_ZERO \| __GFP_NOWARN,
				5360	bfqd->queue->node);
				5361
				5362	if (bfqq) {
				5363	bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
				5364	is_sync);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5365	bfq_init_entity(&bfqq->entity, bfqg);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5366	bfq_log_bfqq(bfqd, bfqq, "allocated");
				5367	} else {
				5368	bfqq = &bfqd->oom_bfqq;
				5369	bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
				5370	goto out;
				5371	}
				5372
				5373	/*
				5374	* Pin the queue now that it's allocated, scheduler exit will
				5375	* prune it.
				5376	*/
				5377	if (async_bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5378	bfqq->ref++; /*
				5379	* Extra group reference, w.r.t. sync
				5380	* queue. This extra reference is removed
				5381	* only if bfqq->bfqg disappears, to
				5382	* guarantee that this queue is not freed
				5383	* until its group goes away.
				5384	*/
				5385	bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5386	bfqq, bfqq->ref);
				5387	*async_bfqq = bfqq;
				5388	}
				5389
				5390	out:
				5391	bfqq->ref++; /* get a process reference to this queue */
				5392	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
				5393	rcu_read_unlock();
				5394	return bfqq;
				5395	}
				5396
				5397	static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				5398	struct bfq_queue *bfqq)
				5399	{
				5400	struct bfq_ttime *ttime = &bfqq->ttime;
				5401	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
				5402
				5403	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
				5404
				5405	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
				5406	ttime->ttime_total = div_u64(7ttime->ttime_total + 256elapsed, 8);
				5407	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				5408	ttime->ttime_samples);
				5409	}
				5410
				5411	static void
				5412	bfq_update_io_seektime(struct bfq_data bfqd, struct bfq_queue bfqq,
				5413	struct request *rq)
				5414	{
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5415	bfqq->seek_history <<= 1;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5416	bfqq->seek_history \|=
				5417	get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5418	(!blk_queue_nonrot(bfqd->queue) \|\|
				5419	blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
				5420	}
				5421
				5422	/*
				5423	* Disable idle window if the process thinks too long or seeks so much that
				5424	* it doesn't matter.
				5425	*/
				5426	static void bfq_update_idle_window(struct bfq_data *bfqd,
				5427	struct bfq_queue *bfqq,
				5428	struct bfq_io_cq *bic)
				5429	{
				5430	int enable_idle;
				5431
				5432	/* Don't idle for async or idle io prio class. */
				5433	if (!bfq_bfqq_sync(bfqq) \|\| bfq_class_idle(bfqq))
				5434	return;
				5435
				5436	enable_idle = bfq_bfqq_idle_window(bfqq);
				5437
				5438	if (atomic_read(&bic->icq.ioc->active_ref) == 0 \|\|
				5439	bfqd->bfq_slice_idle == 0 \|\|
				5440	(bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
				5441	enable_idle = 0;
				5442	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
				5443	if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)
				5444	enable_idle = 0;
				5445	else
				5446	enable_idle = 1;
				5447	}
				5448	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
				5449	enable_idle);
				5450
				5451	if (enable_idle)
				5452	bfq_mark_bfqq_idle_window(bfqq);
				5453	else
				5454	bfq_clear_bfqq_idle_window(bfqq);
				5455	}
				5456
				5457	/*
				5458	* Called when a new fs request (rq) is added to bfqq. Check if there's
				5459	* something we should do about it.
				5460	*/
				5461	static void bfq_rq_enqueued(struct bfq_data bfqd, struct bfq_queue bfqq,
				5462	struct request *rq)
				5463	{
				5464	struct bfq_io_cq *bic = RQ_BIC(rq);
				5465
				5466	if (rq->cmd_flags & REQ_META)
				5467	bfqq->meta_pending++;
				5468
				5469	bfq_update_io_thinktime(bfqd, bfqq);
				5470	bfq_update_io_seektime(bfqd, bfqq, rq);
				5471	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 \|\|
				5472	!BFQQ_SEEKY(bfqq))
				5473	bfq_update_idle_window(bfqd, bfqq, bic);
				5474
				5475	bfq_log_bfqq(bfqd, bfqq,
				5476	"rq_enqueued: idle_window=%d (seeky %d)",
				5477	bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
				5478
				5479	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
				5480
				5481	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
				5482	bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				5483	blk_rq_sectors(rq) < 32;
				5484	bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
				5485
				5486	/*
				5487	* There is just this request queued: if the request
				5488	* is small and the queue is not to be expired, then
				5489	* just exit.
				5490	*
				5491	* In this way, if the device is being idled to wait
				5492	* for a new request from the in-service queue, we
				5493	* avoid unplugging the device and committing the
				5494	* device to serve just a small request. On the
				5495	* contrary, we wait for the block layer to decide
				5496	* when to unplug the device: hopefully, new requests
				5497	* will be merged to this one quickly, then the device
				5498	* will be unplugged and larger requests will be
				5499	* dispatched.
				5500	*/
				5501	if (small_req && !budget_timeout)
				5502	return;
				5503
				5504	/*
				5505	* A large enough request arrived, or the queue is to
				5506	* be expired: in both cases disk idling is to be
				5507	* stopped, so clear wait_request flag and reset
				5508	* timer.
				5509	*/
				5510	bfq_clear_bfqq_wait_request(bfqq);
				5511	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5512	bfqg_stats_update_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5513
				5514	/*
				5515	* The queue is not empty, because a new request just
				5516	* arrived. Hence we can safely expire the queue, in
				5517	* case of budget timeout, without risking that the
				5518	* timestamps of the queue are not updated correctly.
				5519	* See [1] for more details.
				5520	*/
				5521	if (budget_timeout)
				5522	bfq_bfqq_expire(bfqd, bfqq, false,
				5523	BFQQE_BUDGET_TIMEOUT);
				5524	}
				5525	}
				5526
				5527	static void __bfq_insert_request(struct bfq_data bfqd, struct request rq)
				5528	{
				5529	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				5530
				5531	bfq_add_request(rq);
				5532
				5533	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
				5534	list_add_tail(&rq->queuelist, &bfqq->fifo);
				5535
				5536	bfq_rq_enqueued(bfqd, bfqq, rq);
				5537	}
				5538
				5539	static void bfq_insert_request(struct blk_mq_hw_ctx hctx, struct request rq,
				5540	bool at_head)
				5541	{
				5542	struct request_queue *q = hctx->queue;
				5543	struct bfq_data *bfqd = q->elevator->elevator_data;
				5544
				5545	spin_lock_irq(&bfqd->lock);
				5546	if (blk_mq_sched_try_insert_merge(q, rq)) {
				5547	spin_unlock_irq(&bfqd->lock);
				5548	return;
				5549	}
				5550
				5551	spin_unlock_irq(&bfqd->lock);
				5552
				5553	blk_mq_sched_request_inserted(rq);
				5554
				5555	spin_lock_irq(&bfqd->lock);
				5556	if (at_head \|\| blk_rq_is_passthrough(rq)) {
				5557	if (at_head)
				5558	list_add(&rq->queuelist, &bfqd->dispatch);
				5559	else
				5560	list_add_tail(&rq->queuelist, &bfqd->dispatch);
				5561	} else {
				5562	__bfq_insert_request(bfqd, rq);
				5563
				5564	if (rq_mergeable(rq)) {
				5565	elv_rqhash_add(q, rq);
				5566	if (!q->last_merge)
				5567	q->last_merge = rq;
				5568	}
				5569	}
				5570
				5571	spin_unlock_irq(&bfqd->lock);
				5572	}
				5573
				5574	static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				5575	struct list_head *list, bool at_head)
				5576	{
				5577	while (!list_empty(list)) {
				5578	struct request *rq;
				5579
				5580	rq = list_first_entry(list, struct request, queuelist);
				5581	list_del_init(&rq->queuelist);
				5582	bfq_insert_request(hctx, rq, at_head);
				5583	}
				5584	}
				5585
				5586	static void bfq_update_hw_tag(struct bfq_data *bfqd)
				5587	{
				5588	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				5589	bfqd->rq_in_driver);
				5590
				5591	if (bfqd->hw_tag == 1)
				5592	return;
				5593
				5594	/*
				5595	* This sample is valid if the number of outstanding requests
				5596	* is large enough to allow a queueing behavior. Note that the
				5597	* sum is not exact, as it's not taking into account deactivated
				5598	* requests.
				5599	*/
				5600	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
				5601	return;
				5602
				5603	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
				5604	return;
				5605
				5606	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
				5607	bfqd->max_rq_in_driver = 0;
				5608	bfqd->hw_tag_samples = 0;
				5609	}
				5610
				5611	static void bfq_completed_request(struct bfq_queue bfqq, struct bfq_data bfqd)
				5612	{
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5613	u64 now_ns;
				5614	u32 delta_us;
				5615
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5616	bfq_update_hw_tag(bfqd);
				5617
				5618	bfqd->rq_in_driver--;
				5619	bfqq->dispatched--;
				5620
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5621	now_ns = ktime_get_ns();
				5622
				5623	bfqq->ttime.last_end_request = now_ns;
				5624
				5625	/*
				5626	* Using us instead of ns, to get a reasonable precision in
				5627	* computing rate in next check.
				5628	*/
				5629	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
				5630
				5631	/*
				5632	* If the request took rather long to complete, and, according
				5633	* to the maximum request size recorded, this completion latency
				5634	* implies that the request was certainly served at a very low
				5635	* rate (less than 1M sectors/sec), then the whole observation
				5636	* interval that lasts up to this time instant cannot be a
				5637	* valid time interval for computing a new peak rate. Invoke
				5638	* bfq_update_rate_reset to have the following three steps
				5639	* taken:
				5640	* - close the observation interval at the last (previous)
				5641	* request dispatch or completion
				5642	* - compute rate, if possible, for that observation interval
				5643	* - reset to zero samples, which will trigger a proper
				5644	* re-initialization of the observation interval on next
				5645	* dispatch
				5646	*/
				5647	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
				5648	(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
				5649	1UL<<(BFQ_RATE_SHIFT - 10))
				5650	bfq_update_rate_reset(bfqd, NULL);
				5651	bfqd->last_completion = now_ns;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5652
				5653	/*
				5654	* If this is the in-service queue, check if it needs to be expired,
				5655	* or if we want to idle in case it has no pending requests.
				5656	*/
				5657	if (bfqd->in_service_queue == bfqq) {
				5658	if (bfq_bfqq_budget_new(bfqq))
				5659	bfq_set_budget_timeout(bfqd);
				5660
				5661	if (bfq_bfqq_must_idle(bfqq)) {
				5662	bfq_arm_slice_timer(bfqd);
				5663	return;
				5664	} else if (bfq_may_expire_for_budg_timeout(bfqq))
				5665	bfq_bfqq_expire(bfqd, bfqq, false,
				5666	BFQQE_BUDGET_TIMEOUT);
				5667	else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
				5668	(bfqq->dispatched == 0 \|\|
				5669	!bfq_bfqq_may_idle(bfqq)))
				5670	bfq_bfqq_expire(bfqd, bfqq, false,
				5671	BFQQE_NO_MORE_REQUESTS);
				5672	}
				5673	}
				5674
				5675	static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
				5676	{
				5677	bfqq->allocated--;
				5678
				5679	bfq_put_queue(bfqq);
				5680	}
				5681
				5682	static void bfq_put_rq_private(struct request_queue q, struct request rq)
				5683	{
				5684	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				5685	struct bfq_data *bfqd = bfqq->bfqd;
				5686
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5687	if (rq->rq_flags & RQF_STARTED)
				5688	bfqg_stats_update_completion(bfqq_group(bfqq),
				5689	rq_start_time_ns(rq),
				5690	rq_io_start_time_ns(rq),
				5691	rq->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5692
				5693	if (likely(rq->rq_flags & RQF_STARTED)) {
				5694	unsigned long flags;
				5695
				5696	spin_lock_irqsave(&bfqd->lock, flags);
				5697
				5698	bfq_completed_request(bfqq, bfqd);
				5699	bfq_put_rq_priv_body(bfqq);
				5700
				5701	spin_unlock_irqrestore(&bfqd->lock, flags);
				5702	} else {
				5703	/*
				5704	* Request rq may be still/already in the scheduler,
				5705	* in which case we need to remove it. And we cannot
				5706	* defer such a check and removal, to avoid
				5707	* inconsistencies in the time interval from the end
				5708	* of this function to the start of the deferred work.
				5709	* This situation seems to occur only in process
				5710	* context, as a consequence of a merge. In the
				5711	* current version of the code, this implies that the
				5712	* lock is held.
				5713	*/
				5714
				5715	if (!RB_EMPTY_NODE(&rq->rb_node))
				5716	bfq_remove_request(q, rq);
				5717	bfq_put_rq_priv_body(bfqq);
				5718	}
				5719
				5720	rq->elv.priv[0] = NULL;
				5721	rq->elv.priv[1] = NULL;
				5722	}
				5723
				5724	/*
				5725	* Allocate bfq data structures associated with this request.
				5726	*/
				5727	static int bfq_get_rq_private(struct request_queue q, struct request rq,
				5728	struct bio *bio)
				5729	{
				5730	struct bfq_data *bfqd = q->elevator->elevator_data;
				5731	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
				5732	const int is_sync = rq_is_sync(rq);
				5733	struct bfq_queue *bfqq;
				5734
				5735	spin_lock_irq(&bfqd->lock);
				5736
				5737	bfq_check_ioprio_change(bic, bio);
				5738
				5739	if (!bic)
				5740	goto queue_fail;
				5741
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5742	bfq_bic_update_cgroup(bic, bio);
				5743
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5744	bfqq = bic_to_bfqq(bic, is_sync);
				5745	if (!bfqq \|\| bfqq == &bfqd->oom_bfqq) {
				5746	if (bfqq)
				5747	bfq_put_queue(bfqq);
				5748	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
				5749	bic_set_bfqq(bic, bfqq, is_sync);
				5750	}
				5751
				5752	bfqq->allocated++;
				5753	bfqq->ref++;
				5754	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
				5755	rq, bfqq, bfqq->ref);
				5756
				5757	rq->elv.priv[0] = bic;
				5758	rq->elv.priv[1] = bfqq;
				5759
				5760	spin_unlock_irq(&bfqd->lock);
				5761
				5762	return 0;
				5763
				5764	queue_fail:
				5765	spin_unlock_irq(&bfqd->lock);
				5766
				5767	return 1;
				5768	}
				5769
				5770	static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
				5771	{
				5772	struct bfq_data *bfqd = bfqq->bfqd;
				5773	enum bfqq_expiration reason;
				5774	unsigned long flags;
				5775
				5776	spin_lock_irqsave(&bfqd->lock, flags);
				5777	bfq_clear_bfqq_wait_request(bfqq);
				5778
				5779	if (bfqq != bfqd->in_service_queue) {
				5780	spin_unlock_irqrestore(&bfqd->lock, flags);
				5781	return;
				5782	}
				5783
				5784	if (bfq_bfqq_budget_timeout(bfqq))
				5785	/*
				5786	* Also here the queue can be safely expired
				5787	* for budget timeout without wasting
				5788	* guarantees
				5789	*/
				5790	reason = BFQQE_BUDGET_TIMEOUT;
				5791	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
				5792	/*
				5793	* The queue may not be empty upon timer expiration,
				5794	* because we may not disable the timer when the
				5795	* first request of the in-service queue arrives
				5796	* during disk idling.
				5797	*/
				5798	reason = BFQQE_TOO_IDLE;
				5799	else
				5800	goto schedule_dispatch;
				5801
				5802	bfq_bfqq_expire(bfqd, bfqq, true, reason);
				5803
				5804	schedule_dispatch:
				5805	spin_unlock_irqrestore(&bfqd->lock, flags);
				5806	bfq_schedule_dispatch(bfqd);
				5807	}
				5808
				5809	/*
				5810	* Handler of the expiration of the timer running if the in-service queue
				5811	* is idling inside its time slice.
				5812	*/
				5813	static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
				5814	{
				5815	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
				5816	idle_slice_timer);
				5817	struct bfq_queue *bfqq = bfqd->in_service_queue;
				5818
				5819	/*
				5820	* Theoretical race here: the in-service queue can be NULL or
				5821	* different from the queue that was idling if a new request
				5822	* arrives for the current queue and there is a full dispatch
				5823	* cycle that changes the in-service queue. This can hardly
				5824	* happen, but in the worst case we just expire a queue too
				5825	* early.
				5826	*/
				5827	if (bfqq)
				5828	bfq_idle_slice_timer_body(bfqq);
				5829
				5830	return HRTIMER_NORESTART;
				5831	}
				5832
				5833	static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				5834	struct bfq_queue **bfqq_ptr)
				5835	{
				5836	struct bfq_queue bfqq = bfqq_ptr;
				5837
				5838	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
				5839	if (bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5840	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
				5841
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5842	bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
				5843	bfqq, bfqq->ref);
				5844	bfq_put_queue(bfqq);
				5845	*bfqq_ptr = NULL;
				5846	}
				5847	}
				5848
				5849	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5850	* Release all the bfqg references to its async queues. If we are
				5851	* deallocating the group these queues may still contain requests, so
				5852	* we reparent them to the root cgroup (i.e., the only one that will
				5853	* exist for sure until all the requests on a device are gone).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5854	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5855	static void bfq_put_async_queues(struct bfq_data bfqd, struct bfq_group bfqg)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5856	{
				5857	int i, j;
				5858
				5859	for (i = 0; i < 2; i++)
				5860	for (j = 0; j < IOPRIO_BE_NR; j++)
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5861	__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5862
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5863	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5864	}
				5865
				5866	static void bfq_exit_queue(struct elevator_queue *e)
				5867	{
				5868	struct bfq_data *bfqd = e->elevator_data;
				5869	struct bfq_queue bfqq, n;
				5870
				5871	hrtimer_cancel(&bfqd->idle_slice_timer);
				5872
				5873	spin_lock_irq(&bfqd->lock);
				5874	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5875	bfq_deactivate_bfqq(bfqd, bfqq, false, false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5876	spin_unlock_irq(&bfqd->lock);
				5877
				5878	hrtimer_cancel(&bfqd->idle_slice_timer);
				5879
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5880	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5881	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
				5882	#else
				5883	spin_lock_irq(&bfqd->lock);
				5884	bfq_put_async_queues(bfqd, bfqd->root_group);
				5885	kfree(bfqd->root_group);
				5886	spin_unlock_irq(&bfqd->lock);
				5887	#endif
				5888
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5889	kfree(bfqd);
				5890	}
				5891
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5892	static void bfq_init_root_group(struct bfq_group *root_group,
				5893	struct bfq_data *bfqd)
				5894	{
				5895	int i;
				5896
				5897	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5898	root_group->entity.parent = NULL;
				5899	root_group->my_entity = NULL;
				5900	root_group->bfqd = bfqd;
				5901	#endif
				5902	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
				5903	root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
				5904	root_group->sched_data.bfq_class_idle_last_service = jiffies;
				5905	}
				5906
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5907	static int bfq_init_queue(struct request_queue q, struct elevator_type e)
				5908	{
				5909	struct bfq_data *bfqd;
				5910	struct elevator_queue *eq;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5911
				5912	eq = elevator_alloc(q, e);
				5913	if (!eq)
				5914	return -ENOMEM;
				5915
				5916	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
				5917	if (!bfqd) {
				5918	kobject_put(&eq->kobj);
				5919	return -ENOMEM;
				5920	}
				5921	eq->elevator_data = bfqd;
				5922
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5923	spin_lock_irq(q->queue_lock);
				5924	q->elevator = eq;
				5925	spin_unlock_irq(q->queue_lock);
				5926
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5927	/*
				5928	* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
				5929	* Grab a permanent reference to it, so that the normal code flow
				5930	* will not attempt to free it.
				5931	*/
				5932	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
				5933	bfqd->oom_bfqq.ref++;
				5934	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
				5935	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
				5936	bfqd->oom_bfqq.entity.new_weight =
				5937	bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
				5938	/*
				5939	* Trigger weight initialization, according to ioprio, at the
				5940	* oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
				5941	* class won't be changed any more.
				5942	*/
				5943	bfqd->oom_bfqq.entity.prio_changed = 1;
				5944
				5945	bfqd->queue = q;
				5946
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5947	INIT_LIST_HEAD(&bfqd->dispatch);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5948
				5949	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
				5950	HRTIMER_MODE_REL);
				5951	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
				5952
				5953	INIT_LIST_HEAD(&bfqd->active_list);
				5954	INIT_LIST_HEAD(&bfqd->idle_list);
				5955
				5956	bfqd->hw_tag = -1;
				5957
				5958	bfqd->bfq_max_budget = bfq_default_max_budget;
				5959
				5960	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
				5961	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
				5962	bfqd->bfq_back_max = bfq_back_max;
				5963	bfqd->bfq_back_penalty = bfq_back_penalty;
				5964	bfqd->bfq_slice_idle = bfq_slice_idle;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5965	bfqd->bfq_timeout = bfq_timeout;
				5966
				5967	bfqd->bfq_requests_within_timer = 120;
				5968
				5969	spin_lock_init(&bfqd->lock);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5970
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5971	/*
				5972	* The invocation of the next bfq_create_group_hierarchy
				5973	* function is the head of a chain of function calls
				5974	* (bfq_create_group_hierarchy->blkcg_activate_policy->
				5975	* blk_mq_freeze_queue) that may lead to the invocation of the
				5976	* has_work hook function. For this reason,
				5977	* bfq_create_group_hierarchy is invoked only after all
				5978	* scheduler data has been initialized, apart from the fields
				5979	* that can be initialized only after invoking
				5980	* bfq_create_group_hierarchy. This, in particular, enables
				5981	* has_work to correctly return false. Of course, to avoid
				5982	* other inconsistencies, the blk-mq stack must then refrain
				5983	* from invoking further scheduler hooks before this init
				5984	* function is finished.
				5985	*/
				5986	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
				5987	if (!bfqd->root_group)
				5988	goto out_free;
				5989	bfq_init_root_group(bfqd->root_group, bfqd);
				5990	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
				5991
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5992
				5993	return 0;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5994
				5995	out_free:
				5996	kfree(bfqd);
				5997	kobject_put(&eq->kobj);
				5998	return -ENOMEM;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5999	}
				6000
				6001	static void bfq_slab_kill(void)
				6002	{
				6003	kmem_cache_destroy(bfq_pool);
				6004	}
				6005
				6006	static int __init bfq_slab_setup(void)
				6007	{
				6008	bfq_pool = KMEM_CACHE(bfq_queue, 0);
				6009	if (!bfq_pool)
				6010	return -ENOMEM;
				6011	return 0;
				6012	}
				6013
				6014	static ssize_t bfq_var_show(unsigned int var, char *page)
				6015	{
				6016	return sprintf(page, "%u\n", var);
				6017	}
				6018
				6019	static ssize_t bfq_var_store(unsigned long var, const char page,
				6020	size_t count)
				6021	{
				6022	unsigned long new_val;
				6023	int ret = kstrtoul(page, 10, &new_val);
				6024
				6025	if (ret == 0)
				6026	*var = new_val;
				6027
				6028	return count;
				6029	}
				6030
				6031	#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
				6032	static ssize_t __FUNC(struct elevator_queue e, char page) \
				6033	{ \
				6034	struct bfq_data *bfqd = e->elevator_data; \
				6035	u64 __data = __VAR; \
				6036	if (__CONV == 1) \
				6037	__data = jiffies_to_msecs(__data); \
				6038	else if (__CONV == 2) \
				6039	__data = div_u64(__data, NSEC_PER_MSEC); \
				6040	return bfq_var_show(__data, (page)); \
				6041	}
				6042	SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
				6043	SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
				6044	SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
				6045	SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
				6046	SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
				6047	SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
				6048	SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
				6049	SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
				6050	#undef SHOW_FUNCTION
				6051
				6052	#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
				6053	static ssize_t __FUNC(struct elevator_queue e, char page) \
				6054	{ \
				6055	struct bfq_data *bfqd = e->elevator_data; \
				6056	u64 __data = __VAR; \
				6057	__data = div_u64(__data, NSEC_PER_USEC); \
				6058	return bfq_var_show(__data, (page)); \
				6059	}
				6060	USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
				6061	#undef USEC_SHOW_FUNCTION
				6062
				6063	#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
				6064	static ssize_t \
				6065	__FUNC(struct elevator_queue e, const char page, size_t count) \
				6066	{ \
				6067	struct bfq_data *bfqd = e->elevator_data; \
				6068	unsigned long uninitialized_var(__data); \
				6069	int ret = bfq_var_store(&__data, (page), count); \
				6070	if (__data < (MIN)) \
				6071	__data = (MIN); \
				6072	else if (__data > (MAX)) \
				6073	__data = (MAX); \
				6074	if (__CONV == 1) \
				6075	*(__PTR) = msecs_to_jiffies(__data); \
				6076	else if (__CONV == 2) \
				6077	(__PTR) = (u64)__data NSEC_PER_MSEC; \
				6078	else \
				6079	*(__PTR) = __data; \
				6080	return ret; \
				6081	}
				6082	STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
				6083	INT_MAX, 2);
				6084	STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
				6085	INT_MAX, 2);
				6086	STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
				6087	STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
				6088	INT_MAX, 0);
				6089	STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
				6090	#undef STORE_FUNCTION
				6091
				6092	#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
				6093	static ssize_t __FUNC(struct elevator_queue e, const char page, size_t count)\
				6094	{ \
				6095	struct bfq_data *bfqd = e->elevator_data; \
				6096	unsigned long uninitialized_var(__data); \
				6097	int ret = bfq_var_store(&__data, (page), count); \
				6098	if (__data < (MIN)) \
				6099	__data = (MIN); \
				6100	else if (__data > (MAX)) \
				6101	__data = (MAX); \
				6102	(__PTR) = (u64)__data NSEC_PER_USEC; \
				6103	return ret; \
				6104	}
				6105	USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
				6106	UINT_MAX);
				6107	#undef USEC_STORE_FUNCTION
				6108
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6109	static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				6110	const char *page, size_t count)
				6111	{
				6112	struct bfq_data *bfqd = e->elevator_data;
				6113	unsigned long uninitialized_var(__data);
				6114	int ret = bfq_var_store(&__data, (page), count);
				6115
				6116	if (__data == 0)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6117	bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6118	else {
				6119	if (__data > INT_MAX)
				6120	__data = INT_MAX;
				6121	bfqd->bfq_max_budget = __data;
				6122	}
				6123
				6124	bfqd->bfq_user_max_budget = __data;
				6125
				6126	return ret;
				6127	}
				6128
				6129	/*
				6130	* Leaving this name to preserve name compatibility with cfq
				6131	* parameters, but this timeout is used for both sync and async.
				6132	*/
				6133	static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				6134	const char *page, size_t count)
				6135	{
				6136	struct bfq_data *bfqd = e->elevator_data;
				6137	unsigned long uninitialized_var(__data);
				6138	int ret = bfq_var_store(&__data, (page), count);
				6139
				6140	if (__data < 1)
				6141	__data = 1;
				6142	else if (__data > INT_MAX)
				6143	__data = INT_MAX;
				6144
				6145	bfqd->bfq_timeout = msecs_to_jiffies(__data);
				6146	if (bfqd->bfq_user_max_budget == 0)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6147	bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6148
				6149	return ret;
				6150	}
				6151
				6152	static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
				6153	const char *page, size_t count)
				6154	{
				6155	struct bfq_data *bfqd = e->elevator_data;
				6156	unsigned long uninitialized_var(__data);
				6157	int ret = bfq_var_store(&__data, (page), count);
				6158
				6159	if (__data > 1)
				6160	__data = 1;
				6161	if (!bfqd->strict_guarantees && __data == 1
				6162	&& bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
				6163	bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
				6164
				6165	bfqd->strict_guarantees = __data;
				6166
				6167	return ret;
				6168	}
				6169
				6170	#define BFQ_ATTR(name) \
				6171	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
				6172
				6173	static struct elv_fs_entry bfq_attrs[] = {
				6174	BFQ_ATTR(fifo_expire_sync),
				6175	BFQ_ATTR(fifo_expire_async),
				6176	BFQ_ATTR(back_seek_max),
				6177	BFQ_ATTR(back_seek_penalty),
				6178	BFQ_ATTR(slice_idle),
				6179	BFQ_ATTR(slice_idle_us),
				6180	BFQ_ATTR(max_budget),
				6181	BFQ_ATTR(timeout_sync),
				6182	BFQ_ATTR(strict_guarantees),
				6183	__ATTR_NULL
				6184	};
				6185
				6186	static struct elevator_type iosched_bfq_mq = {
				6187	.ops.mq = {
				6188	.get_rq_priv = bfq_get_rq_private,
				6189	.put_rq_priv = bfq_put_rq_private,
				6190	.exit_icq = bfq_exit_icq,
				6191	.insert_requests = bfq_insert_requests,
				6192	.dispatch_request = bfq_dispatch_request,
				6193	.next_request = elv_rb_latter_request,
				6194	.former_request = elv_rb_former_request,
				6195	.allow_merge = bfq_allow_bio_merge,
				6196	.bio_merge = bfq_bio_merge,
				6197	.request_merge = bfq_request_merge,
				6198	.requests_merged = bfq_requests_merged,
				6199	.request_merged = bfq_request_merged,
				6200	.has_work = bfq_has_work,
				6201	.init_sched = bfq_init_queue,
				6202	.exit_sched = bfq_exit_queue,
				6203	},
				6204
				6205	.uses_mq = true,
				6206	.icq_size = sizeof(struct bfq_io_cq),
				6207	.icq_align = __alignof__(struct bfq_io_cq),
				6208	.elevator_attrs = bfq_attrs,
				6209	.elevator_name = "bfq",
				6210	.elevator_owner = THIS_MODULE,
				6211	};
				6212
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6213	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6214	static struct blkcg_policy blkcg_policy_bfq = {
				6215	.dfl_cftypes = bfq_blkg_files,
				6216	.legacy_cftypes = bfq_blkcg_legacy_files,
				6217
				6218	.cpd_alloc_fn = bfq_cpd_alloc,
				6219	.cpd_init_fn = bfq_cpd_init,
				6220	.cpd_bind_fn = bfq_cpd_init,
				6221	.cpd_free_fn = bfq_cpd_free,
				6222
				6223	.pd_alloc_fn = bfq_pd_alloc,
				6224	.pd_init_fn = bfq_pd_init,
				6225	.pd_offline_fn = bfq_pd_offline,
				6226	.pd_free_fn = bfq_pd_free,
				6227	.pd_reset_stats_fn = bfq_pd_reset_stats,
				6228	};
				6229	#endif
				6230
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6231	static int __init bfq_init(void)
				6232	{
				6233	int ret;
				6234
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6235	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6236	ret = blkcg_policy_register(&blkcg_policy_bfq);
				6237	if (ret)
				6238	return ret;
				6239	#endif
				6240
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6241	ret = -ENOMEM;
				6242	if (bfq_slab_setup())
				6243	goto err_pol_unreg;
				6244
				6245	ret = elv_register(&iosched_bfq_mq);
				6246	if (ret)
				6247	goto err_pol_unreg;
				6248
				6249	return 0;
				6250
				6251	err_pol_unreg:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6252	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6253	blkcg_policy_unregister(&blkcg_policy_bfq);
				6254	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6255	return ret;
				6256	}
				6257
				6258	static void __exit bfq_exit(void)
				6259	{
				6260	elv_unregister(&iosched_bfq_mq);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6261	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6262	blkcg_policy_unregister(&blkcg_policy_bfq);
				6263	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6264	bfq_slab_kill();
				6265	}
				6266
				6267	module_init(bfq_init);
				6268	module_exit(bfq_exit);
				6269
				6270	MODULE_AUTHOR("Paolo Valente");
				6271	MODULE_LICENSE("GPL");
				6272	MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");