Blame - block/bfq-iosched.c - SHIFTPHONES/kernel/common

blob: 1a32c8341ab043636f87a15f3f3e41574ac10cf1 [file] [log] [blame]

Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1	/*
				2	* Budget Fair Queueing (BFQ) I/O scheduler.
				3	*
				4	* Based on ideas and code from CFQ:
				5	* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
				6	*
				7	* Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
				8	* Paolo Valente <paolo.valente@unimore.it>
				9	*
				10	* Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
				11	* Arianna Avanzini <avanzini@google.com>
				12	*
				13	* Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
				14	*
				15	* This program is free software; you can redistribute it and/or
				16	* modify it under the terms of the GNU General Public License as
				17	* published by the Free Software Foundation; either version 2 of the
				18	* License, or (at your option) any later version.
				19	*
				20	* This program is distributed in the hope that it will be useful,
				21	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				22	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				23	* General Public License for more details.
				24	*
				25	* BFQ is a proportional-share I/O scheduler, with some extra
				26	* low-latency capabilities. BFQ also supports full hierarchical
				27	* scheduling through cgroups. Next paragraphs provide an introduction
				28	* on BFQ inner workings. Details on BFQ benefits, usage and
				29	* limitations can be found in Documentation/block/bfq-iosched.txt.
				30	*
				31	* BFQ is a proportional-share storage-I/O scheduling algorithm based
				32	* on the slice-by-slice service scheme of CFQ. But BFQ assigns
				33	* budgets, measured in number of sectors, to processes instead of
				34	* time slices. The device is not granted to the in-service process
				35	* for a given time slice, but until it has exhausted its assigned
				36	* budget. This change from the time to the service domain enables BFQ
				37	* to distribute the device throughput among processes as desired,
				38	* without any distortion due to throughput fluctuations, or to device
				39	* internal queueing. BFQ uses an ad hoc internal scheduler, called
				40	* B-WF2Q+, to schedule processes according to their budgets. More
				41	* precisely, BFQ schedules queues associated with processes. Each
				42	* process/queue is assigned a user-configurable weight, and B-WF2Q+
				43	* guarantees that each queue receives a fraction of the throughput
				44	* proportional to its weight. Thanks to the accurate policy of
				45	* B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
				46	* processes issuing sequential requests (to boost the throughput),
				47	* and yet guarantee a low latency to interactive and soft real-time
				48	* applications.
				49	*
				50	* In particular, to provide these low-latency guarantees, BFQ
				51	* explicitly privileges the I/O of two classes of time-sensitive
				52	* applications: interactive and soft real-time. This feature enables
				53	* BFQ to provide applications in these classes with a very low
				54	* latency. Finally, BFQ also features additional heuristics for
				55	* preserving both a low latency and a high throughput on NCQ-capable,
				56	* rotational or flash-based devices, and to get the job done quickly
				57	* for applications consisting in many I/O-bound processes.
				58	*
				59	* BFQ is described in [1], where also a reference to the initial, more
				60	* theoretical paper on BFQ can be found. The interested reader can find
				61	* in the latter paper full details on the main algorithm, as well as
				62	* formulas of the guarantees and formal proofs of all the properties.
				63	* With respect to the version of BFQ presented in these papers, this
				64	* implementation adds a few more heuristics, such as the one that
				65	* guarantees a low latency to soft real-time applications, and a
				66	* hierarchical extension based on H-WF2Q+.
				67	*
				68	* B-WF2Q+ is based on WF2Q+, which is described in [2], together with
				69	* H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
				70	* with O(log N) complexity derives from the one introduced with EEVDF
				71	* in [3].
				72	*
				73	* [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
				74	* Scheduler", Proceedings of the First Workshop on Mobile System
				75	* Technologies (MST-2015), May 2015.
				76	* http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
				77	*
				78	* [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
				79	* Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
				80	* Oct 1997.
				81	*
				82	* http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
				83	*
				84	* [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
				85	* First: A Flexible and Accurate Mechanism for Proportional Share
				86	* Resource Allocation", technical report.
				87	*
				88	* http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
				89	*/
				90	#include <linux/module.h>
				91	#include <linux/slab.h>
				92	#include <linux/blkdev.h>
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	93	#include <linux/cgroup.h>
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	94	#include <linux/elevator.h>
				95	#include <linux/ktime.h>
				96	#include <linux/rbtree.h>
				97	#include <linux/ioprio.h>
				98	#include <linux/sbitmap.h>
				99	#include <linux/delay.h>
				100
				101	#include "blk.h"
				102	#include "blk-mq.h"
				103	#include "blk-mq-tag.h"
				104	#include "blk-mq-sched.h"
				105	#include <linux/blktrace_api.h>
				106	#include <linux/hrtimer.h>
				107	#include <linux/blk-cgroup.h>
				108
				109	#define BFQ_IOPRIO_CLASSES 3
				110	#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
				111
				112	#define BFQ_MIN_WEIGHT 1
				113	#define BFQ_MAX_WEIGHT 1000
				114	#define BFQ_WEIGHT_CONVERSION_COEFF 10
				115
				116	#define BFQ_DEFAULT_QUEUE_IOPRIO 4
				117
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	118	#define BFQ_WEIGHT_LEGACY_DFL 100
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	119	#define BFQ_DEFAULT_GRP_IOPRIO 0
				120	#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
				121
				122	struct bfq_entity;
				123
				124	/**
				125	* struct bfq_service_tree - per ioprio_class service tree.
				126	*
				127	* Each service tree represents a B-WF2Q+ scheduler on its own. Each
				128	* ioprio_class has its own independent scheduler, and so its own
				129	* bfq_service_tree. All the fields are protected by the queue lock
				130	* of the containing bfqd.
				131	*/
				132	struct bfq_service_tree {
				133	/* tree for active entities (i.e., those backlogged) */
				134	struct rb_root active;
				135	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
				136	struct rb_root idle;
				137
				138	/* idle entity with minimum F_i */
				139	struct bfq_entity *first_idle;
				140	/* idle entity with maximum F_i */
				141	struct bfq_entity *last_idle;
				142
				143	/* scheduler virtual time */
				144	u64 vtime;
				145	/* scheduler weight sum; active and idle entities contribute to it */
				146	unsigned long wsum;
				147	};
				148
				149	/**
				150	* struct bfq_sched_data - multi-class scheduler.
				151	*
				152	* bfq_sched_data is the basic scheduler queue. It supports three
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	153	* ioprio_classes, and can be used either as a toplevel queue or as an
				154	* intermediate queue on a hierarchical setup. @next_in_service
				155	* points to the active entity of the sched_data service trees that
				156	* will be scheduled next. It is used to reduce the number of steps
				157	* needed for each hierarchical-schedule update.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	158	*
				159	* The supported ioprio_classes are the same as in CFQ, in descending
				160	* priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
				161	* Requests from higher priority queues are served before all the
				162	* requests from lower priority queues; among requests of the same
				163	* queue requests are served according to B-WF2Q+.
				164	* All the fields are protected by the queue lock of the containing bfqd.
				165	*/
				166	struct bfq_sched_data {
				167	/* entity in service */
				168	struct bfq_entity *in_service_entity;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	169	/* head-of-line entity (see comments above) */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	170	struct bfq_entity *next_in_service;
				171	/* array of service trees, one per ioprio_class */
				172	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	173	/* last time CLASS_IDLE was served */
				174	unsigned long bfq_class_idle_last_service;
				175
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	176	};
				177
				178	/**
				179	* struct bfq_entity - schedulable entity.
				180	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	181	* A bfq_entity is used to represent either a bfq_queue (leaf node in the
				182	* cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
				183	* entity belongs to the sched_data of the parent group in the cgroup
				184	* hierarchy. Non-leaf entities have also their own sched_data, stored
				185	* in @my_sched_data.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	186	*
				187	* Each entity stores independently its priority values; this would
				188	* allow different weights on different devices, but this
				189	* functionality is not exported to userspace by now. Priorities and
				190	* weights are updated lazily, first storing the new values into the
				191	* new_* fields, then setting the @prio_changed flag. As soon as
				192	* there is a transition in the entity state that allows the priority
				193	* update to take place the effective and the requested priority
				194	* values are synchronized.
				195	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	196	* Unless cgroups are used, the weight value is calculated from the
				197	* ioprio to export the same interface as CFQ. When dealing with
				198	* ``well-behaved'' queues (i.e., queues that do not spend too much
				199	* time to consume their budget and have true sequential behavior, and
				200	* when there are no external factors breaking anticipation) the
				201	* relative weights at each level of the cgroups hierarchy should be
				202	* guaranteed. All the fields are protected by the queue lock of the
				203	* containing bfqd.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	204	*/
				205	struct bfq_entity {
				206	/* service_tree member */
				207	struct rb_node rb_node;
				208
				209	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	210	* Flag, true if the entity is on a tree (either the active or
				211	* the idle one of its service_tree) or is in service.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	212	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	213	bool on_st;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	214
				215	/* B-WF2Q+ start and finish timestamps [sectors/weight] */
				216	u64 start, finish;
				217
				218	/* tree the entity is enqueued into; %NULL if not on a tree */
				219	struct rb_root *tree;
				220
				221	/*
				222	* minimum start time of the (active) subtree rooted at this
				223	* entity; used for O(log N) lookups into active trees
				224	*/
				225	u64 min_start;
				226
				227	/* amount of service received during the last service slot */
				228	int service;
				229
				230	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
				231	int budget;
				232
				233	/* weight of the queue */
				234	int weight;
				235	/* next weight if a change is in progress */
				236	int new_weight;
				237
				238	/* original weight, used to implement weight boosting */
				239	int orig_weight;
				240
				241	/* parent entity, for hierarchical scheduling */
				242	struct bfq_entity *parent;
				243
				244	/*
				245	* For non-leaf nodes in the hierarchy, the associated
				246	* scheduler queue, %NULL on leaf nodes.
				247	*/
				248	struct bfq_sched_data *my_sched_data;
				249	/* the scheduler queue this entity belongs to */
				250	struct bfq_sched_data *sched_data;
				251
				252	/* flag, set to request a weight, ioprio or ioprio_class change */
				253	int prio_changed;
				254	};
				255
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	256	struct bfq_group;
				257
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	258	/**
				259	* struct bfq_ttime - per process thinktime stats.
				260	*/
				261	struct bfq_ttime {
				262	/* completion time of the last request */
				263	u64 last_end_request;
				264
				265	/* total process thinktime */
				266	u64 ttime_total;
				267	/* number of thinktime samples */
				268	unsigned long ttime_samples;
				269	/* average process thinktime */
				270	u64 ttime_mean;
				271	};
				272
				273	/**
				274	* struct bfq_queue - leaf schedulable entity.
				275	*
				276	* A bfq_queue is a leaf request queue; it can be associated with an
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	277	* io_context or more, if it is async. @cgroup holds a reference to
				278	* the cgroup, to be sure that it does not disappear while a bfqq
				279	* still references it (mostly to avoid races between request issuing
				280	* and task migration followed by cgroup destruction). All the fields
				281	* are protected by the queue lock of the containing bfqd.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	282	*/
				283	struct bfq_queue {
				284	/* reference counter */
				285	int ref;
				286	/* parent bfq_data */
				287	struct bfq_data *bfqd;
				288
				289	/* current ioprio and ioprio class */
				290	unsigned short ioprio, ioprio_class;
				291	/* next ioprio and ioprio class if a change is in progress */
				292	unsigned short new_ioprio, new_ioprio_class;
				293
				294	/* sorted list of pending requests */
				295	struct rb_root sort_list;
				296	/* if fifo isn't expired, next request to serve */
				297	struct request *next_rq;
				298	/* number of sync and async requests queued */
				299	int queued[2];
				300	/* number of requests currently allocated */
				301	int allocated;
				302	/* number of pending metadata requests */
				303	int meta_pending;
				304	/* fifo list of requests in sort_list */
				305	struct list_head fifo;
				306
				307	/* entity representing this queue in the scheduler */
				308	struct bfq_entity entity;
				309
				310	/* maximum budget allowed from the feedback mechanism */
				311	int max_budget;
				312	/* budget expiration (in jiffies) */
				313	unsigned long budget_timeout;
				314
				315	/* number of requests on the dispatch list or inside driver */
				316	int dispatched;
				317
				318	/* status flags */
				319	unsigned long flags;
				320
				321	/* node for active/idle bfqq list inside parent bfqd */
				322	struct list_head bfqq_list;
				323
				324	/* associated @bfq_ttime struct */
				325	struct bfq_ttime ttime;
				326
				327	/* bit vector: a 1 for each seeky requests in history */
				328	u32 seek_history;
				329	/* position of the last request enqueued */
				330	sector_t last_request_pos;
				331
				332	/* Number of consecutive pairs of request completion and
				333	* arrival, such that the queue becomes idle after the
				334	* completion, but the next request arrives within an idle
				335	* time slice; used only if the queue's IO_bound flag has been
				336	* cleared.
				337	*/
				338	unsigned int requests_within_timer;
				339
				340	/* pid of the process owning the queue, used for logging purposes */
				341	pid_t pid;
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	342
				343	/* current maximum weight-raising time for this queue */
				344	unsigned long wr_cur_max_time;
				345	/*
				346	* Start time of the current weight-raising period if
				347	* the @bfq-queue is being weight-raised, otherwise
				348	* finish time of the last weight-raising period.
				349	*/
				350	unsigned long last_wr_start_finish;
				351	/* factor by which the weight of this queue is multiplied */
				352	unsigned int wr_coeff;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	353	};
				354
				355	/**
				356	* struct bfq_io_cq - per (request_queue, io_context) structure.
				357	*/
				358	struct bfq_io_cq {
				359	/* associated io_cq structure */
				360	struct io_cq icq; /* must be the first member */
				361	/* array of two process queues, the sync and the async */
				362	struct bfq_queue *bfqq[2];
				363	/* per (request_queue, blkcg) ioprio */
				364	int ioprio;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	365	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				366	uint64_t blkcg_serial_nr; /* the current blkcg serial */
				367	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	368	};
				369
/* Speed classes a device can be assigned to. */
enum bfq_device_speed {
	BFQ_BFQD_FAST,	/* first enumerator, value 0 */
	BFQ_BFQD_SLOW,	/* second enumerator, value 1 */
};
				374
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	375	/**
				376	* struct bfq_data - per-device data structure.
				377	*
				378	* All the fields are protected by @lock.
				379	*/
				380	struct bfq_data {
				381	/* device request queue */
				382	struct request_queue *queue;
				383	/* dispatch queue */
				384	struct list_head dispatch;
				385
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	386	/* root bfq_group for the device */
				387	struct bfq_group *root_group;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	388
				389	/*
				390	* Number of bfq_queues containing requests (including the
				391	* queue in service, even if it is idling).
				392	*/
				393	int busy_queues;
				394	/* number of queued requests */
				395	int queued;
				396	/* number of requests dispatched and waiting for completion */
				397	int rq_in_driver;
				398
				399	/*
				400	* Maximum number of requests in driver in the last
				401	* @hw_tag_samples completed requests.
				402	*/
				403	int max_rq_in_driver;
				404	/* number of samples used to calculate hw_tag */
				405	int hw_tag_samples;
				406	/* flag set to one if the driver is showing a queueing behavior */
				407	int hw_tag;
				408
				409	/* number of budgets assigned */
				410	int budgets_assigned;
				411
				412	/*
				413	* Timer set when idling (waiting) for the next request from
				414	* the queue in service.
				415	*/
				416	struct hrtimer idle_slice_timer;
				417
				418	/* bfq_queue in service */
				419	struct bfq_queue *in_service_queue;
				420	/* bfq_io_cq (bic) associated with the @in_service_queue */
				421	struct bfq_io_cq *in_service_bic;
				422
				423	/* on-disk position of the last served request */
				424	sector_t last_position;
				425
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	426	/* time of last request completion (ns) */
				427	u64 last_completion;
				428
				429	/* time of first rq dispatch in current observation interval (ns) */
				430	u64 first_dispatch;
				431	/* time of last rq dispatch in current observation interval (ns) */
				432	u64 last_dispatch;
				433
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	434	/* beginning of the last budget */
				435	ktime_t last_budget_start;
				436	/* beginning of the last idle slice */
				437	ktime_t last_idling_start;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	438
				439	/* number of samples in current observation interval */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	440	int peak_rate_samples;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	441	/* num of samples of seq dispatches in current observation interval */
				442	u32 sequential_samples;
				443	/* total num of sectors transferred in current observation interval */
				444	u64 tot_sectors_dispatched;
				445	/* max rq size seen during current observation interval (sectors) */
				446	u32 last_rq_max_size;
				447	/* time elapsed from first dispatch in current observ. interval (us) */
				448	u64 delta_from_first;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	449	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	450	* Current estimate of the device peak rate, measured in
				451	* [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
				452	* BFQ_RATE_SHIFT is performed to increase precision in
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	453	* fixed-point calculations.
				454	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	455	u32 peak_rate;
				456
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	457	/* maximum budget allotted to a bfq_queue before rescheduling */
				458	int bfq_max_budget;
				459
				460	/* list of all the bfq_queues active on the device */
				461	struct list_head active_list;
				462	/* list of all the bfq_queues idle on the device */
				463	struct list_head idle_list;
				464
				465	/*
				466	* Timeout for async/sync requests; when it fires, requests
				467	* are served in fifo order.
				468	*/
				469	u64 bfq_fifo_expire[2];
				470	/* weight of backward seeks wrt forward ones */
				471	unsigned int bfq_back_penalty;
				472	/* maximum allowed backward seek */
				473	unsigned int bfq_back_max;
				474	/* maximum idling time */
				475	u32 bfq_slice_idle;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	476
				477	/* user-configured max budget value (0 for auto-tuning) */
				478	int bfq_user_max_budget;
				479	/*
				480	* Timeout for bfq_queues to consume their budget; used to
				481	* prevent seeky queues from imposing long latencies to
				482	* sequential or quasi-sequential ones (this also implies that
				483	* seeky queues cannot receive guarantees in the service
				484	* domain; after a timeout they are charged for the time they
				485	* have been in service, to preserve fairness among them, but
				486	* without service-domain guarantees).
				487	*/
				488	unsigned int bfq_timeout;
				489
				490	/*
				491	* Number of consecutive requests that must be issued within
				492	* the idle time slice to set again idling to a queue which
				493	* was marked as non-I/O-bound (see the definition of the
				494	* IO_bound flag for further details).
				495	*/
				496	unsigned int bfq_requests_within_timer;
				497
				498	/*
				499	* Force device idling whenever needed to provide accurate
				500	* service guarantees, without caring about throughput
				501	* issues. CAVEAT: this may even increase latencies, in case
				502	* of useless idling for processes that did stop doing I/O.
				503	*/
				504	bool strict_guarantees;
				505
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	506	/* if set to true, low-latency heuristics are enabled */
				507	bool low_latency;
				508	/*
				509	* Maximum factor by which the weight of a weight-raised queue
				510	* is multiplied.
				511	*/
				512	unsigned int bfq_wr_coeff;
				513	/* maximum duration of a weight-raising period (jiffies) */
				514	unsigned int bfq_wr_max_time;
				515	/*
				516	* Minimum idle period after which weight-raising may be
				517	* reactivated for a queue (in jiffies).
				518	*/
				519	unsigned int bfq_wr_min_idle_time;
				520	/*
				521	* Minimum period between request arrivals after which
				522	* weight-raising may be reactivated for an already busy async
				523	* queue (in jiffies).
				524	*/
				525	unsigned long bfq_wr_min_inter_arr_async;
				526	/*
				527	* Cached value of the product R*T, used for computing the
				528	* maximum duration of weight raising automatically.
				529	*/
				530	u64 RT_prod;
				531	/* device-speed class for the low-latency heuristic */
				532	enum bfq_device_speed device_speed;
				533
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	534	/* fallback dummy bfqq for extreme OOM conditions */
				535	struct bfq_queue oom_bfqq;
				536
				537	spinlock_t lock;
				538
				539	/*
				540	* bic associated with the task issuing current bio for
				541	* merging. This and the next field are used as a support to
				542	* be able to perform the bic lookup, needed by bio-merge
				543	* functions, before the scheduler lock is taken, and thus
				544	* avoid taking the request-queue lock while the scheduler
				545	* lock is being held.
				546	*/
				547	struct bfq_io_cq *bio_bic;
				548	/* bfqq associated with the task issuing current bio for merging */
				549	struct bfq_queue *bio_bfqq;
				550	};
				551
				552	enum bfqq_state_flags {
				553	BFQQF_busy = 0, /* has requests or is in service */
				554	BFQQF_wait_request, /* waiting for a request */
				555	BFQQF_non_blocking_wait_rq, /*
				556	* waiting for a request
				557	* without idling the device
				558	*/
				559	BFQQF_fifo_expire, /* FIFO checked in this slice */
				560	BFQQF_idle_window, /* slice idling enabled */
				561	BFQQF_sync, /* synchronous queue */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	562	BFQQF_IO_bound, /*
				563	* bfqq has timed-out at least once
				564	* having consumed at most 2/10 of
				565	* its budget
				566	*/
				567	};
				568
				569	#define BFQ_BFQQ_FNS(name) \
				570	static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
				571	{ \
				572	__set_bit(BFQQF_##name, &(bfqq)->flags); \
				573	} \
				574	static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
				575	{ \
				576	__clear_bit(BFQQF_##name, &(bfqq)->flags); \
				577	} \
				578	static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
				579	{ \
				580	return test_bit(BFQQF_##name, &(bfqq)->flags); \
				581	}
				582
				583	BFQ_BFQQ_FNS(busy);
				584	BFQ_BFQQ_FNS(wait_request);
				585	BFQ_BFQQ_FNS(non_blocking_wait_rq);
				586	BFQ_BFQQ_FNS(fifo_expire);
				587	BFQ_BFQQ_FNS(idle_window);
				588	BFQ_BFQQ_FNS(sync);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	589	BFQ_BFQQ_FNS(IO_bound);
				590	#undef BFQ_BFQQ_FNS
				591
				592	/* Logging facilities. */
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	593	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				594	static struct bfq_group bfqq_group(struct bfq_queue bfqq);
				595	static struct blkcg_gq bfqg_to_blkg(struct bfq_group bfqg);
				596
				597	#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
				598	char __pbuf[128]; \
				599	\
				600	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
				601	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
				602	bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
				603	__pbuf, ##args); \
				604	} while (0)
				605
				606	#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
				607	char __pbuf[128]; \
				608	\
				609	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
				610	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
				611	} while (0)
				612
				613	#else /* CONFIG_BFQ_GROUP_IOSCHED */
				614
				615	#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
				616	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
				617	bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
				618	##args)
				619	#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
				620
				621	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	622
				623	#define bfq_log(bfqd, fmt, args...) \
				624	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
				625
				626	/* Expiration reasons. */
				627	enum bfqq_expiration {
				628	BFQQE_TOO_IDLE = 0, /*
				629	* queue has been idling for
				630	* too long
				631	*/
				632	BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
				633	BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
				634	BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
				635	BFQQE_PREEMPTED /* preemption in progress */
				636	};
				637
/*
 * Statistics of a bfq_group. The structure has members only when
 * CONFIG_BFQ_GROUP_IOSCHED is defined.
 */
struct bfqg_stats {
#ifdef CONFIG_BFQ_GROUP_IOSCHED
	/* ios merged */
	struct blkg_rwstat		merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat		service_time;
	/* total time spent waiting in the scheduler queue, in ns */
	struct blkg_rwstat		wait_time;
	/* IOs queued up */
	struct blkg_rwstat		queued;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat		time;
	/* sum, across all samples, of the number of ios queued */
	struct blkg_stat		avg_queue_size_sum;
	/* samples taken for the average */
	struct blkg_stat		avg_queue_size_samples;
	/* times this group has been removed from the service tree */
	struct blkg_stat		dequeue;
	/* total time spent waiting for a timeslice to be assigned */
	struct blkg_stat		group_wait_time;
	/* time spent idling for this blkcg_gq */
	struct blkg_stat		idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat		empty_time;

	/* the fields from here on shouldn't be cleared on stat reset */
	uint64_t			start_group_wait_time;
	uint64_t			start_idle_time;
	uint64_t			start_empty_time;
	uint16_t			flags;
#endif	/* CONFIG_BFQ_GROUP_IOSCHED */
};
				669
				670	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				671
				672	/*
				673	* struct bfq_group_data - per-blkcg storage for the blkio subsystem.
				674	*
				675	* @ps: @blkcg_policy_storage that this structure inherits
				676	* @weight: weight of the bfq_group
				677	*/
				678	struct bfq_group_data {
				679	/* must be the first member */
				680	struct blkcg_policy_data pd;
				681
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	682	unsigned int weight;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	683	};
				684
/**
 * struct bfq_group - per (device, cgroup) data structure.
 * @pd: blkg policy data embedded in the group; must be the first member.
 * @entity: schedulable entity to insert into the parent group sched_data.
 * @sched_data: own sched_data, to contain child entities (they may be
 *              both bfq_queues and bfq_groups).
 * @bfqd: the bfq_data for the device this group acts upon (stored as an
 *        untyped pointer; users cast it to struct bfq_data *).
 * @async_bfqq: array of async queues for all the tasks belonging to
 *              the group, one queue per ioprio value per ioprio_class,
 *              except for the idle class that has only one queue.
 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
 * @my_entity: pointer to @entity, %NULL for the toplevel group; used
 *             to avoid too many special cases during group creation/
 *             migration.
 * @stats: stats for this bfqg.
 *
 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
 * there is a set of bfq_groups, each one collecting the lower-level
 * entities belonging to the group that are acting on the same device.
 *
 * Locking works as follows:
 *    o @bfqd is protected by the queue lock, RCU is used to access it
 *      from the readers.
 *    o All the other fields are protected by the @bfqd queue lock.
 */
struct bfq_group {
	/* must be the first member */
	struct blkg_policy_data pd;

	struct bfq_entity entity;
	struct bfq_sched_data sched_data;

	void *bfqd;

	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	struct bfq_entity *my_entity;

	struct bfqg_stats stats;
};
				725
				726	#else
/*
 * Reduced bfq_group used by the #else branch of the surrounding
 * conditional (presumably builds without CONFIG_BFQ_GROUP_IOSCHED; the
 * opening #ifdef is outside this part of the file - confirm). It has no
 * entity of its own and no policy data: only the scheduler data and the
 * async queues.
 */
struct bfq_group {
	struct bfq_sched_data sched_data;

	/* async queues, indexed like the array in the full variant */
	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
	struct bfq_queue *async_idle_bfqq;

	/* NOTE(review): not referenced in this part of the file */
	struct rb_root rq_pos_tree;
};
				735	#endif
				736
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	737	static struct bfq_queue bfq_entity_to_bfqq(struct bfq_entity entity);
				738
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	739	static unsigned int bfq_class_idx(struct bfq_entity *entity)
				740	{
				741	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				742
				743	return bfqq ? bfqq->ioprio_class - 1 :
				744	BFQ_DEFAULT_GRP_CLASS - 1;
				745	}
				746
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	747	static struct bfq_service_tree *
				748	bfq_entity_service_tree(struct bfq_entity *entity)
				749	{
				750	struct bfq_sched_data *sched_data = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	751	unsigned int idx = bfq_class_idx(entity);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	752
				753	return sched_data->service_tree + idx;
				754	}
				755
				756	static struct bfq_queue bic_to_bfqq(struct bfq_io_cq bic, bool is_sync)
				757	{
				758	return bic->bfqq[is_sync];
				759	}
				760
				761	static void bic_set_bfqq(struct bfq_io_cq bic, struct bfq_queue bfqq,
				762	bool is_sync)
				763	{
				764	bic->bfqq[is_sync] = bfqq;
				765	}
				766
				767	static struct bfq_data bic_to_bfqd(struct bfq_io_cq bic)
				768	{
				769	return bic->icq.q->elevator->elevator_data;
				770	}
				771
				772	static void bfq_check_ioprio_change(struct bfq_io_cq bic, struct bio bio);
				773	static void bfq_put_queue(struct bfq_queue *bfqq);
				774	static struct bfq_queue bfq_get_queue(struct bfq_data bfqd,
				775	struct bio *bio, bool is_sync,
				776	struct bfq_io_cq *bic);
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	777	static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				778	struct bfq_group *bfqg);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	779	static void bfq_put_async_queues(struct bfq_data bfqd, struct bfq_group bfqg);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	780	static void bfq_exit_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq);
				781
/*
 * Expiration time of sync (0) and async (1) requests, in ns
 * (a quarter and an eighth of a second, respectively).
 */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };

/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
static const int bfq_back_max = 16 * 1024;

/* Penalty of a backwards seek, in number of sectors. */
static const int bfq_back_penalty = 2;

/* Idling period duration, in ns (1/125 of a second, i.e. 8 ms). */
static u64 bfq_slice_idle = NSEC_PER_SEC / 125;

/* Minimum number of assigned budgets for which stats are safe to compute. */
static const int bfq_stats_min_budgets = 194;

/* Default maximum budget values, in sectors and number of requests. */
static const int bfq_default_max_budget = 16 * 1024;

/*
 * Async to sync throughput distribution is controlled as follows:
 * when an async request is served, the entity is charged the number
 * of sectors of the request, multiplied by the factor below.
 */
static const int bfq_async_charge_factor = 10;

/*
 * Default timeout values, in jiffies, approximating CFQ defaults
 * (HZ / 8 jiffies is an eighth of a second).
 */
static const int bfq_timeout = HZ / 8;

/* Slab cache handle; what is allocated from it is not visible here. */
static struct kmem_cache *bfq_pool;
				811
/* Below this threshold (in ns), we consider thinktime immediate. */
#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)

/* hw_tag detection: parallel requests threshold and min samples needed. */
#define BFQ_HW_QUEUE_THRESHOLD	4
#define BFQ_HW_QUEUE_SAMPLES	32

/*
 * Seek-related thresholds. NOTE(review): the sector_t casts suggest the
 * three values below are distances in sectors - confirm against users.
 */
#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
#define BFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
/* True when more than 32/8 = 4 bits are set in the queue's seek_history. */
#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)

/* Min number of samples required to perform peak-rate update */
#define BFQ_RATE_MIN_SAMPLES	32
/* Min observation time interval required to perform a peak-rate update (ns) */
#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
/* Target observation time interval for a peak-rate update (ns) */
#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC

/* Shift used for peak rate fixed precision calculations. */
#define BFQ_RATE_SHIFT		16
				833
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	834	/*
				835	* By default, BFQ computes the duration of the weight raising for
				836	* interactive applications automatically, using the following formula:
				837	* duration = (R / r) * T, where r is the peak rate of the device, and
				838	* R and T are two reference parameters.
				839	* In particular, R is the peak rate of the reference device (see below),
				840	* and T is a reference time: given the systems that are likely to be
				841	* installed on the reference device according to its speed class, T is
				842	* about the maximum time needed, under BFQ and while reading two files in
				843	* parallel, to load typical large applications on these systems.
				844	* In practice, the slower/faster the device at hand is, the more/less it
				845	* takes to load applications with respect to the reference device.
				846	* Accordingly, the longer/shorter BFQ grants weight raising to interactive
				847	* applications.
				848	*
				849	* BFQ uses four different reference pairs (R, T), depending on:
				850	* . whether the device is rotational or non-rotational;
				851	* . whether the device is slow, such as old or portable HDDs, as well as
				852	* SD cards, or fast, such as newer HDDs and SSDs.
				853	*
				854	* The device's speed class is dynamically (re)detected in
				855	* bfq_update_peak_rate() every time the estimated peak rate is updated.
				856	*
				857	* In the following definitions, R_slow[0]/R_fast[0] and
				858	* T_slow[0]/T_fast[0] are the reference values for a slow/fast
				859	* rotational device, whereas R_slow[1]/R_fast[1] and
				860	* T_slow[1]/T_fast[1] are the reference values for a slow/fast
				861	* non-rotational device. Finally, device_speed_thresh are the
				862	* thresholds used to switch between speed classes. The reference
				863	* rates are not the actual peak rates of the devices used as a
				864	* reference, but slightly lower values. The reason for using these
				865	* slightly lower values is that the peak-rate estimator tends to
				866	* yield slightly lower values than the actual peak rate (it can yield
				867	* the actual peak rate only if there is only one process doing I/O,
				868	* and the process does sequential I/O).
				869	*
				870	* Both the reference peak rates and the thresholds are measured in
				871	* sectors/usec, left-shifted by BFQ_RATE_SHIFT.
				872	*/
/*
 * Reference peak rates: index 0 is the rotational device, index 1 the
 * non-rotational one (see the description above these definitions).
 */
static int R_slow[2] = {1000, 10700};
static int R_fast[2] = {14000, 33000};
/*
 * To improve readability, a conversion function is used to initialize the
 * following arrays, which entails that they can be initialized only in a
 * function.
 */
static int T_slow[2];
static int T_fast[2];
static int device_speed_thresh[2];
				883
/*
 * Initial value for a service tree: two empty rb trees, two NULL
 * pointers and two zeroed counters, in the declaration order of
 * struct bfq_service_tree (the struct itself is defined outside this
 * part of the file).
 */
#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })

/* Private elevator slots of a request: [0] holds the bic, [1] the bfqq. */
#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
				889
				890	/**
				891	* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
				892	* @icq: the iocontext queue.
				893	*/
				894	static struct bfq_io_cq icq_to_bic(struct io_cq icq)
				895	{
				896	/* bic->icq is the first member, %NULL will convert to %NULL */
				897	return container_of(icq, struct bfq_io_cq, icq);
				898	}
				899
				900	/**
				901	* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
				902	* @bfqd: the lookup key.
				903	* @ioc: the io_context of the process doing I/O.
				904	* @q: the request queue.
				905	*/
				906	static struct bfq_io_cq bfq_bic_lookup(struct bfq_data bfqd,
				907	struct io_context *ioc,
				908	struct request_queue *q)
				909	{
				910	if (ioc) {
				911	unsigned long flags;
				912	struct bfq_io_cq *icq;
				913
				914	spin_lock_irqsave(q->queue_lock, flags);
				915	icq = icq_to_bic(ioc_lookup_icq(ioc, q));
				916	spin_unlock_irqrestore(q->queue_lock, flags);
				917
				918	return icq;
				919	}
				920
				921	return NULL;
				922	}
				923
				924	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	925	* Scheduler run of queue, if there are requests pending and no one in the
				926	* driver that will restart queueing.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	927	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	928	static void bfq_schedule_dispatch(struct bfq_data *bfqd)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	929	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	930	if (bfqd->queued != 0) {
				931	bfq_log(bfqd, "schedule dispatch");
				932	blk_mq_run_hw_queues(bfqd->queue, true);
				933	}
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	934	}
				935
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	936	/**
				937	* bfq_gt - compare two timestamps.
				938	* @a: first ts.
				939	* @b: second ts.
				940	*
				941	* Return @a > @b, dealing with wrapping correctly.
				942	*/
				943	static int bfq_gt(u64 a, u64 b)
				944	{
				945	return (s64)(a - b) > 0;
				946	}
				947
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	948	static struct bfq_entity bfq_root_active_entity(struct rb_root tree)
				949	{
				950	struct rb_node *node = tree->rb_node;
				951
				952	return rb_entry(node, struct bfq_entity, rb_node);
				953	}
				954
				955	static struct bfq_entity bfq_lookup_next_entity(struct bfq_sched_data sd);
				956
				957	static bool bfq_update_parent_budget(struct bfq_entity *next_in_service);
				958
				959	/**
				960	* bfq_update_next_in_service - update sd->next_in_service
				961	* @sd: sched_data for which to perform the update.
				962	* @new_entity: if not NULL, pointer to the entity whose activation,
				963	* requeueing or repositionig triggered the invocation of
				964	* this function.
				965	*
				966	* This function is called to update sd->next_in_service, which, in
				967	* its turn, may change as a consequence of the insertion or
				968	* extraction of an entity into/from one of the active trees of
				969	* sd. These insertions/extractions occur as a consequence of
				970	* activations/deactivations of entities, with some activations being
				971	* 'true' activations, and other activations being requeueings (i.e.,
				972	* implementing the second, requeueing phase of the mechanism used to
				973	* reposition an entity in its active tree; see comments on
				974	* __bfq_activate_entity and __bfq_requeue_entity for details). In
				975	* both the last two activation sub-cases, new_entity points to the
				976	* just activated or requeued entity.
				977	*
				978	* Returns true if sd->next_in_service changes in such a way that
				979	* entity->parent may become the next_in_service for its parent
				980	* entity.
				981	*/
				982	static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
				983	struct bfq_entity *new_entity)
				984	{
				985	struct bfq_entity *next_in_service = sd->next_in_service;
				986	bool parent_sched_may_change = false;
				987
				988	/*
				989	* If this update is triggered by the activation, requeueing
				990	* or repositiong of an entity that does not coincide with
				991	* sd->next_in_service, then a full lookup in the active tree
				992	* can be avoided. In fact, it is enough to check whether the
				993	* just-modified entity has a higher priority than
				994	* sd->next_in_service, or, even if it has the same priority
				995	* as sd->next_in_service, is eligible and has a lower virtual
				996	* finish time than sd->next_in_service. If this compound
				997	* condition holds, then the new entity becomes the new
				998	* next_in_service. Otherwise no change is needed.
				999	*/
				1000	if (new_entity && new_entity != sd->next_in_service) {
				1001	/*
				1002	* Flag used to decide whether to replace
				1003	* sd->next_in_service with new_entity. Tentatively
				1004	* set to true, and left as true if
				1005	* sd->next_in_service is NULL.
				1006	*/
				1007	bool replace_next = true;
				1008
				1009	/*
				1010	* If there is already a next_in_service candidate
				1011	* entity, then compare class priorities or timestamps
				1012	* to decide whether to replace sd->service_tree with
				1013	* new_entity.
				1014	*/
				1015	if (next_in_service) {
				1016	unsigned int new_entity_class_idx =
				1017	bfq_class_idx(new_entity);
				1018	struct bfq_service_tree *st =
				1019	sd->service_tree + new_entity_class_idx;
				1020
				1021	/*
				1022	* For efficiency, evaluate the most likely
				1023	* sub-condition first.
				1024	*/
				1025	replace_next =
				1026	(new_entity_class_idx ==
				1027	bfq_class_idx(next_in_service)
				1028	&&
				1029	!bfq_gt(new_entity->start, st->vtime)
				1030	&&
				1031	bfq_gt(next_in_service->finish,
				1032	new_entity->finish))
				1033	\|\|
				1034	new_entity_class_idx <
				1035	bfq_class_idx(next_in_service);
				1036	}
				1037
				1038	if (replace_next)
				1039	next_in_service = new_entity;
				1040	} else /* invoked because of a deactivation: lookup needed */
				1041	next_in_service = bfq_lookup_next_entity(sd);
				1042
				1043	if (next_in_service) {
				1044	parent_sched_may_change = !sd->next_in_service \|\|
				1045	bfq_update_parent_budget(next_in_service);
				1046	}
				1047
				1048	sd->next_in_service = next_in_service;
				1049
				1050	if (!next_in_service)
				1051	return parent_sched_may_change;
				1052
				1053	return parent_sched_may_change;
				1054	}
				1055
				1056	#ifdef CONFIG_BFQ_GROUP_IOSCHED
/*
 * both next loops stop at one of the child entities of the root group
 *
 * Note that the macros have no initialisation clause: they start from
 * the current value of @entity and overwrite it while walking up
 * through the ->parent links, until it becomes NULL.
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = entity->parent)

/*
 * For each iteration, compute parent in advance, so as to be safe if
 * entity is deallocated during the iteration. Such a deallocation may
 * happen as a consequence of a bfq_put_queue that frees the bfq_queue
 * containing entity.
 */
#define for_each_entity_safe(entity, parent) \
	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
				1069
				1070	/*
				1071	* Returns true if this budget changes may let next_in_service->parent
				1072	* become the next_in_service entity for its parent entity.
				1073	*/
				1074	static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
				1075	{
				1076	struct bfq_entity *bfqg_entity;
				1077	struct bfq_group *bfqg;
				1078	struct bfq_sched_data *group_sd;
				1079	bool ret = false;
				1080
				1081	group_sd = next_in_service->sched_data;
				1082
				1083	bfqg = container_of(group_sd, struct bfq_group, sched_data);
				1084	/*
				1085	* bfq_group's my_entity field is not NULL only if the group
				1086	* is not the root group. We must not touch the root entity
				1087	* as it must never become an in-service entity.
				1088	*/
				1089	bfqg_entity = bfqg->my_entity;
				1090	if (bfqg_entity) {
				1091	if (bfqg_entity->budget > next_in_service->budget)
				1092	ret = true;
				1093	bfqg_entity->budget = next_in_service->budget;
				1094	}
				1095
				1096	return ret;
				1097	}
				1098
				1099	/*
				1100	* This function tells whether entity stops being a candidate for next
				1101	* service, according to the following logic.
				1102	*
				1103	* This function is invoked for an entity that is about to be set in
				1104	* service. If such an entity is a queue, then the entity is no longer
				1105	* a candidate for next service (i.e, a candidate entity to serve
				1106	* after the in-service entity is expired). The function then returns
				1107	* true.
				1108	*/
				1109	static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
				1110	{
				1111	if (bfq_entity_to_bfqq(entity))
				1112	return true;
				1113
				1114	return false;
				1115	}
				1116
				1117	#else /* CONFIG_BFQ_GROUP_IOSCHED */
/*
 * Next two macros are fake loops when cgroups support is not
 * enabled. In fact, in such a case, there is only one level to go up
 * (to reach the root group).
 */
#define for_each_entity(entity)	\
	for (; entity ; entity = NULL)

/* @parent is always NULL here, so the body runs at most once. */
#define for_each_entity_safe(entity, parent) \
	for (parent = NULL; entity ; entity = parent)
				1128
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED: nothing is
 * updated and false is always returned.
 */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
	return false;
}
				1133
/*
 * Variant for builds without CONFIG_BFQ_GROUP_IOSCHED: always returns
 * true, whatever @entity is.
 */
static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
{
	return true;
}
				1138
				1139	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
				1140
/*
 * Shift for timestamp calculations. This actually limits the maximum
 * service allowed in one timestamp delta (small shift values increase it),
 * the maximum total weight that can be used for the queues in the system
 * (big shift values increase it), and the period of virtual time
 * wraparounds.
 *
 * bfq_delta() shifts the service left by this amount before dividing
 * it by the weight.
 */
#define WFQ_SERVICE_SHIFT	22
				1149
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1150	static struct bfq_queue bfq_entity_to_bfqq(struct bfq_entity entity)
				1151	{
				1152	struct bfq_queue *bfqq = NULL;
				1153
				1154	if (!entity->my_sched_data)
				1155	bfqq = container_of(entity, struct bfq_queue, entity);
				1156
				1157	return bfqq;
				1158	}
				1159
				1160
				1161	/**
				1162	* bfq_delta - map service into the virtual time domain.
				1163	* @service: amount of service.
				1164	* @weight: scale factor (weight of an entity or weight sum).
				1165	*/
				1166	static u64 bfq_delta(unsigned long service, unsigned long weight)
				1167	{
				1168	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
				1169
				1170	do_div(d, weight);
				1171	return d;
				1172	}
				1173
				1174	/**
				1175	* bfq_calc_finish - assign the finish time to an entity.
				1176	* @entity: the entity to act upon.
				1177	* @service: the service to be charged to the entity.
				1178	*/
				1179	static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
				1180	{
				1181	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1182
				1183	entity->finish = entity->start +
				1184	bfq_delta(service, entity->weight);
				1185
				1186	if (bfqq) {
				1187	bfq_log_bfqq(bfqq->bfqd, bfqq,
				1188	"calc_finish: serv %lu, w %d",
				1189	service, entity->weight);
				1190	bfq_log_bfqq(bfqq->bfqd, bfqq,
				1191	"calc_finish: start %llu, finish %llu, delta %llu",
				1192	entity->start, entity->finish,
				1193	bfq_delta(service, entity->weight));
				1194	}
				1195	}
				1196
				1197	/**
				1198	* bfq_entity_of - get an entity from a node.
				1199	* @node: the node field of the entity.
				1200	*
				1201	* Convert a node pointer to the relative entity. This is used only
				1202	* to simplify the logic of some functions and not as the generic
				1203	* conversion mechanism because, e.g., in the tree walking functions,
				1204	* the check for a %NULL value would be redundant.
				1205	*/
				1206	static struct bfq_entity bfq_entity_of(struct rb_node node)
				1207	{
				1208	struct bfq_entity *entity = NULL;
				1209
				1210	if (node)
				1211	entity = rb_entry(node, struct bfq_entity, rb_node);
				1212
				1213	return entity;
				1214	}
				1215
				1216	/**
				1217	* bfq_extract - remove an entity from a tree.
				1218	* @root: the tree root.
				1219	* @entity: the entity to remove.
				1220	*/
				1221	static void bfq_extract(struct rb_root root, struct bfq_entity entity)
				1222	{
				1223	entity->tree = NULL;
				1224	rb_erase(&entity->rb_node, root);
				1225	}
				1226
				1227	/**
				1228	* bfq_idle_extract - extract an entity from the idle tree.
				1229	* @st: the service tree of the owning @entity.
				1230	* @entity: the entity being removed.
				1231	*/
				1232	static void bfq_idle_extract(struct bfq_service_tree *st,
				1233	struct bfq_entity *entity)
				1234	{
				1235	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1236	struct rb_node *next;
				1237
				1238	if (entity == st->first_idle) {
				1239	next = rb_next(&entity->rb_node);
				1240	st->first_idle = bfq_entity_of(next);
				1241	}
				1242
				1243	if (entity == st->last_idle) {
				1244	next = rb_prev(&entity->rb_node);
				1245	st->last_idle = bfq_entity_of(next);
				1246	}
				1247
				1248	bfq_extract(&st->idle, entity);
				1249
				1250	if (bfqq)
				1251	list_del(&bfqq->bfqq_list);
				1252	}
				1253
				1254	/**
				1255	* bfq_insert - generic tree insertion.
				1256	* @root: tree root.
				1257	* @entity: entity to insert.
				1258	*
				1259	* This is used for the idle and the active tree, since they are both
				1260	* ordered by finish time.
				1261	*/
				1262	static void bfq_insert(struct rb_root root, struct bfq_entity entity)
				1263	{
				1264	struct bfq_entity *entry;
				1265	struct rb_node **node = &root->rb_node;
				1266	struct rb_node *parent = NULL;
				1267
				1268	while (*node) {
				1269	parent = *node;
				1270	entry = rb_entry(parent, struct bfq_entity, rb_node);
				1271
				1272	if (bfq_gt(entry->finish, entity->finish))
				1273	node = &parent->rb_left;
				1274	else
				1275	node = &parent->rb_right;
				1276	}
				1277
				1278	rb_link_node(&entity->rb_node, parent, node);
				1279	rb_insert_color(&entity->rb_node, root);
				1280
				1281	entity->tree = root;
				1282	}
				1283
				1284	/**
				1285	* bfq_update_min - update the min_start field of a entity.
				1286	* @entity: the entity to update.
				1287	* @node: one of its children.
				1288	*
				1289	* This function is called when @entity may store an invalid value for
				1290	* min_start due to updates to the active tree. The function assumes
				1291	* that the subtree rooted at @node (which may be its left or its right
				1292	* child) has a valid min_start value.
				1293	*/
				1294	static void bfq_update_min(struct bfq_entity entity, struct rb_node node)
				1295	{
				1296	struct bfq_entity *child;
				1297
				1298	if (node) {
				1299	child = rb_entry(node, struct bfq_entity, rb_node);
				1300	if (bfq_gt(entity->min_start, child->min_start))
				1301	entity->min_start = child->min_start;
				1302	}
				1303	}
				1304
				1305	/**
				1306	* bfq_update_active_node - recalculate min_start.
				1307	* @node: the node to update.
				1308	*
				1309	* @node may have changed position or one of its children may have moved,
				1310	* this function updates its min_start value. The left and right subtrees
				1311	* are assumed to hold a correct min_start value.
				1312	*/
				1313	static void bfq_update_active_node(struct rb_node *node)
				1314	{
				1315	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
				1316
				1317	entity->min_start = entity->start;
				1318	bfq_update_min(entity, node->rb_right);
				1319	bfq_update_min(entity, node->rb_left);
				1320	}
				1321
				1322	/**
				1323	* bfq_update_active_tree - update min_start for the whole active tree.
				1324	* @node: the starting node.
				1325	*
				1326	* @node must be the deepest modified node after an update. This function
				1327	* updates its min_start using the values held by its children, assuming
				1328	* that they did not change, and then updates all the nodes that may have
				1329	* changed in the path to the root. The only nodes that may have changed
				1330	* are the ones in the path or their siblings.
				1331	*/
				1332	static void bfq_update_active_tree(struct rb_node *node)
				1333	{
				1334	struct rb_node *parent;
				1335
				1336	up:
				1337	bfq_update_active_node(node);
				1338
				1339	parent = rb_parent(node);
				1340	if (!parent)
				1341	return;
				1342
				1343	if (node == parent->rb_left && parent->rb_right)
				1344	bfq_update_active_node(parent->rb_right);
				1345	else if (parent->rb_left)
				1346	bfq_update_active_node(parent->rb_left);
				1347
				1348	node = parent;
				1349	goto up;
				1350	}
				1351
				1352	/**
				1353	* bfq_active_insert - insert an entity in the active tree of its
				1354	* group/device.
				1355	* @st: the service tree of the entity.
				1356	* @entity: the entity being inserted.
				1357	*
				1358	* The active tree is ordered by finish time, but an extra key is kept
				1359	* per each node, containing the minimum value for the start times of
				1360	* its children (and the node itself), so it's possible to search for
				1361	* the eligible node with the lowest finish time in logarithmic time.
				1362	*/
				1363	static void bfq_active_insert(struct bfq_service_tree *st,
				1364	struct bfq_entity *entity)
				1365	{
				1366	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1367	struct rb_node *node = &entity->rb_node;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1368	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1369	struct bfq_sched_data *sd = NULL;
				1370	struct bfq_group *bfqg = NULL;
				1371	struct bfq_data *bfqd = NULL;
				1372	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1373
				1374	bfq_insert(&st->active, entity);
				1375
				1376	if (node->rb_left)
				1377	node = node->rb_left;
				1378	else if (node->rb_right)
				1379	node = node->rb_right;
				1380
				1381	bfq_update_active_tree(node);
				1382
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1383	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1384	sd = entity->sched_data;
				1385	bfqg = container_of(sd, struct bfq_group, sched_data);
				1386	bfqd = (struct bfq_data *)bfqg->bfqd;
				1387	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1388	if (bfqq)
				1389	list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
				1390	}
				1391
				1392	/**
				1393	* bfq_ioprio_to_weight - calc a weight from an ioprio.
				1394	* @ioprio: the ioprio value to convert.
				1395	*/
				1396	static unsigned short bfq_ioprio_to_weight(int ioprio)
				1397	{
				1398	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
				1399	}
				1400
				1401	/**
				1402	* bfq_weight_to_ioprio - calc an ioprio from a weight.
				1403	* @weight: the weight value to convert.
				1404	*
				1405	* To preserve as much as possible the old only-ioprio user interface,
				1406	* 0 is used as an escape ioprio value for weights (numerically) equal or
				1407	* larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
				1408	*/
				1409	static unsigned short bfq_weight_to_ioprio(int weight)
				1410	{
				1411	return max_t(int, 0,
				1412	IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
				1413	}
				1414
				1415	static void bfq_get_entity(struct bfq_entity *entity)
				1416	{
				1417	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1418
				1419	if (bfqq) {
				1420	bfqq->ref++;
				1421	bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
				1422	bfqq, bfqq->ref);
				1423	}
				1424	}
				1425
				1426	/**
				1427	* bfq_find_deepest - find the deepest node that an extraction can modify.
				1428	* @node: the node being removed.
				1429	*
				1430	* Do the first step of an extraction in an rb tree, looking for the
				1431	* node that will replace @node, and returning the deepest node that
				1432	* the following modifications to the tree can touch. If @node is the
				1433	* last node in the tree return %NULL.
				1434	*/
				1435	static struct rb_node bfq_find_deepest(struct rb_node node)
				1436	{
				1437	struct rb_node *deepest;
				1438
				1439	if (!node->rb_right && !node->rb_left)
				1440	deepest = rb_parent(node);
				1441	else if (!node->rb_right)
				1442	deepest = node->rb_left;
				1443	else if (!node->rb_left)
				1444	deepest = node->rb_right;
				1445	else {
				1446	deepest = rb_next(node);
				1447	if (deepest->rb_right)
				1448	deepest = deepest->rb_right;
				1449	else if (rb_parent(deepest) != node)
				1450	deepest = rb_parent(deepest);
				1451	}
				1452
				1453	return deepest;
				1454	}
				1455
				1456	/**
				1457	* bfq_active_extract - remove an entity from the active tree.
				1458	* @st: the service_tree containing the tree.
				1459	* @entity: the entity being removed.
				1460	*/
				1461	static void bfq_active_extract(struct bfq_service_tree *st,
				1462	struct bfq_entity *entity)
				1463	{
				1464	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1465	struct rb_node *node;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1466	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1467	struct bfq_sched_data *sd = NULL;
				1468	struct bfq_group *bfqg = NULL;
				1469	struct bfq_data *bfqd = NULL;
				1470	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1471
				1472	node = bfq_find_deepest(&entity->rb_node);
				1473	bfq_extract(&st->active, entity);
				1474
				1475	if (node)
				1476	bfq_update_active_tree(node);
				1477
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1478	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1479	sd = entity->sched_data;
				1480	bfqg = container_of(sd, struct bfq_group, sched_data);
				1481	bfqd = (struct bfq_data *)bfqg->bfqd;
				1482	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1483	if (bfqq)
				1484	list_del(&bfqq->bfqq_list);
				1485	}
				1486
				1487	/**
				1488	* bfq_idle_insert - insert an entity into the idle tree.
				1489	* @st: the service tree containing the tree.
				1490	* @entity: the entity to insert.
				1491	*/
				1492	static void bfq_idle_insert(struct bfq_service_tree *st,
				1493	struct bfq_entity *entity)
				1494	{
				1495	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1496	struct bfq_entity *first_idle = st->first_idle;
				1497	struct bfq_entity *last_idle = st->last_idle;
				1498
				1499	if (!first_idle \|\| bfq_gt(first_idle->finish, entity->finish))
				1500	st->first_idle = entity;
				1501	if (!last_idle \|\| bfq_gt(entity->finish, last_idle->finish))
				1502	st->last_idle = entity;
				1503
				1504	bfq_insert(&st->idle, entity);
				1505
				1506	if (bfqq)
				1507	list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
				1508	}
				1509
				1510	/**
				1511	* bfq_forget_entity - do not consider entity any longer for scheduling
				1512	* @st: the service tree.
				1513	* @entity: the entity being removed.
				1514	* @is_in_service: true if entity is currently the in-service entity.
				1515	*
				1516	* Forget everything about @entity. In addition, if entity represents
				1517	* a queue, and the latter is not in service, then release the service
				1518	* reference to the queue (the one taken through bfq_get_entity). In
				1519	* fact, in this case, there is really no more service reference to
				1520	* the queue, as the latter is also outside any service tree. If,
				1521	* instead, the queue is in service, then __bfq_bfqd_reset_in_service
				1522	* will take care of putting the reference when the queue finally
				1523	* stops being served.
				1524	*/
				1525	static void bfq_forget_entity(struct bfq_service_tree *st,
				1526	struct bfq_entity *entity,
				1527	bool is_in_service)
				1528	{
				1529	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1530
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1531	entity->on_st = false;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1532	st->wsum -= entity->weight;
				1533	if (bfqq && !is_in_service)
				1534	bfq_put_queue(bfqq);
				1535	}
				1536
				1537	/**
				1538	* bfq_put_idle_entity - release the idle tree ref of an entity.
				1539	* @st: service tree for the entity.
				1540	* @entity: the entity being released.
				1541	*/
				1542	static void bfq_put_idle_entity(struct bfq_service_tree *st,
				1543	struct bfq_entity *entity)
				1544	{
				1545	bfq_idle_extract(st, entity);
				1546	bfq_forget_entity(st, entity,
				1547	entity == entity->sched_data->in_service_entity);
				1548	}
				1549
				1550	/**
				1551	* bfq_forget_idle - update the idle tree if necessary.
				1552	* @st: the service tree to act upon.
				1553	*
				1554	* To preserve the global O(log N) complexity we only remove one entry here;
				1555	* as the idle tree will not grow indefinitely this can be done safely.
				1556	*/
				1557	static void bfq_forget_idle(struct bfq_service_tree *st)
				1558	{
				1559	struct bfq_entity *first_idle = st->first_idle;
				1560	struct bfq_entity *last_idle = st->last_idle;
				1561
				1562	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
				1563	!bfq_gt(last_idle->finish, st->vtime)) {
				1564	/*
				1565	* Forget the whole idle tree, increasing the vtime past
				1566	* the last finish time of idle entities.
				1567	*/
				1568	st->vtime = last_idle->finish;
				1569	}
				1570
				1571	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
				1572	bfq_put_idle_entity(st, first_idle);
				1573	}
				1574
				1575	static struct bfq_service_tree *
				1576	__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1577	struct bfq_entity *entity)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1578	{
				1579	struct bfq_service_tree *new_st = old_st;
				1580
				1581	if (entity->prio_changed) {
				1582	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	1583	unsigned int prev_weight, new_weight;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1584	struct bfq_data *bfqd = NULL;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1585	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1586	struct bfq_sched_data *sd;
				1587	struct bfq_group *bfqg;
				1588	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1589
				1590	if (bfqq)
				1591	bfqd = bfqq->bfqd;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1592	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				1593	else {
				1594	sd = entity->my_sched_data;
				1595	bfqg = container_of(sd, struct bfq_group, sched_data);
				1596	bfqd = (struct bfq_data *)bfqg->bfqd;
				1597	}
				1598	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1599
				1600	old_st->wsum -= entity->weight;
				1601
				1602	if (entity->new_weight != entity->orig_weight) {
				1603	if (entity->new_weight < BFQ_MIN_WEIGHT \|\|
				1604	entity->new_weight > BFQ_MAX_WEIGHT) {
				1605	pr_crit("update_weight_prio: new_weight %d\n",
				1606	entity->new_weight);
				1607	if (entity->new_weight < BFQ_MIN_WEIGHT)
				1608	entity->new_weight = BFQ_MIN_WEIGHT;
				1609	else
				1610	entity->new_weight = BFQ_MAX_WEIGHT;
				1611	}
				1612	entity->orig_weight = entity->new_weight;
				1613	if (bfqq)
				1614	bfqq->ioprio =
				1615	bfq_weight_to_ioprio(entity->orig_weight);
				1616	}
				1617
				1618	if (bfqq)
				1619	bfqq->ioprio_class = bfqq->new_ioprio_class;
				1620	entity->prio_changed = 0;
				1621
				1622	/*
				1623	* NOTE: here we may be changing the weight too early,
				1624	* this will cause unfairness. The correct approach
				1625	* would have required additional complexity to defer
				1626	* weight changes to the proper time instants (i.e.,
				1627	* when entity->finish <= old_st->vtime).
				1628	*/
				1629	new_st = bfq_entity_service_tree(entity);
				1630
				1631	prev_weight = entity->weight;
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	1632	new_weight = entity->orig_weight *
				1633	(bfqq ? bfqq->wr_coeff : 1);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1634	entity->weight = new_weight;
				1635
				1636	new_st->wsum += entity->weight;
				1637
				1638	if (new_st != old_st)
				1639	entity->start = new_st->vtime;
				1640	}
				1641
				1642	return new_st;
				1643	}
				1644
/* Forward declarations of helpers used by bfq_bfqq_served(). */
static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
				1647
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1648	/**
				1649	* bfq_bfqq_served - update the scheduler status after selection for
				1650	* service.
				1651	* @bfqq: the queue being served.
				1652	* @served: bytes to transfer.
				1653	*
				1654	* NOTE: this can be optimized, as the timestamps of upper level entities
				1655	* are synchronized every time a new bfqq is selected for service. By now,
				1656	* we keep it to better check consistency.
				1657	*/
				1658	static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
				1659	{
				1660	struct bfq_entity *entity = &bfqq->entity;
				1661	struct bfq_service_tree *st;
				1662
				1663	for_each_entity(entity) {
				1664	st = bfq_entity_service_tree(entity);
				1665
				1666	entity->service += served;
				1667
				1668	st->vtime += bfq_delta(served, st->wsum);
				1669	bfq_forget_idle(st);
				1670	}
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1671	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1672	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
				1673	}
				1674
				1675	/**
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1676	* bfq_bfqq_charge_time - charge an amount of service equivalent to the length
				1677	* of the time interval during which bfqq has been in
				1678	* service.
				1679	* @bfqd: the device
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1680	* @bfqq: the queue that needs a service update.
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1681	* @time_ms: the amount of time during which the queue has received service
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1682	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1683	* If a queue does not consume its budget fast enough, then providing
				1684	* the queue with service fairness may impair throughput, more or less
				1685	* severely. For this reason, queues that consume their budget slowly
				1686	* are provided with time fairness instead of service fairness. This
				1687	* goal is achieved through the BFQ scheduling engine, even if such an
				1688	* engine works in the service, and not in the time domain. The trick
				1689	* is charging these queues with an inflated amount of service, equal
				1690	* to the amount of service that they would have received during their
				1691	* service slot if they had been fast, i.e., if their requests had
				1692	* been dispatched at a rate equal to the estimated peak rate.
				1693	*
				1694	* It is worth noting that time fairness can cause important
				1695	* distortions in terms of bandwidth distribution, on devices with
				1696	* internal queueing. The reason is that I/O requests dispatched
				1697	* during the service slot of a queue may be served after that service
				1698	* slot is finished, and may have a total processing time loosely
				1699	* correlated with the duration of the service slot. This is
				1700	* especially true for short service slots.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1701	*/
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1702	static void bfq_bfqq_charge_time(struct bfq_data bfqd, struct bfq_queue bfqq,
				1703	unsigned long time_ms)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1704	{
				1705	struct bfq_entity *entity = &bfqq->entity;
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1706	int tot_serv_to_charge = entity->service;
				1707	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1708
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1709	if (time_ms > 0 && time_ms < timeout_ms)
				1710	tot_serv_to_charge =
				1711	(bfqd->bfq_max_budget * time_ms) / timeout_ms;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1712
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	1713	if (tot_serv_to_charge < entity->service)
				1714	tot_serv_to_charge = entity->service;
				1715
				1716	/* Increase budget to avoid inconsistencies */
				1717	if (tot_serv_to_charge > entity->budget)
				1718	entity->budget = tot_serv_to_charge;
				1719
				1720	bfq_bfqq_served(bfqq,
				1721	max_t(int, 0, tot_serv_to_charge - entity->service));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1722	}
				1723
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1724	static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
				1725	struct bfq_service_tree *st,
				1726	bool backshifted)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1727	{
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	1728	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				1729
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1730	st = __bfq_entity_update_weight_prio(st, entity);
				1731	bfq_calc_finish(entity, entity->budget);
				1732
				1733	/*
				1734	* If some queues enjoy backshifting for a while, then their
				1735	* (virtual) finish timestamps may happen to become lower and
				1736	* lower than the system virtual time. In particular, if
				1737	* these queues often happen to be idle for short time
				1738	* periods, and during such time periods other queues with
				1739	* higher timestamps happen to be busy, then the backshifted
				1740	* timestamps of the former queues can become much lower than
				1741	* the system virtual time. In fact, to serve the queues with
				1742	* higher timestamps while the ones with lower timestamps are
				1743	* idle, the system virtual time may be pushed-up to much
				1744	* higher values than the finish timestamps of the idle
				1745	* queues. As a consequence, the finish timestamps of all new
				1746	* or newly activated queues may end up being much larger than
				1747	* those of lucky queues with backshifted timestamps. The
				1748	* latter queues may then monopolize the device for a lot of
				1749	* time. This would simply break service guarantees.
				1750	*
				1751	* To reduce this problem, push up a little bit the
				1752	* backshifted timestamps of the queue associated with this
				1753	* entity (only a queue can happen to have the backshifted
				1754	* flag set): just enough to let the finish timestamp of the
				1755	* queue be equal to the current value of the system virtual
				1756	* time. This may introduce a little unfairness among queues
				1757	* with backshifted timestamps, but it does not break
				1758	* worst-case fairness guarantees.
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	1759	*
				1760	* As a special case, if bfqq is weight-raised, push up
				1761	* timestamps much less, to keep very low the probability that
				1762	* this push up causes the backshifted finish timestamps of
				1763	* weight-raised queues to become higher than the backshifted
				1764	* finish timestamps of non weight-raised queues.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1765	*/
				1766	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
				1767	unsigned long delta = st->vtime - entity->finish;
				1768
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	1769	if (bfqq)
				1770	delta /= bfqq->wr_coeff;
				1771
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1772	entity->start += delta;
				1773	entity->finish += delta;
				1774	}
				1775
				1776	bfq_active_insert(st, entity);
				1777	}
				1778
				1779	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1780	* __bfq_activate_entity - handle activation of entity.
				1781	* @entity: the entity being activated.
				1782	* @non_blocking_wait_rq: true if entity was waiting for a request
				1783	*
				1784	* Called for a 'true' activation, i.e., if entity is not active and
				1785	* one of its children receives a new request.
				1786	*
				1787	* Basically, this function updates the timestamps of entity and
				1788	* inserts entity into its active tree, ater possible extracting it
				1789	* from its idle tree.
				1790	*/
				1791	static void __bfq_activate_entity(struct bfq_entity *entity,
				1792	bool non_blocking_wait_rq)
				1793	{
				1794	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1795	bool backshifted = false;
				1796	unsigned long long min_vstart;
				1797
				1798	/* See comments on bfq_fqq_update_budg_for_activation */
				1799	if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
				1800	backshifted = true;
				1801	min_vstart = entity->finish;
				1802	} else
				1803	min_vstart = st->vtime;
				1804
				1805	if (entity->tree == &st->idle) {
				1806	/*
				1807	* Must be on the idle tree, bfq_idle_extract() will
				1808	* check for that.
				1809	*/
				1810	bfq_idle_extract(st, entity);
				1811	entity->start = bfq_gt(min_vstart, entity->finish) ?
				1812	min_vstart : entity->finish;
				1813	} else {
				1814	/*
				1815	* The finish time of the entity may be invalid, and
				1816	* it is in the past for sure, otherwise the queue
				1817	* would have been on the idle tree.
				1818	*/
				1819	entity->start = min_vstart;
				1820	st->wsum += entity->weight;
				1821	/*
				1822	* entity is about to be inserted into a service tree,
				1823	* and then set in service: get a reference to make
				1824	* sure entity does not disappear until it is no
				1825	* longer in service or scheduled for service.
				1826	*/
				1827	bfq_get_entity(entity);
				1828
				1829	entity->on_st = true;
				1830	}
				1831
				1832	bfq_update_fin_time_enqueue(entity, st, backshifted);
				1833	}
				1834
				1835	/**
				1836	* __bfq_requeue_entity - handle requeueing or repositioning of an entity.
				1837	* @entity: the entity being requeued or repositioned.
				1838	*
				1839	* Requeueing is needed if this entity stops being served, which
				1840	* happens if a leaf descendant entity has expired. On the other hand,
				1841	* repositioning is needed if the next_inservice_entity for the child
				1842	* entity has changed. See the comments inside the function for
				1843	* details.
				1844	*
				1845	* Basically, this function: 1) removes entity from its active tree if
				1846	* present there, 2) updates the timestamps of entity and 3) inserts
				1847	* entity back into its active tree (in the new, right position for
				1848	* the new values of the timestamps).
				1849	*/
				1850	static void __bfq_requeue_entity(struct bfq_entity *entity)
				1851	{
				1852	struct bfq_sched_data *sd = entity->sched_data;
				1853	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1854
				1855	if (entity == sd->in_service_entity) {
				1856	/*
				1857	* We are requeueing the current in-service entity,
				1858	* which may have to be done for one of the following
				1859	* reasons:
				1860	* - entity represents the in-service queue, and the
				1861	* in-service queue is being requeued after an
				1862	* expiration;
				1863	* - entity represents a group, and its budget has
				1864	* changed because one of its child entities has
				1865	* just been either activated or requeued for some
				1866	* reason; the timestamps of the entity need then to
				1867	* be updated, and the entity needs to be enqueued
				1868	* or repositioned accordingly.
				1869	*
				1870	* In particular, before requeueing, the start time of
				1871	* the entity must be moved forward to account for the
				1872	* service that the entity has received while in
				1873	* service. This is done by the next instructions. The
				1874	* finish time will then be updated according to this
				1875	* new value of the start time, and to the budget of
				1876	* the entity.
				1877	*/
				1878	bfq_calc_finish(entity, entity->service);
				1879	entity->start = entity->finish;
				1880	/*
				1881	* In addition, if the entity had more than one child
				1882	* when set in service, then was not extracted from
				1883	* the active tree. This implies that the position of
				1884	* the entity in the active tree may need to be
				1885	* changed now, because we have just updated the start
				1886	* time of the entity, and we will update its finish
				1887	* time in a moment (the requeueing is then, more
				1888	* precisely, a repositioning in this case). To
				1889	* implement this repositioning, we: 1) dequeue the
				1890	* entity here, 2) update the finish time and
				1891	* requeue the entity according to the new
				1892	* timestamps below.
				1893	*/
				1894	if (entity->tree)
				1895	bfq_active_extract(st, entity);
				1896	} else { /* The entity is already active, and not in service */
				1897	/*
				1898	* In this case, this function gets called only if the
				1899	* next_in_service entity below this entity has
				1900	* changed, and this change has caused the budget of
				1901	* this entity to change, which, finally implies that
				1902	* the finish time of this entity must be
				1903	* updated. Such an update may cause the scheduling,
				1904	* i.e., the position in the active tree, of this
				1905	* entity to change. We handle this change by: 1)
				1906	* dequeueing the entity here, 2) updating the finish
				1907	* time and requeueing the entity according to the new
				1908	* timestamps below. This is the same approach as the
				1909	* non-extracted-entity sub-case above.
				1910	*/
				1911	bfq_active_extract(st, entity);
				1912	}
				1913
				1914	bfq_update_fin_time_enqueue(entity, st, false);
				1915	}
				1916
				1917	static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
				1918	struct bfq_sched_data *sd,
				1919	bool non_blocking_wait_rq)
				1920	{
				1921	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1922
				1923	if (sd->in_service_entity == entity \|\| entity->tree == &st->active)
				1924	/*
				1925	* in service or already queued on the active tree,
				1926	* requeue or reposition
				1927	*/
				1928	__bfq_requeue_entity(entity);
				1929	else
				1930	/*
				1931	* Not in service and not queued on its active tree:
				1932	* the activity is idle and this is a true activation.
				1933	*/
				1934	__bfq_activate_entity(entity, non_blocking_wait_rq);
				1935	}
				1936
				1937
				1938	/**
				1939	* bfq_activate_entity - activate or requeue an entity representing a bfq_queue,
				1940	* and activate, requeue or reposition all ancestors
				1941	* for which such an update becomes necessary.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1942	* @entity: the entity to activate.
				1943	* @non_blocking_wait_rq: true if this entity was waiting for a request
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1944	* @requeue: true if this is a requeue, which implies that bfqq is
				1945	* being expired; thus ALL its ancestors stop being served and must
				1946	* therefore be requeued
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1947	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1948	static void bfq_activate_requeue_entity(struct bfq_entity *entity,
				1949	bool non_blocking_wait_rq,
				1950	bool requeue)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1951	{
				1952	struct bfq_sched_data *sd;
				1953
				1954	for_each_entity(entity) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1955	sd = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1956	__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
				1957
				1958	if (!bfq_update_next_in_service(sd, entity) && !requeue)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1959	break;
				1960	}
				1961	}
				1962
				1963	/**
				1964	* __bfq_deactivate_entity - deactivate an entity from its service tree.
				1965	* @entity: the entity to deactivate.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1966	* @ins_into_idle_tree: if false, the entity will not be put into the
				1967	* idle tree.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1968	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1969	* Deactivates an entity, independently from its previous state. Must
				1970	* be invoked only if entity is on a service tree. Extracts the entity
				1971	* from that tree, and if necessary and allowed, puts it on the idle
				1972	* tree.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1973	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1974	static bool __bfq_deactivate_entity(struct bfq_entity *entity,
				1975	bool ins_into_idle_tree)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1976	{
				1977	struct bfq_sched_data *sd = entity->sched_data;
				1978	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
				1979	int is_in_service = entity == sd->in_service_entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1980
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1981	if (!entity->on_st) /* entity never activated, or already inactive */
				1982	return false;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1983
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1984	if (is_in_service)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1985	bfq_calc_finish(entity, entity->service);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1986
				1987	if (entity->tree == &st->active)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1988	bfq_active_extract(st, entity);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1989	else if (!is_in_service && entity->tree == &st->idle)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1990	bfq_idle_extract(st, entity);
				1991
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1992	if (!ins_into_idle_tree \|\| !bfq_gt(entity->finish, st->vtime))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1993	bfq_forget_entity(st, entity, is_in_service);
				1994	else
				1995	bfq_idle_insert(st, entity);
				1996
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	1997	return true;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	1998	}
				1999
				2000	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2001	* bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2002	* @entity: the entity to deactivate.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2003	* @ins_into_idle_tree: true if the entity can be put on the idle tree
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2004	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2005	static void bfq_deactivate_entity(struct bfq_entity *entity,
				2006	bool ins_into_idle_tree,
				2007	bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2008	{
				2009	struct bfq_sched_data *sd;
				2010	struct bfq_entity *parent = NULL;
				2011
				2012	for_each_entity_safe(entity, parent) {
				2013	sd = entity->sched_data;
				2014
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2015	if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2016	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2017	* entity is not in any tree any more, so
				2018	* this deactivation is a no-op, and there is
				2019	* nothing to change for upper-level entities
				2020	* (in case of expiration, this can never
				2021	* happen).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2022	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2023	return;
				2024	}
				2025
				2026	if (sd->next_in_service == entity)
				2027	/*
				2028	* entity was the next_in_service entity,
				2029	* then, since entity has just been
				2030	* deactivated, a new one must be found.
				2031	*/
				2032	bfq_update_next_in_service(sd, NULL);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2033
				2034	if (sd->next_in_service)
				2035	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2036	* The parent entity is still backlogged,
				2037	* because next_in_service is not NULL. So, no
				2038	* further upwards deactivation must be
				2039	* performed. Yet, next_in_service has
				2040	* changed. Then the schedule does need to be
				2041	* updated upwards.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2042	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2043	break;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2044
				2045	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2046	* If we get here, then the parent is no more
				2047	* backlogged and we need to propagate the
				2048	* deactivation upwards. Thus let the loop go on.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2049	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2050
				2051	/*
				2052	* Also let parent be queued into the idle tree on
				2053	* deactivation, to preserve service guarantees, and
				2054	* assuming that who invoked this function does not
				2055	* need parent entities too to be removed completely.
				2056	*/
				2057	ins_into_idle_tree = true;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2058	}
				2059
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2060	/*
				2061	* If the deactivation loop is fully executed, then there are
				2062	* no more entities to touch and next loop is not executed at
				2063	* all. Otherwise, requeue remaining entities if they are
				2064	* about to stop receiving service, or reposition them if this
				2065	* is not the case.
				2066	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2067	entity = parent;
				2068	for_each_entity(entity) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2069	/*
				2070	* Invoke __bfq_requeue_entity on entity, even if
				2071	* already active, to requeue/reposition it in the
				2072	* active tree (because sd->next_in_service has
				2073	* changed)
				2074	*/
				2075	__bfq_requeue_entity(entity);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2076
				2077	sd = entity->sched_data;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2078	if (!bfq_update_next_in_service(sd, entity) &&
				2079	!expiration)
				2080	/*
				2081	* next_in_service unchanged or not causing
				2082	* any change in entity->parent->sd, and no
				2083	* requeueing needed for expiration: stop
				2084	* here.
				2085	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2086	break;
				2087	}
				2088	}
				2089
				2090	/**
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2091	* bfq_calc_vtime_jump - compute the value to which the vtime should jump,
				2092	* if needed, to have at least one entity eligible.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2093	* @st: the service tree to act upon.
				2094	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2095	* Assumes that st is not empty.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2096	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2097	static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2098	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2099	struct bfq_entity *root_entity = bfq_root_active_entity(&st->active);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2100
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2101	if (bfq_gt(root_entity->min_start, st->vtime))
				2102	return root_entity->min_start;
				2103
				2104	return st->vtime;
				2105	}
				2106
				2107	static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
				2108	{
				2109	if (new_value > st->vtime) {
				2110	st->vtime = new_value;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2111	bfq_forget_idle(st);
				2112	}
				2113	}
				2114
				2115	/**
				2116	* bfq_first_active_entity - find the eligible entity with
				2117	* the smallest finish time
				2118	* @st: the service tree to select from.
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2119	* @vtime: the system virtual to use as a reference for eligibility
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2120	*
				2121	* This function searches the first schedulable entity, starting from the
				2122	* root of the tree and going on the left every time on this side there is
				2123	* a subtree with at least one eligible (start >= vtime) entity. The path on
				2124	* the right is followed only if a) the left subtree contains no eligible
				2125	* entities and b) no eligible entity has been found yet.
				2126	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2127	static struct bfq_entity bfq_first_active_entity(struct bfq_service_tree st,
				2128	u64 vtime)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2129	{
				2130	struct bfq_entity entry, first = NULL;
				2131	struct rb_node *node = st->active.rb_node;
				2132
				2133	while (node) {
				2134	entry = rb_entry(node, struct bfq_entity, rb_node);
				2135	left:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2136	if (!bfq_gt(entry->start, vtime))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2137	first = entry;
				2138
				2139	if (node->rb_left) {
				2140	entry = rb_entry(node->rb_left,
				2141	struct bfq_entity, rb_node);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2142	if (!bfq_gt(entry->min_start, vtime)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2143	node = node->rb_left;
				2144	goto left;
				2145	}
				2146	}
				2147	if (first)
				2148	break;
				2149	node = node->rb_right;
				2150	}
				2151
				2152	return first;
				2153	}
				2154
				2155	/**
				2156	* __bfq_lookup_next_entity - return the first eligible entity in @st.
				2157	* @st: the service tree.
				2158	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2159	* If there is no in-service entity for the sched_data st belongs to,
				2160	* then return the entity that will be set in service if:
				2161	* 1) the parent entity this st belongs to is set in service;
				2162	* 2) no entity belonging to such parent entity undergoes a state change
				2163	* that would influence the timestamps of the entity (e.g., becomes idle,
				2164	* becomes backlogged, changes its budget, ...).
				2165	*
				2166	* In this first case, update the virtual time in @st too (see the
				2167	* comments on this update inside the function).
				2168	*
				2169	* In constrast, if there is an in-service entity, then return the
				2170	* entity that would be set in service if not only the above
				2171	* conditions, but also the next one held true: the currently
				2172	* in-service entity, on expiration,
				2173	* 1) gets a finish time equal to the current one, or
				2174	* 2) is not eligible any more, or
				2175	* 3) is idle.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2176	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2177	static struct bfq_entity *
				2178	__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2179	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2180	struct bfq_entity *entity;
				2181	u64 new_vtime;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2182
				2183	if (RB_EMPTY_ROOT(&st->active))
				2184	return NULL;
				2185
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2186	/*
				2187	* Get the value of the system virtual time for which at
				2188	* least one entity is eligible.
				2189	*/
				2190	new_vtime = bfq_calc_vtime_jump(st);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2191
				2192	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2193	* If there is no in-service entity for the sched_data this
				2194	* active tree belongs to, then push the system virtual time
				2195	* up to the value that guarantees that at least one entity is
				2196	* eligible. If, instead, there is an in-service entity, then
				2197	* do not make any such update, because there is already an
				2198	* eligible entity, namely the in-service one (even if the
				2199	* entity is not on st, because it was extracted when set in
				2200	* service).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2201	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2202	if (!in_service)
				2203	bfq_update_vtime(st, new_vtime);
				2204
				2205	entity = bfq_first_active_entity(st, new_vtime);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2206
				2207	return entity;
				2208	}
				2209
				2210	/**
				2211	* bfq_lookup_next_entity - return the first eligible entity in @sd.
				2212	* @sd: the sched_data.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2213	*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2214	* This function is invoked when there has been a change in the trees
				2215	* for sd, and we need know what is the new next entity after this
				2216	* change.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2217	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2218	static struct bfq_entity bfq_lookup_next_entity(struct bfq_sched_data sd)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2219	{
				2220	struct bfq_service_tree *st = sd->service_tree;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2221	struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1);
				2222	struct bfq_entity *entity = NULL;
				2223	int class_idx = 0;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2224
				2225	/*
				2226	* Choose from idle class, if needed to guarantee a minimum
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2227	* bandwidth to this class (and if there is some active entity
				2228	* in idle class). This should also mitigate
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2229	* priority-inversion problems in case a low priority task is
				2230	* holding file system resources.
				2231	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2232	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
				2233	BFQ_CL_IDLE_TIMEOUT)) {
				2234	if (!RB_EMPTY_ROOT(&idle_class_st->active))
				2235	class_idx = BFQ_IOPRIO_CLASSES - 1;
				2236	/* About to be served if backlogged, or not yet backlogged */
				2237	sd->bfq_class_idle_last_service = jiffies;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2238	}
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2239
				2240	/*
				2241	* Find the next entity to serve for the highest-priority
				2242	* class, unless the idle class needs to be served.
				2243	*/
				2244	for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
				2245	entity = __bfq_lookup_next_entity(st + class_idx,
				2246	sd->in_service_entity);
				2247
				2248	if (entity)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2249	break;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2250	}
				2251
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2252	if (!entity)
				2253	return NULL;
				2254
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2255	return entity;
				2256	}
				2257
				2258	static bool next_queue_may_preempt(struct bfq_data *bfqd)
				2259	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2260	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2261
				2262	return sd->next_in_service != sd->in_service_entity;
				2263	}
				2264
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2265	/*
				2266	* Get next queue for service.
				2267	*/
				2268	static struct bfq_queue bfq_get_next_queue(struct bfq_data bfqd)
				2269	{
				2270	struct bfq_entity *entity = NULL;
				2271	struct bfq_sched_data *sd;
				2272	struct bfq_queue *bfqq;
				2273
				2274	if (bfqd->busy_queues == 0)
				2275	return NULL;
				2276
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2277	/*
				2278	* Traverse the path from the root to the leaf entity to
				2279	* serve. Set in service all the entities visited along the
				2280	* way.
				2281	*/
				2282	sd = &bfqd->root_group->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2283	for (; sd ; sd = entity->my_sched_data) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2284	/*
				2285	* WARNING. We are about to set the in-service entity
				2286	* to sd->next_in_service, i.e., to the (cached) value
				2287	* returned by bfq_lookup_next_entity(sd) the last
				2288	* time it was invoked, i.e., the last time when the
				2289	* service order in sd changed as a consequence of the
				2290	* activation or deactivation of an entity. In this
				2291	* respect, if we execute bfq_lookup_next_entity(sd)
				2292	* in this very moment, it may, although with low
				2293	* probability, yield a different entity than that
				2294	* pointed to by sd->next_in_service. This rare event
				2295	* happens in case there was no CLASS_IDLE entity to
				2296	* serve for sd when bfq_lookup_next_entity(sd) was
				2297	* invoked for the last time, while there is now one
				2298	* such entity.
				2299	*
				2300	* If the above event happens, then the scheduling of
				2301	* such entity in CLASS_IDLE is postponed until the
				2302	* service of the sd->next_in_service entity
				2303	* finishes. In fact, when the latter is expired,
				2304	* bfq_lookup_next_entity(sd) gets called again,
				2305	* exactly to update sd->next_in_service.
				2306	*/
				2307
				2308	/* Make next_in_service entity become in_service_entity */
				2309	entity = sd->next_in_service;
				2310	sd->in_service_entity = entity;
				2311
				2312	/*
				2313	* Reset the accumulator of the amount of service that
				2314	* the entity is about to receive.
				2315	*/
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2316	entity->service = 0;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2317
				2318	/*
				2319	* If entity is no longer a candidate for next
				2320	* service, then we extract it from its active tree,
				2321	* for the following reason. To further boost the
				2322	* throughput in some special case, BFQ needs to know
				2323	* which is the next candidate entity to serve, while
				2324	* there is already an entity in service. In this
				2325	* respect, to make it easy to compute/update the next
				2326	* candidate entity to serve after the current
				2327	* candidate has been set in service, there is a case
				2328	* where it is necessary to extract the current
				2329	* candidate from its service tree. Such a case is
				2330	* when the entity just set in service cannot be also
				2331	* a candidate for next service. Details about when
				2332	* this conditions holds are reported in the comments
				2333	* on the function bfq_no_longer_next_in_service()
				2334	* invoked below.
				2335	*/
				2336	if (bfq_no_longer_next_in_service(entity))
				2337	bfq_active_extract(bfq_entity_service_tree(entity),
				2338	entity);
				2339
				2340	/*
				2341	* For the same reason why we may have just extracted
				2342	* entity from its active tree, we may need to update
				2343	* next_in_service for the sched_data of entity too,
				2344	* regardless of whether entity has been extracted.
				2345	* In fact, even if entity has not been extracted, a
				2346	* descendant entity may get extracted. Such an event
				2347	* would cause a change in next_in_service for the
				2348	* level of the descendant entity, and thus possibly
				2349	* back to upper levels.
				2350	*
				2351	* We cannot perform the resulting needed update
				2352	* before the end of this loop, because, to know which
				2353	* is the correct next-to-serve candidate entity for
				2354	* each level, we need first to find the leaf entity
				2355	* to set in service. In fact, only after we know
				2356	* which is the next-to-serve leaf entity, we can
				2357	* discover whether the parent entity of the leaf
				2358	* entity becomes the next-to-serve, and so on.
				2359	*/
				2360
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2361	}
				2362
				2363	bfqq = bfq_entity_to_bfqq(entity);
				2364
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2365	/*
				2366	* We can finally update all next-to-serve entities along the
				2367	* path from the leaf entity just set in service to the root.
				2368	*/
				2369	for_each_entity(entity) {
				2370	struct bfq_sched_data *sd = entity->sched_data;
				2371
				2372	if (!bfq_update_next_in_service(sd, NULL))
				2373	break;
				2374	}
				2375
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2376	return bfqq;
				2377	}
				2378
				2379	static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
				2380	{
				2381	struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue;
				2382	struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2383	struct bfq_entity *entity = in_serv_entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2384
				2385	if (bfqd->in_service_bic) {
				2386	put_io_context(bfqd->in_service_bic->icq.ioc);
				2387	bfqd->in_service_bic = NULL;
				2388	}
				2389
				2390	bfq_clear_bfqq_wait_request(in_serv_bfqq);
				2391	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
				2392	bfqd->in_service_queue = NULL;
				2393
				2394	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2395	* When this function is called, all in-service entities have
				2396	* been properly deactivated or requeued, so we can safely
				2397	* execute the final step: reset in_service_entity along the
				2398	* path from entity to the root.
				2399	*/
				2400	for_each_entity(entity)
				2401	entity->sched_data->in_service_entity = NULL;
				2402
				2403	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2404	* in_serv_entity is no longer in service, so, if it is in no
				2405	* service tree either, then release the service reference to
				2406	* the queue it represents (taken with bfq_get_entity).
				2407	*/
				2408	if (!in_serv_entity->on_st)
				2409	bfq_put_queue(in_serv_bfqq);
				2410	}
				2411
				2412	static void bfq_deactivate_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2413	bool ins_into_idle_tree, bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2414	{
				2415	struct bfq_entity *entity = &bfqq->entity;
				2416
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2417	bfq_deactivate_entity(entity, ins_into_idle_tree, expiration);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2418	}
				2419
				2420	static void bfq_activate_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				2421	{
				2422	struct bfq_entity *entity = &bfqq->entity;
				2423
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2424	bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq),
				2425	false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2426	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
				2427	}
				2428
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2429	static void bfq_requeue_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				2430	{
				2431	struct bfq_entity *entity = &bfqq->entity;
				2432
				2433	bfq_activate_requeue_entity(entity, false,
				2434	bfqq == bfqd->in_service_queue);
				2435	}
				2436
				2437	static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
				2438
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2439	/*
				2440	* Called when the bfqq no longer has requests pending, remove it from
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2441	* the service tree. As a special case, it can be invoked during an
				2442	* expiration.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2443	*/
				2444	static void bfq_del_bfqq_busy(struct bfq_data bfqd, struct bfq_queue bfqq,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2445	bool expiration)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2446	{
				2447	bfq_log_bfqq(bfqd, bfqq, "del from busy");
				2448
				2449	bfq_clear_bfqq_busy(bfqq);
				2450
				2451	bfqd->busy_queues--;
				2452
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2453	bfqg_stats_update_dequeue(bfqq_group(bfqq));
				2454
				2455	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2456	}
				2457
				2458	/*
				2459	* Called when an inactive queue receives a new request.
				2460	*/
				2461	static void bfq_add_bfqq_busy(struct bfq_data bfqd, struct bfq_queue bfqq)
				2462	{
				2463	bfq_log_bfqq(bfqd, bfqq, "add to busy");
				2464
				2465	bfq_activate_bfqq(bfqd, bfqq);
				2466
				2467	bfq_mark_bfqq_busy(bfqq);
				2468	bfqd->busy_queues++;
				2469	}
				2470
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2471	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				2472
				2473	/* bfqg stats flags */
				2474	enum bfqg_stats_flags {
				2475	BFQG_stats_waiting = 0,
				2476	BFQG_stats_idling,
				2477	BFQG_stats_empty,
				2478	};
				2479
				2480	#define BFQG_FLAG_FNS(name) \
				2481	static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
				2482	{ \
				2483	stats->flags \|= (1 << BFQG_stats_##name); \
				2484	} \
				2485	static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
				2486	{ \
				2487	stats->flags &= ~(1 << BFQG_stats_##name); \
				2488	} \
				2489	static int bfqg_stats_##name(struct bfqg_stats *stats) \
				2490	{ \
				2491	return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
				2492	} \
				2493
				2494	BFQG_FLAG_FNS(waiting)
				2495	BFQG_FLAG_FNS(idling)
				2496	BFQG_FLAG_FNS(empty)
				2497	#undef BFQG_FLAG_FNS
				2498
				2499	/* This should be called with the queue_lock held. */
				2500	static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
				2501	{
				2502	unsigned long long now;
				2503
				2504	if (!bfqg_stats_waiting(stats))
				2505	return;
				2506
				2507	now = sched_clock();
				2508	if (time_after64(now, stats->start_group_wait_time))
				2509	blkg_stat_add(&stats->group_wait_time,
				2510	now - stats->start_group_wait_time);
				2511	bfqg_stats_clear_waiting(stats);
				2512	}
				2513
				2514	/* This should be called with the queue_lock held. */
				2515	static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				2516	struct bfq_group *curr_bfqg)
				2517	{
				2518	struct bfqg_stats *stats = &bfqg->stats;
				2519
				2520	if (bfqg_stats_waiting(stats))
				2521	return;
				2522	if (bfqg == curr_bfqg)
				2523	return;
				2524	stats->start_group_wait_time = sched_clock();
				2525	bfqg_stats_mark_waiting(stats);
				2526	}
				2527
				2528	/* This should be called with the queue_lock held. */
				2529	static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
				2530	{
				2531	unsigned long long now;
				2532
				2533	if (!bfqg_stats_empty(stats))
				2534	return;
				2535
				2536	now = sched_clock();
				2537	if (time_after64(now, stats->start_empty_time))
				2538	blkg_stat_add(&stats->empty_time,
				2539	now - stats->start_empty_time);
				2540	bfqg_stats_clear_empty(stats);
				2541	}
				2542
				2543	static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
				2544	{
				2545	blkg_stat_add(&bfqg->stats.dequeue, 1);
				2546	}
				2547
				2548	static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
				2549	{
				2550	struct bfqg_stats *stats = &bfqg->stats;
				2551
				2552	if (blkg_rwstat_total(&stats->queued))
				2553	return;
				2554
				2555	/*
				2556	* group is already marked empty. This can happen if bfqq got new
				2557	* request in parent group and moved to this group while being added
				2558	* to service tree. Just ignore the event and move on.
				2559	*/
				2560	if (bfqg_stats_empty(stats))
				2561	return;
				2562
				2563	stats->start_empty_time = sched_clock();
				2564	bfqg_stats_mark_empty(stats);
				2565	}
				2566
				2567	static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
				2568	{
				2569	struct bfqg_stats *stats = &bfqg->stats;
				2570
				2571	if (bfqg_stats_idling(stats)) {
				2572	unsigned long long now = sched_clock();
				2573
				2574	if (time_after64(now, stats->start_idle_time))
				2575	blkg_stat_add(&stats->idle_time,
				2576	now - stats->start_idle_time);
				2577	bfqg_stats_clear_idling(stats);
				2578	}
				2579	}
				2580
				2581	static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
				2582	{
				2583	struct bfqg_stats *stats = &bfqg->stats;
				2584
				2585	stats->start_idle_time = sched_clock();
				2586	bfqg_stats_mark_idling(stats);
				2587	}
				2588
				2589	static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
				2590	{
				2591	struct bfqg_stats *stats = &bfqg->stats;
				2592
				2593	blkg_stat_add(&stats->avg_queue_size_sum,
				2594	blkg_rwstat_total(&stats->queued));
				2595	blkg_stat_add(&stats->avg_queue_size_samples, 1);
				2596	bfqg_stats_update_group_wait_time(stats);
				2597	}
				2598
				2599	/*
				2600	* blk-cgroup policy-related handlers
				2601	* The following functions help in converting between blk-cgroup
				2602	* internal structures and BFQ-specific structures.
				2603	*/
				2604
				2605	static struct bfq_group pd_to_bfqg(struct blkg_policy_data pd)
				2606	{
				2607	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
				2608	}
				2609
				2610	static struct blkcg_gq bfqg_to_blkg(struct bfq_group bfqg)
				2611	{
				2612	return pd_to_blkg(&bfqg->pd);
				2613	}
				2614
				2615	static struct blkcg_policy blkcg_policy_bfq;
				2616
				2617	static struct bfq_group blkg_to_bfqg(struct blkcg_gq blkg)
				2618	{
				2619	return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq));
				2620	}
				2621
				2622	/*
				2623	* bfq_group handlers
				2624	* The following functions help in navigating the bfq_group hierarchy
				2625	* by allowing to find the parent of a bfq_group or the bfq_group
				2626	* associated to a bfq_queue.
				2627	*/
				2628
				2629	static struct bfq_group bfqg_parent(struct bfq_group bfqg)
				2630	{
				2631	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
				2632
				2633	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
				2634	}
				2635
				2636	static struct bfq_group bfqq_group(struct bfq_queue bfqq)
				2637	{
				2638	struct bfq_entity *group_entity = bfqq->entity.parent;
				2639
				2640	return group_entity ? container_of(group_entity, struct bfq_group,
				2641	entity) :
				2642	bfqq->bfqd->root_group;
				2643	}
				2644
				2645	/*
				2646	* The following two functions handle get and put of a bfq_group by
				2647	* wrapping the related blk-cgroup hooks.
				2648	*/
				2649
				2650	static void bfqg_get(struct bfq_group *bfqg)
				2651	{
				2652	return blkg_get(bfqg_to_blkg(bfqg));
				2653	}
				2654
				2655	static void bfqg_put(struct bfq_group *bfqg)
				2656	{
				2657	return blkg_put(bfqg_to_blkg(bfqg));
				2658	}
				2659
				2660	static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
				2661	struct bfq_queue *bfqq,
				2662	unsigned int op)
				2663	{
				2664	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
				2665	bfqg_stats_end_empty_time(&bfqg->stats);
				2666	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
				2667	bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
				2668	}
				2669
				2670	static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
				2671	{
				2672	blkg_rwstat_add(&bfqg->stats.queued, op, -1);
				2673	}
				2674
				2675	static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
				2676	{
				2677	blkg_rwstat_add(&bfqg->stats.merged, op, 1);
				2678	}
				2679
				2680	static void bfqg_stats_update_completion(struct bfq_group *bfqg,
				2681	uint64_t start_time, uint64_t io_start_time,
				2682	unsigned int op)
				2683	{
				2684	struct bfqg_stats *stats = &bfqg->stats;
				2685	unsigned long long now = sched_clock();
				2686
				2687	if (time_after64(now, io_start_time))
				2688	blkg_rwstat_add(&stats->service_time, op,
				2689	now - io_start_time);
				2690	if (time_after64(io_start_time, start_time))
				2691	blkg_rwstat_add(&stats->wait_time, op,
				2692	io_start_time - start_time);
				2693	}
				2694
				2695	/* @stats = 0 */
				2696	static void bfqg_stats_reset(struct bfqg_stats *stats)
				2697	{
				2698	/* queued stats shouldn't be cleared */
				2699	blkg_rwstat_reset(&stats->merged);
				2700	blkg_rwstat_reset(&stats->service_time);
				2701	blkg_rwstat_reset(&stats->wait_time);
				2702	blkg_stat_reset(&stats->time);
				2703	blkg_stat_reset(&stats->avg_queue_size_sum);
				2704	blkg_stat_reset(&stats->avg_queue_size_samples);
				2705	blkg_stat_reset(&stats->dequeue);
				2706	blkg_stat_reset(&stats->group_wait_time);
				2707	blkg_stat_reset(&stats->idle_time);
				2708	blkg_stat_reset(&stats->empty_time);
				2709	}
				2710
				2711	/* @to += @from */
				2712	static void bfqg_stats_add_aux(struct bfqg_stats to, struct bfqg_stats from)
				2713	{
				2714	if (!to \|\| !from)
				2715	return;
				2716
				2717	/* queued stats shouldn't be cleared */
				2718	blkg_rwstat_add_aux(&to->merged, &from->merged);
				2719	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
				2720	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
				2721	blkg_stat_add_aux(&from->time, &from->time);
				2722	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
				2723	blkg_stat_add_aux(&to->avg_queue_size_samples,
				2724	&from->avg_queue_size_samples);
				2725	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
				2726	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
				2727	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
				2728	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
				2729	}
				2730
				2731	/*
				2732	* Transfer @bfqg's stats to its parent's aux counts so that the ancestors'
				2733	* recursive stats can still account for the amount used by this bfqg after
				2734	* it's gone.
				2735	*/
				2736	static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
				2737	{
				2738	struct bfq_group *parent;
				2739
				2740	if (!bfqg) /* root_group */
				2741	return;
				2742
				2743	parent = bfqg_parent(bfqg);
				2744
				2745	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
				2746
				2747	if (unlikely(!parent))
				2748	return;
				2749
				2750	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
				2751	bfqg_stats_reset(&bfqg->stats);
				2752	}
				2753
				2754	static void bfq_init_entity(struct bfq_entity *entity,
				2755	struct bfq_group *bfqg)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2756	{
				2757	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				2758
				2759	entity->weight = entity->new_weight;
				2760	entity->orig_weight = entity->new_weight;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2761	if (bfqq) {
				2762	bfqq->ioprio = bfqq->new_ioprio;
				2763	bfqq->ioprio_class = bfqq->new_ioprio_class;
				2764	bfqg_get(bfqg);
				2765	}
				2766	entity->parent = bfqg->my_entity; /* NULL for root group */
				2767	entity->sched_data = &bfqg->sched_data;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	2768	}
				2769
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	2770	static void bfqg_stats_exit(struct bfqg_stats *stats)
				2771	{
				2772	blkg_rwstat_exit(&stats->merged);
				2773	blkg_rwstat_exit(&stats->service_time);
				2774	blkg_rwstat_exit(&stats->wait_time);
				2775	blkg_rwstat_exit(&stats->queued);
				2776	blkg_stat_exit(&stats->time);
				2777	blkg_stat_exit(&stats->avg_queue_size_sum);
				2778	blkg_stat_exit(&stats->avg_queue_size_samples);
				2779	blkg_stat_exit(&stats->dequeue);
				2780	blkg_stat_exit(&stats->group_wait_time);
				2781	blkg_stat_exit(&stats->idle_time);
				2782	blkg_stat_exit(&stats->empty_time);
				2783	}
				2784
				2785	static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
				2786	{
				2787	if (blkg_rwstat_init(&stats->merged, gfp) \|\|
				2788	blkg_rwstat_init(&stats->service_time, gfp) \|\|
				2789	blkg_rwstat_init(&stats->wait_time, gfp) \|\|
				2790	blkg_rwstat_init(&stats->queued, gfp) \|\|
				2791	blkg_stat_init(&stats->time, gfp) \|\|
				2792	blkg_stat_init(&stats->avg_queue_size_sum, gfp) \|\|
				2793	blkg_stat_init(&stats->avg_queue_size_samples, gfp) \|\|
				2794	blkg_stat_init(&stats->dequeue, gfp) \|\|
				2795	blkg_stat_init(&stats->group_wait_time, gfp) \|\|
				2796	blkg_stat_init(&stats->idle_time, gfp) \|\|
				2797	blkg_stat_init(&stats->empty_time, gfp)) {
				2798	bfqg_stats_exit(stats);
				2799	return -ENOMEM;
				2800	}
				2801
				2802	return 0;
				2803	}
				2804
				2805	static struct bfq_group_data cpd_to_bfqgd(struct blkcg_policy_data cpd)
				2806	{
				2807	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
				2808	}
				2809
				2810	static struct bfq_group_data blkcg_to_bfqgd(struct blkcg blkcg)
				2811	{
				2812	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
				2813	}
				2814
				2815	static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
				2816	{
				2817	struct bfq_group_data *bgd;
				2818
				2819	bgd = kzalloc(sizeof(*bgd), gfp);
				2820	if (!bgd)
				2821	return NULL;
				2822	return &bgd->pd;
				2823	}
				2824
				2825	static void bfq_cpd_init(struct blkcg_policy_data *cpd)
				2826	{
				2827	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
				2828
				2829	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
				2830	CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
				2831	}
				2832
				2833	static void bfq_cpd_free(struct blkcg_policy_data *cpd)
				2834	{
				2835	kfree(cpd_to_bfqgd(cpd));
				2836	}
				2837
				2838	static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
				2839	{
				2840	struct bfq_group *bfqg;
				2841
				2842	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
				2843	if (!bfqg)
				2844	return NULL;
				2845
				2846	if (bfqg_stats_init(&bfqg->stats, gfp)) {
				2847	kfree(bfqg);
				2848	return NULL;
				2849	}
				2850
				2851	return &bfqg->pd;
				2852	}
				2853
				2854	static void bfq_pd_init(struct blkg_policy_data *pd)
				2855	{
				2856	struct blkcg_gq *blkg = pd_to_blkg(pd);
				2857	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
				2858	struct bfq_data *bfqd = blkg->q->elevator->elevator_data;
				2859	struct bfq_entity *entity = &bfqg->entity;
				2860	struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg);
				2861
				2862	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
				2863	entity->my_sched_data = &bfqg->sched_data;
				2864	bfqg->my_entity = entity; /*
				2865	* the root_group's will be set to NULL
				2866	* in bfq_init_queue()
				2867	*/
				2868	bfqg->bfqd = bfqd;
				2869	}
				2870
				2871	static void bfq_pd_free(struct blkg_policy_data *pd)
				2872	{
				2873	struct bfq_group *bfqg = pd_to_bfqg(pd);
				2874
				2875	bfqg_stats_exit(&bfqg->stats);
				2876	return kfree(bfqg);
				2877	}
				2878
				2879	static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
				2880	{
				2881	struct bfq_group *bfqg = pd_to_bfqg(pd);
				2882
				2883	bfqg_stats_reset(&bfqg->stats);
				2884	}
				2885
				2886	static void bfq_group_set_parent(struct bfq_group *bfqg,
				2887	struct bfq_group *parent)
				2888	{
				2889	struct bfq_entity *entity;
				2890
				2891	entity = &bfqg->entity;
				2892	entity->parent = parent->my_entity;
				2893	entity->sched_data = &parent->sched_data;
				2894	}
				2895
				2896	static struct bfq_group bfq_lookup_bfqg(struct bfq_data bfqd,
				2897	struct blkcg *blkcg)
				2898	{
				2899	struct blkcg_gq *blkg;
				2900
				2901	blkg = blkg_lookup(blkcg, bfqd->queue);
				2902	if (likely(blkg))
				2903	return blkg_to_bfqg(blkg);
				2904	return NULL;
				2905	}
				2906
				2907	static struct bfq_group bfq_find_set_group(struct bfq_data bfqd,
				2908	struct blkcg *blkcg)
				2909	{
				2910	struct bfq_group bfqg, parent;
				2911	struct bfq_entity *entity;
				2912
				2913	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
				2914
				2915	if (unlikely(!bfqg))
				2916	return NULL;
				2917
				2918	/*
				2919	* Update chain of bfq_groups as we might be handling a leaf group
				2920	* which, along with some of its relatives, has not been hooked yet
				2921	* to the private hierarchy of BFQ.
				2922	*/
				2923	entity = &bfqg->entity;
				2924	for_each_entity(entity) {
				2925	bfqg = container_of(entity, struct bfq_group, entity);
				2926	if (bfqg != bfqd->root_group) {
				2927	parent = bfqg_parent(bfqg);
				2928	if (!parent)
				2929	parent = bfqd->root_group;
				2930	bfq_group_set_parent(bfqg, parent);
				2931	}
				2932	}
				2933
				2934	return bfqg;
				2935	}
				2936
				2937	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				2938	struct bfq_queue *bfqq,
				2939	bool compensate,
				2940	enum bfqq_expiration reason);
				2941
				2942	/**
				2943	* bfq_bfqq_move - migrate @bfqq to @bfqg.
				2944	* @bfqd: queue descriptor.
				2945	* @bfqq: the queue to move.
				2946	* @bfqg: the group to move to.
				2947	*
				2948	* Move @bfqq to @bfqg, deactivating it from its old group and reactivating
				2949	* it on the new one. Avoid putting the entity on the old group idle tree.
				2950	*
				2951	* Must be called under the queue lock; the cgroup owning @bfqg must
				2952	* not disappear (by now this just means that we are called under
				2953	* rcu_read_lock()).
				2954	*/
				2955	static void bfq_bfqq_move(struct bfq_data bfqd, struct bfq_queue bfqq,
				2956	struct bfq_group *bfqg)
				2957	{
				2958	struct bfq_entity *entity = &bfqq->entity;
				2959
				2960	/* If bfqq is empty, then bfq_bfqq_expire also invokes
				2961	* bfq_del_bfqq_busy, thereby removing bfqq and its entity
				2962	* from data structures related to current group. Otherwise we
				2963	* need to remove bfqq explicitly with bfq_deactivate_bfqq, as
				2964	* we do below.
				2965	*/
				2966	if (bfqq == bfqd->in_service_queue)
				2967	bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				2968	false, BFQQE_PREEMPTED);
				2969
				2970	if (bfq_bfqq_busy(bfqq))
				2971	bfq_deactivate_bfqq(bfqd, bfqq, false, false);
				2972	else if (entity->on_st)
				2973	bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
				2974	bfqg_put(bfqq_group(bfqq));
				2975
				2976	/*
				2977	* Here we use a reference to bfqg. We don't need a refcounter
				2978	* as the cgroup reference will not be dropped, so that its
				2979	* destroy() callback will not be invoked.
				2980	*/
				2981	entity->parent = bfqg->my_entity;
				2982	entity->sched_data = &bfqg->sched_data;
				2983	bfqg_get(bfqg);
				2984
				2985	if (bfq_bfqq_busy(bfqq))
				2986	bfq_activate_bfqq(bfqd, bfqq);
				2987
				2988	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
				2989	bfq_schedule_dispatch(bfqd);
				2990	}
				2991
				2992	/**
				2993	* __bfq_bic_change_cgroup - move @bic to @cgroup.
				2994	* @bfqd: the queue descriptor.
				2995	* @bic: the bic to move.
				2996	* @blkcg: the blk-cgroup to move to.
				2997	*
				2998	* Move bic to blkcg, assuming that bfqd->queue is locked; the caller
				2999	* has to make sure that the reference to cgroup is valid across the call.
				3000	*
				3001	* NOTE: an alternative approach might have been to store the current
				3002	* cgroup in bfqq and getting a reference to it, reducing the lookup
				3003	* time here, at the price of slightly more complex code.
				3004	*/
				3005	static struct bfq_group __bfq_bic_change_cgroup(struct bfq_data bfqd,
				3006	struct bfq_io_cq *bic,
				3007	struct blkcg *blkcg)
				3008	{
				3009	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
				3010	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
				3011	struct bfq_group *bfqg;
				3012	struct bfq_entity *entity;
				3013
				3014	bfqg = bfq_find_set_group(bfqd, blkcg);
				3015
				3016	if (unlikely(!bfqg))
				3017	bfqg = bfqd->root_group;
				3018
				3019	if (async_bfqq) {
				3020	entity = &async_bfqq->entity;
				3021
				3022	if (entity->sched_data != &bfqg->sched_data) {
				3023	bic_set_bfqq(bic, NULL, 0);
				3024	bfq_log_bfqq(bfqd, async_bfqq,
				3025	"bic_change_group: %p %d",
				3026	async_bfqq,
				3027	async_bfqq->ref);
				3028	bfq_put_queue(async_bfqq);
				3029	}
				3030	}
				3031
				3032	if (sync_bfqq) {
				3033	entity = &sync_bfqq->entity;
				3034	if (entity->sched_data != &bfqg->sched_data)
				3035	bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
				3036	}
				3037
				3038	return bfqg;
				3039	}
				3040
				3041	static void bfq_bic_update_cgroup(struct bfq_io_cq bic, struct bio bio)
				3042	{
				3043	struct bfq_data *bfqd = bic_to_bfqd(bic);
				3044	struct bfq_group *bfqg = NULL;
				3045	uint64_t serial_nr;
				3046
				3047	rcu_read_lock();
				3048	serial_nr = bio_blkcg(bio)->css.serial_nr;
				3049
				3050	/*
				3051	* Check whether blkcg has changed. The condition may trigger
				3052	* spuriously on a newly created cic but there's no harm.
				3053	*/
				3054	if (unlikely(!bfqd) \|\| likely(bic->blkcg_serial_nr == serial_nr))
				3055	goto out;
				3056
				3057	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
				3058	bic->blkcg_serial_nr = serial_nr;
				3059	out:
				3060	rcu_read_unlock();
				3061	}
				3062
				3063	/**
				3064	* bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
				3065	* @st: the service tree being flushed.
				3066	*/
				3067	static void bfq_flush_idle_tree(struct bfq_service_tree *st)
				3068	{
				3069	struct bfq_entity *entity = st->first_idle;
				3070
				3071	for (; entity ; entity = st->first_idle)
				3072	__bfq_deactivate_entity(entity, false);
				3073	}
				3074
				3075	/**
				3076	* bfq_reparent_leaf_entity - move leaf entity to the root_group.
				3077	* @bfqd: the device data structure with the root group.
				3078	* @entity: the entity to move.
				3079	*/
				3080	static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
				3081	struct bfq_entity *entity)
				3082	{
				3083	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				3084
				3085	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
				3086	}
				3087
				3088	/**
				3089	* bfq_reparent_active_entities - move to the root group all active
				3090	* entities.
				3091	* @bfqd: the device data structure with the root group.
				3092	* @bfqg: the group to move from.
				3093	* @st: the service tree with the entities.
				3094	*
				3095	* Needs queue_lock to be taken and reference to be valid over the call.
				3096	*/
				3097	static void bfq_reparent_active_entities(struct bfq_data *bfqd,
				3098	struct bfq_group *bfqg,
				3099	struct bfq_service_tree *st)
				3100	{
				3101	struct rb_root *active = &st->active;
				3102	struct bfq_entity *entity = NULL;
				3103
				3104	if (!RB_EMPTY_ROOT(&st->active))
				3105	entity = bfq_entity_of(rb_first(active));
				3106
				3107	for (; entity ; entity = bfq_entity_of(rb_first(active)))
				3108	bfq_reparent_leaf_entity(bfqd, entity);
				3109
				3110	if (bfqg->sched_data.in_service_entity)
				3111	bfq_reparent_leaf_entity(bfqd,
				3112	bfqg->sched_data.in_service_entity);
				3113	}
				3114
/**
 * bfq_pd_offline - deactivate the entity associated with @pd,
 *                  and reparent its children entities.
 * @pd: descriptor of the policy going offline.
 *
 * Does nothing for a group without my_entity (the root group).
 * Otherwise, with bfqd->lock held and interrupts saved, it empties each
 * of the group's service trees, deactivates the group entity and puts
 * the group's async queues. After dropping the lock it transfers the
 * group's statistics via bfqg_stats_xfer_dead().
 *
 * NOTE(review): the original comment said blkio already holds the
 * queue_lock for us; the code takes bfqd->lock itself. Confirm which
 * locking rule applies.
 */
static void bfq_pd_offline(struct blkg_policy_data *pd)
{
	struct bfq_service_tree *st;
	struct bfq_group *bfqg = pd_to_bfqg(pd);
	struct bfq_data *bfqd = bfqg->bfqd;
	struct bfq_entity *entity = bfqg->my_entity;
	unsigned long flags;
	int i;

	if (!entity) /* root group */
		return;

	spin_lock_irqsave(&bfqd->lock, flags);
	/*
	 * Empty all service_trees belonging to this group before
	 * deactivating the group itself.
	 */
	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
		st = bfqg->sched_data.service_tree + i;

		/*
		 * The idle tree may still contain bfq_queues belonging
		 * to exited task because they never migrated to a different
		 * cgroup from the one being destroyed now. No one else
		 * can access them so it's safe to act without any lock.
		 */
		bfq_flush_idle_tree(st);

		/*
		 * It may happen that some queues are still active
		 * (busy) upon group destruction (if the corresponding
		 * processes have been forced to terminate). We move
		 * all the leaf entities corresponding to these queues
		 * to the root_group.
		 * Also, it may happen that the group has an entity
		 * in service, which is disconnected from the active
		 * tree: it must be moved, too.
		 * There is no need to put the sync queues, as the
		 * scheduler has taken no reference.
		 */
		bfq_reparent_active_entities(bfqd, bfqg, st);
	}

	/* Trees are empty now: take the group entity itself out. */
	__bfq_deactivate_entity(entity, false);
	bfq_put_async_queues(bfqd, bfqg);

	spin_unlock_irqrestore(&bfqd->lock, flags);
	/*
	 * @blkg is going offline and will be ignored by
	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
	 * that they don't get lost.  If IOs complete after this point, the
	 * stats for them will be lost.  Oh well...
	 */
	bfqg_stats_xfer_dead(bfqg);
}
				3178
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3179	static void bfq_end_wr_async(struct bfq_data *bfqd)
				3180	{
				3181	struct blkcg_gq *blkg;
				3182
				3183	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
				3184	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
				3185
				3186	bfq_end_wr_async_queues(bfqd, bfqg);
				3187	}
				3188	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
				3189	}
				3190
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3191	static int bfq_io_show_weight(struct seq_file sf, void v)
				3192	{
				3193	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
				3194	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
				3195	unsigned int val = 0;
				3196
				3197	if (bfqgd)
				3198	val = bfqgd->weight;
				3199
				3200	seq_printf(sf, "%u\n", val);
				3201
				3202	return 0;
				3203	}
				3204
				3205	static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
				3206	struct cftype *cftype,
				3207	u64 val)
				3208	{
				3209	struct blkcg *blkcg = css_to_blkcg(css);
				3210	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
				3211	struct blkcg_gq *blkg;
				3212	int ret = -ERANGE;
				3213
				3214	if (val < BFQ_MIN_WEIGHT \|\| val > BFQ_MAX_WEIGHT)
				3215	return ret;
				3216
				3217	ret = 0;
				3218	spin_lock_irq(&blkcg->lock);
				3219	bfqgd->weight = (unsigned short)val;
				3220	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
				3221	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
				3222
				3223	if (!bfqg)
				3224	continue;
				3225	/*
				3226	* Setting the prio_changed flag of the entity
				3227	* to 1 with new_weight == weight would re-set
				3228	* the value of the weight to its ioprio mapping.
				3229	* Set the flag only if necessary.
				3230	*/
				3231	if ((unsigned short)val != bfqg->entity.new_weight) {
				3232	bfqg->entity.new_weight = (unsigned short)val;
				3233	/*
				3234	* Make sure that the above new value has been
				3235	* stored in bfqg->entity.new_weight before
				3236	* setting the prio_changed flag. In fact,
				3237	* this flag may be read asynchronously (in
				3238	* critical sections protected by a different
				3239	* lock than that held here), and finding this
				3240	* flag set may cause the execution of the code
				3241	* for updating parameters whose value may
				3242	* depend also on bfqg->entity.new_weight (in
				3243	* __bfq_entity_update_weight_prio).
				3244	* This barrier makes sure that the new value
				3245	* of bfqg->entity.new_weight is correctly
				3246	* seen in that code.
				3247	*/
				3248	smp_wmb();
				3249	bfqg->entity.prio_changed = 1;
				3250	}
				3251	}
				3252	spin_unlock_irq(&blkcg->lock);
				3253
				3254	return ret;
				3255	}
				3256
				3257	static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
				3258	char *buf, size_t nbytes,
				3259	loff_t off)
				3260	{
				3261	u64 weight;
				3262	/* First unsigned long found in the file is used */
				3263	int ret = kstrtoull(strim(buf), 0, &weight);
				3264
				3265	if (ret)
				3266	return ret;
				3267
				3268	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
				3269	}
				3270
				3271	static int bfqg_print_stat(struct seq_file sf, void v)
				3272	{
				3273	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
				3274	&blkcg_policy_bfq, seq_cft(sf)->private, false);
				3275	return 0;
				3276	}
				3277
				3278	static int bfqg_print_rwstat(struct seq_file sf, void v)
				3279	{
				3280	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
				3281	&blkcg_policy_bfq, seq_cft(sf)->private, true);
				3282	return 0;
				3283	}
				3284
				3285	static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
				3286	struct blkg_policy_data *pd, int off)
				3287	{
				3288	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
				3289	&blkcg_policy_bfq, off);
				3290	return __blkg_prfill_u64(sf, pd, sum);
				3291	}
				3292
				3293	static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
				3294	struct blkg_policy_data *pd, int off)
				3295	{
				3296	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
				3297	&blkcg_policy_bfq,
				3298	off);
				3299	return __blkg_prfill_rwstat(sf, pd, &sum);
				3300	}
				3301
				3302	static int bfqg_print_stat_recursive(struct seq_file sf, void v)
				3303	{
				3304	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3305	bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
				3306	seq_cft(sf)->private, false);
				3307	return 0;
				3308	}
				3309
				3310	static int bfqg_print_rwstat_recursive(struct seq_file sf, void v)
				3311	{
				3312	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3313	bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
				3314	seq_cft(sf)->private, true);
				3315	return 0;
				3316	}
				3317
				3318	static u64 bfqg_prfill_sectors(struct seq_file sf, struct blkg_policy_data pd,
				3319	int off)
				3320	{
				3321	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
				3322
				3323	return __blkg_prfill_u64(sf, pd, sum >> 9);
				3324	}
				3325
				3326	static int bfqg_print_stat_sectors(struct seq_file sf, void v)
				3327	{
				3328	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3329	bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
				3330	return 0;
				3331	}
				3332
				3333	static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
				3334	struct blkg_policy_data *pd, int off)
				3335	{
				3336	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
				3337	offsetof(struct blkcg_gq, stat_bytes));
				3338	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
				3339	atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
				3340
				3341	return __blkg_prfill_u64(sf, pd, sum >> 9);
				3342	}
				3343
				3344	static int bfqg_print_stat_sectors_recursive(struct seq_file sf, void v)
				3345	{
				3346	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3347	bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
				3348	false);
				3349	return 0;
				3350	}
				3351
				3352	static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
				3353	struct blkg_policy_data *pd, int off)
				3354	{
				3355	struct bfq_group *bfqg = pd_to_bfqg(pd);
				3356	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
				3357	u64 v = 0;
				3358
				3359	if (samples) {
				3360	v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
				3361	v = div64_u64(v, samples);
				3362	}
				3363	__blkg_prfill_u64(sf, pd, v);
				3364	return 0;
				3365	}
				3366
				3367	/* print avg_queue_size */
				3368	static int bfqg_print_avg_queue_size(struct seq_file sf, void v)
				3369	{
				3370	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
				3371	bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
				3372	0, false);
				3373	return 0;
				3374	}
				3375
				3376	static struct bfq_group *
				3377	bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
				3378	{
				3379	int ret;
				3380
				3381	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
				3382	if (ret)
				3383	return NULL;
				3384
				3385	return blkg_to_bfqg(bfqd->queue->root_blkg);
				3386	}
				3387
/*
 * cgroup control files for the legacy hierarchy. Entries with an
 * offsetof() in ->private select a field of struct bfq_group's stats,
 * which the print handler reads through seq_cft(sf)->private. The
 * io_service_bytes/io_serviced entries instead pass the address of the
 * BFQ policy to the generic blkg printers.
 */
static struct cftype bfq_blkcg_legacy_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write_u64 = bfq_io_set_weight_legacy,
	},

	/* statistics, covers only the tasks in the bfqg */
	{
		.name = "bfq.time",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.sectors",
		.seq_show = bfqg_print_stat_sectors,
	},
	{
		.name = "bfq.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "bfq.io_serviced",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios,
	},
	{
		.name = "bfq.io_service_time",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_wait_time",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_merged",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat,
	},
	{
		.name = "bfq.io_queued",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat,
	},

	/* the same statistics which cover the bfqg and its descendants */
	{
		.name = "bfq.time_recursive",
		.private = offsetof(struct bfq_group, stats.time),
		.seq_show = bfqg_print_stat_recursive,
	},
	{
		.name = "bfq.sectors_recursive",
		.seq_show = bfqg_print_stat_sectors_recursive,
	},
	{
		.name = "bfq.io_service_bytes_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_bytes_recursive,
	},
	{
		.name = "bfq.io_serviced_recursive",
		.private = (unsigned long)&blkcg_policy_bfq,
		.seq_show = blkg_print_stat_ios_recursive,
	},
	{
		.name = "bfq.io_service_time_recursive",
		.private = offsetof(struct bfq_group, stats.service_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_wait_time_recursive",
		.private = offsetof(struct bfq_group, stats.wait_time),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_merged_recursive",
		.private = offsetof(struct bfq_group, stats.merged),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	{
		.name = "bfq.io_queued_recursive",
		.private = offsetof(struct bfq_group, stats.queued),
		.seq_show = bfqg_print_rwstat_recursive,
	},
	/* group-only statistics with no recursive counterpart in this table */
	{
		.name = "bfq.avg_queue_size",
		.seq_show = bfqg_print_avg_queue_size,
	},
	{
		.name = "bfq.group_wait_time",
		.private = offsetof(struct bfq_group, stats.group_wait_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.idle_time",
		.private = offsetof(struct bfq_group, stats.idle_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.empty_time",
		.private = offsetof(struct bfq_group, stats.empty_time),
		.seq_show = bfqg_print_stat,
	},
	{
		.name = "bfq.dequeue",
		.private = offsetof(struct bfq_group, stats.dequeue),
		.seq_show = bfqg_print_stat,
	},
	{ }	/* terminate */
};
				3503
/*
 * Second cftype table: exposes only "bfq.weight", written through the
 * string-based ->write handler rather than ->write_u64.
 * NOTE(review): presumably the table for the unified (v2) hierarchy;
 * confirm where it is registered.
 */
static struct cftype bfq_blkg_files[] = {
	{
		.name = "bfq.weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = bfq_io_show_weight,
		.write = bfq_io_set_weight,
	},
	{} /* terminate */
};
				3513
				3514	#else /* CONFIG_BFQ_GROUP_IOSCHED */
				3515
/*
 * Empty stand-ins for the group statistics helpers, used in this branch
 * of the CONFIG_BFQ_GROUP_IOSCHED conditional so that callers need no
 * #ifdef of their own. Every one of them does nothing.
 */
static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
			struct bfq_queue *bfqq, unsigned int op) { }
static inline void
bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
static inline void
bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
			uint64_t start_time, uint64_t io_start_time,
			unsigned int op) { }
static inline void
bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
				     struct bfq_group *curr_bfqg) { }
static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
				3534
				3535	static void bfq_bfqq_move(struct bfq_data bfqd, struct bfq_queue bfqq,
				3536	struct bfq_group *bfqg) {}
				3537
				3538	static void bfq_init_entity(struct bfq_entity *entity,
				3539	struct bfq_group *bfqg)
				3540	{
				3541	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
				3542
				3543	entity->weight = entity->new_weight;
				3544	entity->orig_weight = entity->new_weight;
				3545	if (bfqq) {
				3546	bfqq->ioprio = bfqq->new_ioprio;
				3547	bfqq->ioprio_class = bfqq->new_ioprio_class;
				3548	}
				3549	entity->sched_data = &bfqg->sched_data;
				3550	}
				3551
				3552	static void bfq_bic_update_cgroup(struct bfq_io_cq bic, struct bio bio) {}
				3553
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3554	static void bfq_end_wr_async(struct bfq_data *bfqd)
				3555	{
				3556	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
				3557	}
				3558
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3559	static struct bfq_group bfq_find_set_group(struct bfq_data bfqd,
				3560	struct blkcg *blkcg)
				3561	{
				3562	return bfqd->root_group;
				3563	}
				3564
				3565	static struct bfq_group bfqq_group(struct bfq_queue bfqq)
				3566	{
				3567	return bfqq->bfqd->root_group;
				3568	}
				3569
				3570	static struct bfq_group bfq_create_group_hierarchy(struct bfq_data bfqd,
				3571	int node)
				3572	{
				3573	struct bfq_group *bfqg;
				3574	int i;
				3575
				3576	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL \| __GFP_ZERO, node);
				3577	if (!bfqg)
				3578	return NULL;
				3579
				3580	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
				3581	bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
				3582
				3583	return bfqg;
				3584	}
				3585	#endif /* CONFIG_BFQ_GROUP_IOSCHED */
				3586
/* True when @bfqq's I/O priority class is IOPRIO_CLASS_IDLE. */
#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
/* True when @bfqq's I/O priority class is IOPRIO_CLASS_RT. */
#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)

/* True once more than 80 samples have been collected. */
#define bfq_sample_valid(samples)	((samples) > 80)
				3591
				3592	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3593	* Lifted from AS - choose which of rq1 and rq2 that is best served now.
				3594	* We choose the request that is closesr to the head right now. Distance
				3595	* behind the head is penalized and only allowed to a certain extent.
				3596	*/
				3597	static struct request bfq_choose_req(struct bfq_data bfqd,
				3598	struct request *rq1,
				3599	struct request *rq2,
				3600	sector_t last)
				3601	{
				3602	sector_t s1, s2, d1 = 0, d2 = 0;
				3603	unsigned long back_max;
				3604	#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
				3605	#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
				3606	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
				3607
				3608	if (!rq1 \|\| rq1 == rq2)
				3609	return rq2;
				3610	if (!rq2)
				3611	return rq1;
				3612
				3613	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
				3614	return rq1;
				3615	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
				3616	return rq2;
				3617	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
				3618	return rq1;
				3619	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
				3620	return rq2;
				3621
				3622	s1 = blk_rq_pos(rq1);
				3623	s2 = blk_rq_pos(rq2);
				3624
				3625	/*
				3626	* By definition, 1KiB is 2 sectors.
				3627	*/
				3628	back_max = bfqd->bfq_back_max * 2;
				3629
				3630	/*
				3631	* Strict one way elevator _except_ in the case where we allow
				3632	* short backward seeks which are biased as twice the cost of a
				3633	* similar forward seek.
				3634	*/
				3635	if (s1 >= last)
				3636	d1 = s1 - last;
				3637	else if (s1 + back_max >= last)
				3638	d1 = (last - s1) * bfqd->bfq_back_penalty;
				3639	else
				3640	wrap \|= BFQ_RQ1_WRAP;
				3641
				3642	if (s2 >= last)
				3643	d2 = s2 - last;
				3644	else if (s2 + back_max >= last)
				3645	d2 = (last - s2) * bfqd->bfq_back_penalty;
				3646	else
				3647	wrap \|= BFQ_RQ2_WRAP;
				3648
				3649	/* Found required data */
				3650
				3651	/*
				3652	* By doing switch() on the bit mask "wrap" we avoid having to
				3653	* check two variables for all permutations: --> faster!
				3654	*/
				3655	switch (wrap) {
				3656	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
				3657	if (d1 < d2)
				3658	return rq1;
				3659	else if (d2 < d1)
				3660	return rq2;
				3661
				3662	if (s1 >= s2)
				3663	return rq1;
				3664	else
				3665	return rq2;
				3666
				3667	case BFQ_RQ2_WRAP:
				3668	return rq1;
				3669	case BFQ_RQ1_WRAP:
				3670	return rq2;
				3671	case BFQ_RQ1_WRAP\|BFQ_RQ2_WRAP: /* both rqs wrapped */
				3672	default:
				3673	/*
				3674	* Since both rqs are wrapped,
				3675	* start with the one that's further behind head
				3676	* (--> only one back seek required),
				3677	* since back seek takes more time than forward.
				3678	*/
				3679	if (s1 <= s2)
				3680	return rq1;
				3681	else
				3682	return rq2;
				3683	}
				3684	}
				3685
				3686	/*
				3687	* Return expired entry, or NULL to just start from scratch in rbtree.
				3688	*/
				3689	static struct request bfq_check_fifo(struct bfq_queue bfqq,
				3690	struct request *last)
				3691	{
				3692	struct request *rq;
				3693
				3694	if (bfq_bfqq_fifo_expire(bfqq))
				3695	return NULL;
				3696
				3697	bfq_mark_bfqq_fifo_expire(bfqq);
				3698
				3699	rq = rq_entry_fifo(bfqq->fifo.next);
				3700
				3701	if (rq == last \|\| ktime_get_ns() < rq->fifo_time)
				3702	return NULL;
				3703
				3704	bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
				3705	return rq;
				3706	}
				3707
				3708	static struct request bfq_find_next_rq(struct bfq_data bfqd,
				3709	struct bfq_queue *bfqq,
				3710	struct request *last)
				3711	{
				3712	struct rb_node *rbnext = rb_next(&last->rb_node);
				3713	struct rb_node *rbprev = rb_prev(&last->rb_node);
				3714	struct request next, prev = NULL;
				3715
				3716	/* Follow expired path, else get first next available. */
				3717	next = bfq_check_fifo(bfqq, last);
				3718	if (next)
				3719	return next;
				3720
				3721	if (rbprev)
				3722	prev = rb_entry_rq(rbprev);
				3723
				3724	if (rbnext)
				3725	next = rb_entry_rq(rbnext);
				3726	else {
				3727	rbnext = rb_first(&bfqq->sort_list);
				3728	if (rbnext && rbnext != &last->rb_node)
				3729	next = rb_entry_rq(rbnext);
				3730	}
				3731
				3732	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
				3733	}
				3734
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	3735	/* see the definition of bfq_async_charge_factor for details */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3736	static unsigned long bfq_serv_to_charge(struct request *rq,
				3737	struct bfq_queue *bfqq)
				3738	{
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3739	if (bfq_bfqq_sync(bfqq) \|\| bfqq->wr_coeff > 1)
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	3740	return blk_rq_sectors(rq);
				3741
				3742	return blk_rq_sectors(rq) * bfq_async_charge_factor;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3743	}
				3744
				3745	/**
				3746	* bfq_updated_next_req - update the queue after a new next_rq selection.
				3747	* @bfqd: the device data the queue belongs to.
				3748	* @bfqq: the queue to update.
				3749	*
				3750	* If the first request of a queue changes we make sure that the queue
				3751	* has enough budget to serve at least its first request (if the
				3752	* request has grown). We do this because if the queue has not enough
				3753	* budget for its first request, it has to go through two dispatch
				3754	* rounds to actually get it dispatched.
				3755	*/
				3756	static void bfq_updated_next_req(struct bfq_data *bfqd,
				3757	struct bfq_queue *bfqq)
				3758	{
				3759	struct bfq_entity *entity = &bfqq->entity;
				3760	struct request *next_rq = bfqq->next_rq;
				3761	unsigned long new_budget;
				3762
				3763	if (!next_rq)
				3764	return;
				3765
				3766	if (bfqq == bfqd->in_service_queue)
				3767	/*
				3768	* In order not to break guarantees, budgets cannot be
				3769	* changed after an entity has been selected.
				3770	*/
				3771	return;
				3772
				3773	new_budget = max_t(unsigned long, bfqq->max_budget,
				3774	bfq_serv_to_charge(next_rq, bfqq));
				3775	if (entity->budget != new_budget) {
				3776	entity->budget = new_budget;
				3777	bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
				3778	new_budget);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	3779	bfq_requeue_bfqq(bfqd, bfqq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3780	}
				3781	}
				3782
				3783	static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
				3784	{
				3785	struct bfq_entity *entity = &bfqq->entity;
				3786
				3787	return entity->budget - entity->service;
				3788	}
				3789
				3790	/*
				3791	* If enough samples have been computed, return the current max budget
				3792	* stored in bfqd, which is dynamically updated according to the
				3793	* estimated disk peak rate; otherwise return the default max budget
				3794	*/
				3795	static int bfq_max_budget(struct bfq_data *bfqd)
				3796	{
				3797	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
				3798	return bfq_default_max_budget;
				3799	else
				3800	return bfqd->bfq_max_budget;
				3801	}
				3802
				3803	/*
				3804	* Return min budget, which is a fraction of the current or default
				3805	* max budget (trying with 1/32)
				3806	*/
				3807	static int bfq_min_budget(struct bfq_data *bfqd)
				3808	{
				3809	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
				3810	return bfq_default_max_budget / 32;
				3811	else
				3812	return bfqd->bfq_max_budget / 32;
				3813	}
				3814
				3815	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				3816	struct bfq_queue *bfqq,
				3817	bool compensate,
				3818	enum bfqq_expiration reason);
				3819
				3820	/*
				3821	* The next function, invoked after the input queue bfqq switches from
				3822	* idle to busy, updates the budget of bfqq. The function also tells
				3823	* whether the in-service queue should be expired, by returning
				3824	* true. The purpose of expiring the in-service queue is to give bfqq
				3825	* the chance to possibly preempt the in-service queue, and the reason
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3826	* for preempting the in-service queue is to achieve one of the two
				3827	* goals below.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3828	*
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3829	* 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
				3830	* expired because it has remained idle. In particular, bfqq may have
				3831	* expired for one of the following two reasons:
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3832	*
				3833	* - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
				3834	* and did not make it to issue a new request before its last
				3835	* request was served;
				3836	*
				3837	* - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
				3838	* a new request before the expiration of the idling-time.
				3839	*
				3840	* Even if bfqq has expired for one of the above reasons, the process
				3841	* associated with the queue may be however issuing requests greedily,
				3842	* and thus be sensitive to the bandwidth it receives (bfqq may have
				3843	* remained idle for other reasons: CPU high load, bfqq not enjoying
				3844	* idling, I/O throttling somewhere in the path from the process to
				3845	* the I/O scheduler, ...). But if, after every expiration for one of
				3846	* the above two reasons, bfqq has to wait for the service of at least
				3847	* one full budget of another queue before being served again, then
				3848	* bfqq is likely to get a much lower bandwidth or resource time than
				3849	* its reserved ones. To address this issue, two countermeasures need
				3850	* to be taken.
				3851	*
				3852	* First, the budget and the timestamps of bfqq need to be updated in
				3853	* a special way on bfqq reactivation: they need to be updated as if
				3854	* bfqq did not remain idle and did not expire. In fact, if they are
				3855	* computed as if bfqq expired and remained idle until reactivation,
				3856	* then the process associated with bfqq is treated as if, instead of
				3857	* being greedy, it stopped issuing requests when bfqq remained idle,
				3858	* and restarts issuing requests only on this reactivation. In other
				3859	* words, the scheduler does not help the process recover the "service
				3860	* hole" between bfqq expiration and reactivation. As a consequence,
				3861	* the process receives a lower bandwidth than its reserved one. In
				3862	* contrast, to recover this hole, the budget must be updated as if
				3863	* bfqq was not expired at all before this reactivation, i.e., it must
				3864	* be set to the value of the remaining budget when bfqq was
				3865	* expired. Along the same line, timestamps need to be assigned the
				3866	* value they had the last time bfqq was selected for service, i.e.,
				3867	* before last expiration. Thus timestamps need to be back-shifted
				3868	* with respect to their normal computation (see [1] for more details
				3869	* on this tricky aspect).
				3870	*
				3871	* Secondly, to allow the process to recover the hole, the in-service
				3872	* queue must be expired too, to give bfqq the chance to preempt it
				3873	* immediately. In fact, if bfqq has to wait for a full budget of the
				3874	* in-service queue to be completed, then it may become impossible to
				3875	* let the process recover the hole, even if the back-shifted
				3876	* timestamps of bfqq are lower than those of the in-service queue. If
				3877	* this happens for most or all of the holes, then the process may not
				3878	* receive its reserved bandwidth. In this respect, it is worth noting
				3879	* that, being the service of outstanding requests unpreemptible, a
				3880	* little fraction of the holes may however be unrecoverable, thereby
				3881	* causing a little loss of bandwidth.
				3882	*
				3883	* The last important point is detecting whether bfqq does need this
				3884	* bandwidth recovery. In this respect, the next function deems the
				3885	* process associated with bfqq greedy, and thus allows it to recover
				3886	* the hole, if: 1) the process is waiting for the arrival of a new
				3887	* request (which implies that bfqq expired for one of the above two
				3888	* reasons), and 2) such a request has arrived soon. The first
				3889	* condition is controlled through the flag non_blocking_wait_rq,
				3890	* while the second through the flag arrived_in_time. If both
				3891	* conditions hold, then the function computes the budget in the
				3892	* above-described special way, and signals that the in-service queue
				3893	* should be expired. Timestamp back-shifting is done later in
				3894	* __bfq_activate_entity.
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3895	*
				3896	* 2. Reduce latency. Even if timestamps are not backshifted to let
				3897	* the process associated with bfqq recover a service hole, bfqq may
				3898	* however happen to have, after being (re)activated, a lower finish
				3899	* timestamp than the in-service queue. That is, the next budget of
				3900	* bfqq may have to be completed before the one of the in-service
				3901	* queue. If this is the case, then preempting the in-service queue
				3902	* allows this goal to be achieved, apart from the unpreemptible,
				3903	* outstanding requests mentioned above.
				3904	*
				3905	* Unfortunately, regardless of which of the above two goals one wants
				3906	* to achieve, service trees need first to be updated to know whether
				3907	* the in-service queue must be preempted. To have service trees
				3908	* correctly updated, the in-service queue must be expired and
				3909	* rescheduled, and bfqq must be scheduled too. This is one of the
				3910	* most costly operations (in future versions, the scheduling
				3911	* mechanism may be re-designed in such a way to make it possible to
				3912	* know whether preemption is needed without needing to update service
				3913	* trees). In addition, queue preemptions almost always cause random
				3914	* I/O, and thus loss of throughput. Because of these facts, the next
				3915	* function adopts the following simple scheme to avoid both costly
				3916	* operations and too frequent preemptions: it requests the expiration
				3917	* of the in-service queue (unconditionally) only for queues that need
				3918	* to recover a hole, or that either are weight-raised or deserve to
				3919	* be weight-raised.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3920	*/
				3921	static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
				3922	struct bfq_queue *bfqq,
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3923	bool arrived_in_time,
				3924	bool wr_or_deserves_wr)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	3925	{
				3926	struct bfq_entity *entity = &bfqq->entity;
				3927
				3928	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
				3929	/*
				3930	* We do not clear the flag non_blocking_wait_rq here, as
				3931	* the latter is used in bfq_activate_bfqq to signal
				3932	* that timestamps need to be back-shifted (and is
				3933	* cleared right after).
				3934	*/
				3935
				3936	/*
				3937	* In next assignment we rely on that either
				3938	* entity->service or entity->budget are not updated
				3939	* on expiration if bfqq is empty (see
				3940	* __bfq_bfqq_recalc_budget). Thus both quantities
				3941	* remain unchanged after such an expiration, and the
				3942	* following statement therefore assigns to
				3943	* entity->budget the remaining budget on such an
				3944	* expiration. For clarity, entity->service is not
				3945	* updated on expiration in any case, and, in normal
				3946	* operation, is reset only when bfqq is selected for
				3947	* service (see bfq_get_next_queue).
				3948	*/
				3949	entity->budget = min_t(unsigned long,
				3950	bfq_bfqq_budget_left(bfqq),
				3951	bfqq->max_budget);
				3952
				3953	return true;
				3954	}
				3955
				3956	entity->budget = max_t(unsigned long, bfqq->max_budget,
				3957	bfq_serv_to_charge(bfqq->next_rq, bfqq));
				3958	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	3959	return wr_or_deserves_wr;
				3960	}
				3961
				3962	static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
				3963	{
				3964	u64 dur;
				3965
				3966	if (bfqd->bfq_wr_max_time > 0)
				3967	return bfqd->bfq_wr_max_time;
				3968
				3969	dur = bfqd->RT_prod;
				3970	do_div(dur, bfqd->peak_rate);
				3971
				3972	/*
				3973	* Limit duration between 3 and 13 seconds. Tests show that
				3974	* higher values than 13 seconds often yield the opposite of
				3975	* the desired result, i.e., worsen responsiveness by letting
				3976	* non-interactive and non-soft-real-time applications
				3977	* preserve weight raising for a too long time interval.
				3978	*
				3979	* On the other end, lower values than 3 seconds make it
				3980	* difficult for most interactive tasks to complete their jobs
				3981	* before weight-raising finishes.
				3982	*/
				3983	if (dur > msecs_to_jiffies(13000))
				3984	dur = msecs_to_jiffies(13000);
				3985	else if (dur < msecs_to_jiffies(3000))
				3986	dur = msecs_to_jiffies(3000);
				3987
				3988	return dur;
				3989	}
				3990
				3991	static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
				3992	struct bfq_queue *bfqq,
				3993	unsigned int old_wr_coeff,
				3994	bool wr_or_deserves_wr,
				3995	bool interactive)
				3996	{
				3997	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
				3998	/* start a weight-raising period */
				3999	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				4000	/* update wr duration */
				4001	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				4002
				4003	/*
				4004	* If needed, further reduce budget to make sure it is
				4005	* close to bfqq's backlog, so as to reduce the
				4006	* scheduling-error component due to a too large
				4007	* budget. Do not care about throughput consequences,
				4008	* but only about latency. Finally, do not assign a
				4009	* too small budget either, to avoid increasing
				4010	* latency by causing too frequent expirations.
				4011	*/
				4012	bfqq->entity.budget = min_t(unsigned long,
				4013	bfqq->entity.budget,
				4014	2 * bfq_min_budget(bfqd));
				4015	} else if (old_wr_coeff > 1) {
				4016	/* update wr duration */
				4017	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				4018	}
				4019	}
				4020
				4021	static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
				4022	struct bfq_queue *bfqq)
				4023	{
				4024	return bfqq->dispatched == 0 &&
				4025	time_is_before_jiffies(
				4026	bfqq->budget_timeout +
				4027	bfqd->bfq_wr_min_idle_time);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4028	}
				4029
				4030	static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
				4031	struct bfq_queue *bfqq,
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4032	int old_wr_coeff,
				4033	struct request *rq,
				4034	bool *interactive)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4035	{
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4036	bool wr_or_deserves_wr, bfqq_wants_to_preempt,
				4037	idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4038	/*
				4039	* See the comments on
				4040	* bfq_bfqq_update_budg_for_activation for
				4041	* details on the usage of the next variable.
				4042	*/
				4043	arrived_in_time = ktime_get_ns() <=
				4044	bfqq->ttime.last_end_request +
				4045	bfqd->bfq_slice_idle * 3;
				4046
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4047	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
				4048
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4049	/*
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4050	* bfqq deserves to be weight-raised if:
				4051	* - it is sync,
				4052	* - it has been idle for enough time.
				4053	*/
				4054	*interactive = idle_for_long_time;
				4055	wr_or_deserves_wr = bfqd->low_latency &&
				4056	(bfqq->wr_coeff > 1 \|\|
				4057	(bfq_bfqq_sync(bfqq) && *interactive));
				4058
				4059	/*
				4060	* Using the last flag, update budget and check whether bfqq
				4061	* may want to preempt the in-service queue.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4062	*/
				4063	bfqq_wants_to_preempt =
				4064	bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4065	arrived_in_time,
				4066	wr_or_deserves_wr);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4067
				4068	if (!bfq_bfqq_IO_bound(bfqq)) {
				4069	if (arrived_in_time) {
				4070	bfqq->requests_within_timer++;
				4071	if (bfqq->requests_within_timer >=
				4072	bfqd->bfq_requests_within_timer)
				4073	bfq_mark_bfqq_IO_bound(bfqq);
				4074	} else
				4075	bfqq->requests_within_timer = 0;
				4076	}
				4077
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4078	if (bfqd->low_latency) {
				4079	bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
				4080	old_wr_coeff,
				4081	wr_or_deserves_wr,
				4082	*interactive);
				4083
				4084	if (old_wr_coeff != bfqq->wr_coeff)
				4085	bfqq->entity.prio_changed = 1;
				4086	}
				4087
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4088	bfq_add_bfqq_busy(bfqd, bfqq);
				4089
				4090	/*
				4091	* Expire in-service queue only if preemption may be needed
				4092	* for guarantees. In this respect, the function
				4093	* next_queue_may_preempt just checks a simple, necessary
				4094	* condition, and not a sufficient condition based on
				4095	* timestamps. In fact, for the latter condition to be
				4096	* evaluated, timestamps would need first to be updated, and
				4097	* this operation is quite costly (see the comments on the
				4098	* function bfq_bfqq_update_budg_for_activation).
				4099	*/
				4100	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4101	bfqd->in_service_queue->wr_coeff == 1 &&
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4102	next_queue_may_preempt(bfqd))
				4103	bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
				4104	false, BFQQE_PREEMPTED);
				4105	}
				4106
				4107	static void bfq_add_request(struct request *rq)
				4108	{
				4109	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				4110	struct bfq_data *bfqd = bfqq->bfqd;
				4111	struct request next_rq, prev;
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4112	unsigned int old_wr_coeff = bfqq->wr_coeff;
				4113	bool interactive = false;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4114
				4115	bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
				4116	bfqq->queued[rq_is_sync(rq)]++;
				4117	bfqd->queued++;
				4118
				4119	elv_rb_add(&bfqq->sort_list, rq);
				4120
				4121	/*
				4122	* Check if this request is a better next-serve candidate.
				4123	*/
				4124	prev = bfqq->next_rq;
				4125	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
				4126	bfqq->next_rq = next_rq;
				4127
				4128	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4129	bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
				4130	rq, &interactive);
				4131	else {
				4132	if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
				4133	time_is_before_jiffies(
				4134	bfqq->last_wr_start_finish +
				4135	bfqd->bfq_wr_min_inter_arr_async)) {
				4136	bfqq->wr_coeff = bfqd->bfq_wr_coeff;
				4137	bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
				4138
				4139	bfqq->entity.prio_changed = 1;
				4140	}
				4141	if (prev != bfqq->next_rq)
				4142	bfq_updated_next_req(bfqd, bfqq);
				4143	}
				4144
				4145	/*
				4146	* Assign jiffies to last_wr_start_finish in the following
				4147	* cases:
				4148	*
				4149	* . if bfqq is not going to be weight-raised, because, for
				4150	* non weight-raised queues, last_wr_start_finish stores the
				4151	* arrival time of the last request; as of now, this piece
				4152	* of information is used only for deciding whether to
				4153	* weight-raise async queues
				4154	*
				4155	* . if bfqq is not weight-raised, because, if bfqq is now
				4156	* switching to weight-raised, then last_wr_start_finish
				4157	* stores the time when weight-raising starts
				4158	*
				4159	* . if bfqq is interactive, because, regardless of whether
				4160	* bfqq is currently weight-raised, the weight-raising
				4161	* period must start or restart (this case is considered
				4162	* separately because it is not detected by the above
				4163	* conditions, if bfqq is already weight-raised)
				4164	*/
				4165	if (bfqd->low_latency &&
				4166	(old_wr_coeff == 1 \|\| bfqq->wr_coeff == 1 \|\| interactive))
				4167	bfqq->last_wr_start_finish = jiffies;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4168	}
				4169
				4170	static struct request bfq_find_rq_fmerge(struct bfq_data bfqd,
				4171	struct bio *bio,
				4172	struct request_queue *q)
				4173	{
				4174	struct bfq_queue *bfqq = bfqd->bio_bfqq;
				4175
				4176
				4177	if (bfqq)
				4178	return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
				4179
				4180	return NULL;
				4181	}
				4182
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4183	static sector_t get_sdist(sector_t last_pos, struct request *rq)
				4184	{
				4185	if (last_pos)
				4186	return abs(blk_rq_pos(rq) - last_pos);
				4187
				4188	return 0;
				4189	}
				4190
#if 0 /* Still not clear if we can do without next two functions */
/* Account for one more request in the driver. */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver++;
}

/* Account for one less request in the driver. */
static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct bfq_data *bfqd = q->elevator->elevator_data;

	bfqd->rq_in_driver--;
}
#endif
				4206
				4207	static void bfq_remove_request(struct request_queue *q,
				4208	struct request *rq)
				4209	{
				4210	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				4211	struct bfq_data *bfqd = bfqq->bfqd;
				4212	const int sync = rq_is_sync(rq);
				4213
				4214	if (bfqq->next_rq == rq) {
				4215	bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
				4216	bfq_updated_next_req(bfqd, bfqq);
				4217	}
				4218
				4219	if (rq->queuelist.prev != &rq->queuelist)
				4220	list_del_init(&rq->queuelist);
				4221	bfqq->queued[sync]--;
				4222	bfqd->queued--;
				4223	elv_rb_del(&bfqq->sort_list, rq);
				4224
				4225	elv_rqhash_del(q, rq);
				4226	if (q->last_merge == rq)
				4227	q->last_merge = NULL;
				4228
				4229	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
				4230	bfqq->next_rq = NULL;
				4231
				4232	if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4233	bfq_del_bfqq_busy(bfqd, bfqq, false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4234	/*
				4235	* bfqq emptied. In normal operation, when
				4236	* bfqq is empty, bfqq->entity.service and
				4237	* bfqq->entity.budget must contain,
				4238	* respectively, the service received and the
				4239	* budget used last time bfqq emptied. These
				4240	* facts do not hold in this case, as at least
				4241	* this last removal occurred while bfqq is
				4242	* not in service. To avoid inconsistencies,
				4243	* reset both bfqq->entity.service and
				4244	* bfqq->entity.budget, if bfqq has still a
				4245	* process that may issue I/O requests to it.
				4246	*/
				4247	bfqq->entity.budget = bfqq->entity.service = 0;
				4248	}
				4249	}
				4250
				4251	if (rq->cmd_flags & REQ_META)
				4252	bfqq->meta_pending--;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4253
				4254	bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4255	}
				4256
				4257	static bool bfq_bio_merge(struct blk_mq_hw_ctx hctx, struct bio bio)
				4258	{
				4259	struct request_queue *q = hctx->queue;
				4260	struct bfq_data *bfqd = q->elevator->elevator_data;
				4261	struct request *free = NULL;
				4262	/*
				4263	* bfq_bic_lookup grabs the queue_lock: invoke it now and
				4264	* store its return value for later use, to avoid nesting
				4265	* queue_lock inside the bfqd->lock. We assume that the bic
				4266	* returned by bfq_bic_lookup does not go away before
				4267	* bfqd->lock is taken.
				4268	*/
				4269	struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
				4270	bool ret;
				4271
				4272	spin_lock_irq(&bfqd->lock);
				4273
				4274	if (bic)
				4275	bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
				4276	else
				4277	bfqd->bio_bfqq = NULL;
				4278	bfqd->bio_bic = bic;
				4279
				4280	ret = blk_mq_sched_try_merge(q, bio, &free);
				4281
				4282	if (free)
				4283	blk_mq_free_request(free);
				4284	spin_unlock_irq(&bfqd->lock);
				4285
				4286	return ret;
				4287	}
				4288
				4289	static int bfq_request_merge(struct request_queue q, struct request *req,
				4290	struct bio *bio)
				4291	{
				4292	struct bfq_data *bfqd = q->elevator->elevator_data;
				4293	struct request *__rq;
				4294
				4295	__rq = bfq_find_rq_fmerge(bfqd, bio, q);
				4296	if (__rq && elv_bio_merge_ok(__rq, bio)) {
				4297	*req = __rq;
				4298	return ELEVATOR_FRONT_MERGE;
				4299	}
				4300
				4301	return ELEVATOR_NO_MERGE;
				4302	}
				4303
				4304	static void bfq_request_merged(struct request_queue q, struct request req,
				4305	enum elv_merge type)
				4306	{
				4307	if (type == ELEVATOR_FRONT_MERGE &&
				4308	rb_prev(&req->rb_node) &&
				4309	blk_rq_pos(req) <
				4310	blk_rq_pos(container_of(rb_prev(&req->rb_node),
				4311	struct request, rb_node))) {
				4312	struct bfq_queue *bfqq = RQ_BFQQ(req);
				4313	struct bfq_data *bfqd = bfqq->bfqd;
				4314	struct request prev, next_rq;
				4315
				4316	/* Reposition request in its sort_list */
				4317	elv_rb_del(&bfqq->sort_list, req);
				4318	elv_rb_add(&bfqq->sort_list, req);
				4319
				4320	/* Choose next request to be served for bfqq */
				4321	prev = bfqq->next_rq;
				4322	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
				4323	bfqd->last_position);
				4324	bfqq->next_rq = next_rq;
				4325	/*
				4326	* If next_rq changes, update the queue's budget to fit
				4327	* the new request.
				4328	*/
				4329	if (prev != bfqq->next_rq)
				4330	bfq_updated_next_req(bfqd, bfqq);
				4331	}
				4332	}
				4333
				4334	static void bfq_requests_merged(struct request_queue q, struct request rq,
				4335	struct request *next)
				4336	{
				4337	struct bfq_queue bfqq = RQ_BFQQ(rq), next_bfqq = RQ_BFQQ(next);
				4338
				4339	if (!RB_EMPTY_NODE(&rq->rb_node))
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4340	goto end;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4341	spin_lock_irq(&bfqq->bfqd->lock);
				4342
				4343	/*
				4344	* If next and rq belong to the same bfq_queue and next is older
				4345	* than rq, then reposition rq in the fifo (by substituting next
				4346	* with rq). Otherwise, if next and rq belong to different
				4347	* bfq_queues, never reposition rq: in fact, we would have to
				4348	* reposition it with respect to next's position in its own fifo,
				4349	* which would most certainly be too expensive with respect to
				4350	* the benefits.
				4351	*/
				4352	if (bfqq == next_bfqq &&
				4353	!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
				4354	next->fifo_time < rq->fifo_time) {
				4355	list_del_init(&rq->queuelist);
				4356	list_replace_init(&next->queuelist, &rq->queuelist);
				4357	rq->fifo_time = next->fifo_time;
				4358	}
				4359
				4360	if (bfqq->next_rq == next)
				4361	bfqq->next_rq = rq;
				4362
				4363	bfq_remove_request(q, next);
				4364
				4365	spin_unlock_irq(&bfqq->bfqd->lock);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4366	end:
				4367	bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4368	}
				4369
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4370	/* Must be called with bfqq != NULL */
				4371	static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
				4372	{
				4373	bfqq->wr_coeff = 1;
				4374	bfqq->wr_cur_max_time = 0;
				4375	/*
				4376	* Trigger a weight change on the next invocation of
				4377	* __bfq_entity_update_weight_prio.
				4378	*/
				4379	bfqq->entity.prio_changed = 1;
				4380	}
				4381
				4382	static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
				4383	struct bfq_group *bfqg)
				4384	{
				4385	int i, j;
				4386
				4387	for (i = 0; i < 2; i++)
				4388	for (j = 0; j < IOPRIO_BE_NR; j++)
				4389	if (bfqg->async_bfqq[i][j])
				4390	bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
				4391	if (bfqg->async_idle_bfqq)
				4392	bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
				4393	}
				4394
				4395	static void bfq_end_wr(struct bfq_data *bfqd)
				4396	{
				4397	struct bfq_queue *bfqq;
				4398
				4399	spin_lock_irq(&bfqd->lock);
				4400
				4401	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
				4402	bfq_bfqq_end_wr(bfqq);
				4403	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
				4404	bfq_bfqq_end_wr(bfqq);
				4405	bfq_end_wr_async(bfqd);
				4406
				4407	spin_unlock_irq(&bfqd->lock);
				4408	}
				4409
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4410	static bool bfq_allow_bio_merge(struct request_queue q, struct request rq,
				4411	struct bio *bio)
				4412	{
				4413	struct bfq_data *bfqd = q->elevator->elevator_data;
				4414	bool is_sync = op_is_sync(bio->bi_opf);
				4415	struct bfq_queue *bfqq = bfqd->bio_bfqq;
				4416
				4417	/*
				4418	* Disallow merge of a sync bio into an async request.
				4419	*/
				4420	if (is_sync && !rq_is_sync(rq))
				4421	return false;
				4422
				4423	/*
				4424	* Lookup the bfqq that this bio will be queued with. Allow
				4425	* merge only if rq is queued there.
				4426	*/
				4427	if (!bfqq)
				4428	return false;
				4429
				4430	return bfqq == RQ_BFQQ(rq);
				4431	}
				4432
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4433	/*
				4434	* Set the maximum time for the in-service queue to consume its
				4435	* budget. This prevents seeky processes from lowering the throughput.
				4436	* In practice, a time-slice service scheme is used with seeky
				4437	* processes.
				4438	*/
				4439	static void bfq_set_budget_timeout(struct bfq_data *bfqd,
				4440	struct bfq_queue *bfqq)
				4441	{
				4442	bfqd->last_budget_start = ktime_get();
				4443
				4444	bfqq->budget_timeout = jiffies +
				4445	bfqd->bfq_timeout *
				4446	(bfqq->entity.weight / bfqq->entity.orig_weight);
				4447	}
				4448
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4449	static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
				4450	struct bfq_queue *bfqq)
				4451	{
				4452	if (bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4453	bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4454	bfq_clear_bfqq_fifo_expire(bfqq);
				4455
				4456	bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
				4457
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4458	bfq_set_budget_timeout(bfqd, bfqq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4459	bfq_log_bfqq(bfqd, bfqq,
				4460	"set_in_service_queue, cur-budget = %d",
				4461	bfqq->entity.budget);
				4462	}
				4463
				4464	bfqd->in_service_queue = bfqq;
				4465	}
				4466
				4467	/*
				4468	* Get and set a new queue for service.
				4469	*/
				4470	static struct bfq_queue bfq_set_in_service_queue(struct bfq_data bfqd)
				4471	{
				4472	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
				4473
				4474	__bfq_set_in_service_queue(bfqd, bfqq);
				4475	return bfqq;
				4476	}
				4477
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4478	static void bfq_arm_slice_timer(struct bfq_data *bfqd)
				4479	{
				4480	struct bfq_queue *bfqq = bfqd->in_service_queue;
				4481	struct bfq_io_cq *bic;
				4482	u32 sl;
				4483
				4484	/* Processes have exited, don't wait. */
				4485	bic = bfqd->in_service_bic;
				4486	if (!bic \|\| atomic_read(&bic->icq.ioc->active_ref) == 0)
				4487	return;
				4488
				4489	bfq_mark_bfqq_wait_request(bfqq);
				4490
				4491	/*
				4492	* We don't want to idle for seeks, but we do want to allow
				4493	* fair distribution of slice time for a process doing back-to-back
				4494	* seeks. So allow a little bit of time for him to submit a new rq.
				4495	*/
				4496	sl = bfqd->bfq_slice_idle;
				4497	/*
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4498	* Unless the queue is being weight-raised, grant only minimum
				4499	* idle time if the queue is seeky. A long idling is preserved
				4500	* for a weight-raised queue, because it is needed for
				4501	* guaranteeing to the queue its reserved share of the
				4502	* throughput.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4503	*/
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4504	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4505	sl = min_t(u64, sl, BFQ_MIN_TT);
				4506
				4507	bfqd->last_idling_start = ktime_get();
				4508	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
				4509	HRTIMER_MODE_REL);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4510	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4511	}
				4512
				4513	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4514	* In autotuning mode, max_budget is dynamically recomputed as the
				4515	* amount of sectors transferred in timeout at the estimated peak
				4516	* rate. This enables BFQ to utilize a full timeslice with a full
				4517	* budget, even if the in-service queue is served at peak rate. And
				4518	* this maximises throughput with sequential workloads.
				4519	*/
				4520	static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
				4521	{
				4522	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
				4523	jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
				4524	}
				4525
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4526	/*
				4527	* Update parameters related to throughput and responsiveness, as a
				4528	* function of the estimated peak rate. See comments on
				4529	* bfq_calc_max_budget(), and on T_slow and T_fast arrays.
				4530	*/
				4531	static void update_thr_responsiveness_params(struct bfq_data *bfqd)
				4532	{
				4533	int dev_type = blk_queue_nonrot(bfqd->queue);
				4534
				4535	if (bfqd->bfq_user_max_budget == 0)
				4536	bfqd->bfq_max_budget =
				4537	bfq_calc_max_budget(bfqd);
				4538
				4539	if (bfqd->device_speed == BFQ_BFQD_FAST &&
				4540	bfqd->peak_rate < device_speed_thresh[dev_type]) {
				4541	bfqd->device_speed = BFQ_BFQD_SLOW;
				4542	bfqd->RT_prod = R_slow[dev_type] *
				4543	T_slow[dev_type];
				4544	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
				4545	bfqd->peak_rate > device_speed_thresh[dev_type]) {
				4546	bfqd->device_speed = BFQ_BFQD_FAST;
				4547	bfqd->RT_prod = R_fast[dev_type] *
				4548	T_fast[dev_type];
				4549	}
				4550
				4551	bfq_log(bfqd,
				4552	"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
				4553	dev_type == 0 ? "ROT" : "NONROT",
				4554	bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
				4555	bfqd->device_speed == BFQ_BFQD_FAST ?
				4556	(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
				4557	(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
				4558	(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
				4559	BFQ_RATE_SHIFT);
				4560	}
				4561
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4562	static void bfq_reset_rate_computation(struct bfq_data *bfqd,
				4563	struct request *rq)
				4564	{
				4565	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
				4566	bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
				4567	bfqd->peak_rate_samples = 1;
				4568	bfqd->sequential_samples = 0;
				4569	bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
				4570	blk_rq_sectors(rq);
				4571	} else /* no new rq dispatched, just reset the number of samples */
				4572	bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
				4573
				4574	bfq_log(bfqd,
				4575	"reset_rate_computation at end, sample %u/%u tot_sects %llu",
				4576	bfqd->peak_rate_samples, bfqd->sequential_samples,
				4577	bfqd->tot_sectors_dispatched);
				4578	}
				4579
				4580	static void bfq_update_rate_reset(struct bfq_data bfqd, struct request rq)
				4581	{
				4582	u32 rate, weight, divisor;
				4583
				4584	/*
				4585	* For the convergence property to hold (see comments on
				4586	* bfq_update_peak_rate()) and for the assessment to be
				4587	* reliable, a minimum number of samples must be present, and
				4588	* a minimum amount of time must have elapsed. If not so, do
				4589	* not compute new rate. Just reset parameters, to get ready
				4590	* for a new evaluation attempt.
				4591	*/
				4592	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES \|\|
				4593	bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
				4594	goto reset_computation;
				4595
				4596	/*
				4597	* If a new request completion has occurred after last
				4598	* dispatch, then, to approximate the rate at which requests
				4599	* have been served by the device, it is more precise to
				4600	* extend the observation interval to the last completion.
				4601	*/
				4602	bfqd->delta_from_first =
				4603	max_t(u64, bfqd->delta_from_first,
				4604	bfqd->last_completion - bfqd->first_dispatch);
				4605
				4606	/*
				4607	* Rate computed in sects/usec, and not sects/nsec, for
				4608	* precision issues.
				4609	*/
				4610	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
				4611	div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
				4612
				4613	/*
				4614	* Peak rate not updated if:
				4615	* - the percentage of sequential dispatches is below 3/4 of the
				4616	* total, and rate is below the current estimated peak rate
				4617	* - rate is unreasonably high (> 20M sectors/sec)
				4618	*/
				4619	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
				4620	rate <= bfqd->peak_rate) \|\|
				4621	rate > 20<<BFQ_RATE_SHIFT)
				4622	goto reset_computation;
				4623
				4624	/*
				4625	* We have to update the peak rate, at last! To this purpose,
				4626	* we use a low-pass filter. We compute the smoothing constant
				4627	* of the filter as a function of the 'weight' of the new
				4628	* measured rate.
				4629	*
				4630	* As can be seen in next formulas, we define this weight as a
				4631	* quantity proportional to how sequential the workload is,
				4632	* and to how long the observation time interval is.
				4633	*
				4634	* The weight runs from 0 to 8. The maximum value of the
				4635	* weight, 8, yields the minimum value for the smoothing
				4636	* constant. At this minimum value for the smoothing constant,
				4637	* the measured rate contributes for half of the next value of
				4638	* the estimated peak rate.
				4639	*
				4640	* So, the first step is to compute the weight as a function
				4641	* of how sequential the workload is. Note that the weight
				4642	* cannot reach 9, because bfqd->sequential_samples cannot
				4643	* become equal to bfqd->peak_rate_samples, which, in its
				4644	* turn, holds true because bfqd->sequential_samples is not
				4645	* incremented for the first sample.
				4646	*/
				4647	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
				4648
				4649	/*
				4650	* Second step: further refine the weight as a function of the
				4651	* duration of the observation interval.
				4652	*/
				4653	weight = min_t(u32, 8,
				4654	div_u64(weight * bfqd->delta_from_first,
				4655	BFQ_RATE_REF_INTERVAL));
				4656
				4657	/*
				4658	* Divisor ranging from 10, for minimum weight, to 2, for
				4659	* maximum weight.
				4660	*/
				4661	divisor = 10 - weight;
				4662
				4663	/*
				4664	* Finally, update peak rate:
				4665	*
				4666	* peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
				4667	*/
				4668	bfqd->peak_rate *= divisor-1;
				4669	bfqd->peak_rate /= divisor;
				4670	rate /= divisor; /* smoothing constant alpha = 1/divisor */
				4671
				4672	bfqd->peak_rate += rate;
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4673	update_thr_responsiveness_params(bfqd);
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4674
				4675	reset_computation:
				4676	bfq_reset_rate_computation(bfqd, rq);
				4677	}
				4678
				4679	/*
				4680	* Update the read/write peak rate (the main quantity used for
				4681	* auto-tuning, see update_thr_responsiveness_params()).
				4682	*
				4683	* It is not trivial to estimate the peak rate (correctly): because of
				4684	* the presence of sw and hw queues between the scheduler and the
				4685	* device components that finally serve I/O requests, it is hard to
				4686	* say exactly when a given dispatched request is served inside the
				4687	* device, and for how long. As a consequence, it is hard to know
				4688	* precisely at what rate a given set of requests is actually served
				4689	* by the device.
				4690	*
				4691	* On the opposite end, the dispatch time of any request is trivially
				4692	* available, and, from this piece of information, the "dispatch rate"
				4693	* of requests can be immediately computed. So, the idea in the next
				4694	* function is to use what is known, namely request dispatch times
				4695	* (plus, when useful, request completion times), to estimate what is
				4696	* unknown, namely in-device request service rate.
				4697	*
				4698	* The main issue is that, because of the above facts, the rate at
				4699	* which a certain set of requests is dispatched over a certain time
				4700	* interval can vary greatly with respect to the rate at which the
				4701	* same requests are then served. But, since the size of any
				4702	* intermediate queue is limited, and the service scheme is lossless
				4703	* (no request is silently dropped), the following obvious convergence
				4704	* property holds: the number of requests dispatched MUST become
				4705	* closer and closer to the number of requests completed as the
				4706	* observation interval grows. This is the key property used in
				4707	* the next function to estimate the peak service rate as a function
				4708	* of the observed dispatch rate. The function assumes to be invoked
				4709	* on every request dispatch.
				4710	*/
				4711	static void bfq_update_peak_rate(struct bfq_data bfqd, struct request rq)
				4712	{
				4713	u64 now_ns = ktime_get_ns();
				4714
				4715	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
				4716	bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
				4717	bfqd->peak_rate_samples);
				4718	bfq_reset_rate_computation(bfqd, rq);
				4719	goto update_last_values; /* will add one sample */
				4720	}
				4721
				4722	/*
				4723	* Device idle for very long: the observation interval lasting
				4724	* up to this dispatch cannot be a valid observation interval
				4725	* for computing a new peak rate (similarly to the late-
				4726	* completion event in bfq_completed_request()). Go to
				4727	* update_rate_and_reset to have the following three steps
				4728	* taken:
				4729	* - close the observation interval at the last (previous)
				4730	* request dispatch or completion
				4731	* - compute rate, if possible, for that observation interval
				4732	* - start a new observation interval with this dispatch
				4733	*/
				4734	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
				4735	bfqd->rq_in_driver == 0)
				4736	goto update_rate_and_reset;
				4737
				4738	/* Update sampling information */
				4739	bfqd->peak_rate_samples++;
				4740
				4741	if ((bfqd->rq_in_driver > 0 \|\|
				4742	now_ns - bfqd->last_completion < BFQ_MIN_TT)
				4743	&& get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
				4744	bfqd->sequential_samples++;
				4745
				4746	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
				4747
				4748	/* Reset max observed rq size every 32 dispatches */
				4749	if (likely(bfqd->peak_rate_samples % 32))
				4750	bfqd->last_rq_max_size =
				4751	max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
				4752	else
				4753	bfqd->last_rq_max_size = blk_rq_sectors(rq);
				4754
				4755	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
				4756
				4757	/* Target observation interval not yet reached, go on sampling */
				4758	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
				4759	goto update_last_values;
				4760
				4761	update_rate_and_reset:
				4762	bfq_update_rate_reset(bfqd, rq);
				4763	update_last_values:
				4764	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
				4765	bfqd->last_dispatch = now_ns;
				4766	}
				4767
				4768	/*
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4769	* Remove request from internal lists.
				4770	*/
				4771	static void bfq_dispatch_remove(struct request_queue q, struct request rq)
				4772	{
				4773	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				4774
				4775	/*
				4776	* For consistency, the next instruction should have been
				4777	* executed after removing the request from the queue and
				4778	* dispatching it. We execute instead this instruction before
				4779	* bfq_remove_request() (and hence introduce a temporary
				4780	* inconsistency), for efficiency. In fact, should this
				4781	* dispatch occur for a non in-service bfqq, this anticipated
				4782	* increment prevents two counters related to bfqq->dispatched
				4783	* from risking to be, first, uselessly decremented, and then
				4784	* incremented again when the (new) value of bfqq->dispatched
				4785	* happens to be taken into account.
				4786	*/
				4787	bfqq->dispatched++;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4788	bfq_update_peak_rate(q->elevator->elevator_data, rq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4789
				4790	bfq_remove_request(q, rq);
				4791	}
				4792
				4793	static void __bfq_bfqq_expire(struct bfq_data bfqd, struct bfq_queue bfqq)
				4794	{
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4795	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
				4796	if (bfqq->dispatched == 0)
				4797	/*
				4798	* Overloading budget_timeout field to store
				4799	* the time at which the queue remains with no
				4800	* backlog and no outstanding request; used by
				4801	* the weight-raising mechanism.
				4802	*/
				4803	bfqq->budget_timeout = jiffies;
				4804
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4805	bfq_del_bfqq_busy(bfqd, bfqq, true);
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4806	} else
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	4807	bfq_requeue_bfqq(bfqd, bfqq);
				4808
				4809	/*
				4810	* All in-service entities must have been properly deactivated
				4811	* or requeued before executing the next function, which
				4812	* resets all in-service entites as no more in service.
				4813	*/
				4814	__bfq_bfqd_reset_in_service(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4815	}
				4816
				4817	/**
				4818	* __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
				4819	* @bfqd: device data.
				4820	* @bfqq: queue to update.
				4821	* @reason: reason for expiration.
				4822	*
				4823	* Handle the feedback on @bfqq budget at queue expiration.
				4824	* See the body for detailed comments.
				4825	*/
				4826	static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
				4827	struct bfq_queue *bfqq,
				4828	enum bfqq_expiration reason)
				4829	{
				4830	struct request *next_rq;
				4831	int budget, min_budget;
				4832
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4833	min_budget = bfq_min_budget(bfqd);
				4834
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4835	if (bfqq->wr_coeff == 1)
				4836	budget = bfqq->max_budget;
				4837	else /*
				4838	* Use a constant, low budget for weight-raised queues,
				4839	* to help achieve a low latency. Keep it slightly higher
				4840	* than the minimum possible budget, to cause a little
				4841	* bit fewer expirations.
				4842	*/
				4843	budget = 2 * min_budget;
				4844
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4845	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
				4846	bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
				4847	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
				4848	budget, bfq_min_budget(bfqd));
				4849	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
				4850	bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
				4851
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4852	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4853	switch (reason) {
				4854	/*
				4855	* Caveat: in all the following cases we trade latency
				4856	* for throughput.
				4857	*/
				4858	case BFQQE_TOO_IDLE:
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4859	/*
				4860	* This is the only case where we may reduce
				4861	* the budget: if there is no request of the
				4862	* process still waiting for completion, then
				4863	* we assume (tentatively) that the timer has
				4864	* expired because the batch of requests of
				4865	* the process could have been served with a
				4866	* smaller budget. Hence, betting that
				4867	* process will behave in the same way when it
				4868	* becomes backlogged again, we reduce its
				4869	* next budget. As long as we guess right,
				4870	* this budget cut reduces the latency
				4871	* experienced by the process.
				4872	*
				4873	* However, if there are still outstanding
				4874	* requests, then the process may have not yet
				4875	* issued its next request just because it is
				4876	* still waiting for the completion of some of
				4877	* the still outstanding ones. So in this
				4878	* subcase we do not reduce its budget, on the
				4879	* contrary we increase it to possibly boost
				4880	* the throughput, as discussed in the
				4881	* comments to the BUDGET_TIMEOUT case.
				4882	*/
				4883	if (bfqq->dispatched > 0) /* still outstanding reqs */
				4884	budget = min(budget * 2, bfqd->bfq_max_budget);
				4885	else {
				4886	if (budget > 5 * min_budget)
				4887	budget -= 4 * min_budget;
				4888	else
				4889	budget = min_budget;
				4890	}
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4891	break;
				4892	case BFQQE_BUDGET_TIMEOUT:
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4893	/*
				4894	* We double the budget here because it gives
				4895	* the chance to boost the throughput if this
				4896	* is not a seeky process (and has bumped into
				4897	* this timeout because of, e.g., ZBR).
				4898	*/
				4899	budget = min(budget * 2, bfqd->bfq_max_budget);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4900	break;
				4901	case BFQQE_BUDGET_EXHAUSTED:
				4902	/*
				4903	* The process still has backlog, and did not
				4904	* let either the budget timeout or the disk
				4905	* idling timeout expire. Hence it is not
				4906	* seeky, has a short thinktime and may be
				4907	* happy with a higher budget too. So
				4908	* definitely increase the budget of this good
				4909	* candidate to boost the disk throughput.
				4910	*/
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	4911	budget = min(budget * 4, bfqd->bfq_max_budget);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4912	break;
				4913	case BFQQE_NO_MORE_REQUESTS:
				4914	/*
				4915	* For queues that expire for this reason, it
				4916	* is particularly important to keep the
				4917	* budget close to the actual service they
				4918	* need. Doing so reduces the timestamp
				4919	* misalignment problem described in the
				4920	* comments in the body of
				4921	* __bfq_activate_entity. In fact, suppose
				4922	* that a queue systematically expires for
				4923	* BFQQE_NO_MORE_REQUESTS and presents a
				4924	* new request in time to enjoy timestamp
				4925	* back-shifting. The larger the budget of the
				4926	* queue is with respect to the service the
				4927	* queue actually requests in each service
				4928	* slot, the more times the queue can be
				4929	* reactivated with the same virtual finish
				4930	* time. It follows that, even if this finish
				4931	* time is pushed to the system virtual time
				4932	* to reduce the consequent timestamp
				4933	* misalignment, the queue unjustly enjoys for
				4934	* many re-activations a lower finish time
				4935	* than all newly activated queues.
				4936	*
				4937	* The service needed by bfqq is measured
				4938	* quite precisely by bfqq->entity.service.
				4939	* Since bfqq does not enjoy device idling,
				4940	* bfqq->entity.service is equal to the number
				4941	* of sectors that the process associated with
				4942	* bfqq requested to read/write before waiting
				4943	* for request completions, or blocking for
				4944	* other reasons.
				4945	*/
				4946	budget = max_t(int, bfqq->entity.service, min_budget);
				4947	break;
				4948	default:
				4949	return;
				4950	}
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	4951	} else if (!bfq_bfqq_sync(bfqq)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4952	/*
				4953	* Async queues get always the maximum possible
				4954	* budget, as for them we do not care about latency
				4955	* (in addition, their ability to dispatch is limited
				4956	* by the charging factor).
				4957	*/
				4958	budget = bfqd->bfq_max_budget;
				4959	}
				4960
				4961	bfqq->max_budget = budget;
				4962
				4963	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
				4964	!bfqd->bfq_user_max_budget)
				4965	bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
				4966
				4967	/*
				4968	* If there is still backlog, then assign a new budget, making
				4969	* sure that it is large enough for the next request. Since
				4970	* the finish time of bfqq must be kept in sync with the
				4971	* budget, be sure to call __bfq_bfqq_expire() after this
				4972	* update.
				4973	*
				4974	* If there is no backlog, then no need to update the budget;
				4975	* it will be updated on the arrival of a new request.
				4976	*/
				4977	next_rq = bfqq->next_rq;
				4978	if (next_rq)
				4979	bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
				4980	bfq_serv_to_charge(next_rq, bfqq));
				4981
				4982	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
				4983	next_rq ? blk_rq_sectors(next_rq) : 0,
				4984	bfqq->entity.budget);
				4985	}
				4986
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	4987	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	4988	* Return true if the process associated with bfqq is "slow". The slow
				4989	* flag is used, in addition to the budget timeout, to reduce the
				4990	* amount of service provided to seeky processes, and thus reduce
				4991	* their chances to lower the throughput. More details in the comments
				4992	* on the function bfq_bfqq_expire().
				4993	*
				4994	* An important observation is in order: as discussed in the comments
				4995	* on the function bfq_update_peak_rate(), with devices with internal
				4996	* queues, it is hard if ever possible to know when and for how long
				4997	* an I/O request is processed by the device (apart from the trivial
				4998	* I/O pattern where a new request is dispatched only after the
				4999	* previous one has been completed). This makes it hard to evaluate
				5000	* the real rate at which the I/O requests of each bfq_queue are
				5001	* served. In fact, for an I/O scheduler like BFQ, serving a
				5002	* bfq_queue means just dispatching its requests during its service
				5003	* slot (i.e., until the budget of the queue is exhausted, or the
				5004	* queue remains idle, or, finally, a timeout fires). But, during the
				5005	* service slot of a bfq_queue, around 100 ms at most, the device may
				5006	* be even still processing requests of bfq_queues served in previous
				5007	* service slots. On the opposite end, the requests of the in-service
				5008	* bfq_queue may be completed after the service slot of the queue
				5009	* finishes.
				5010	*
				5011	* Anyway, unless more sophisticated solutions are used
				5012	* (where possible), the sum of the sizes of the requests dispatched
				5013	* during the service slot of a bfq_queue is probably the only
				5014	* approximation available for the service received by the bfq_queue
				5015	* during its service slot. And this sum is the quantity used in this
				5016	* function to evaluate the I/O speed of a process.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5017	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5018	static bool bfq_bfqq_is_slow(struct bfq_data bfqd, struct bfq_queue bfqq,
				5019	bool compensate, enum bfqq_expiration reason,
				5020	unsigned long *delta_ms)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5021	{
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5022	ktime_t delta_ktime;
				5023	u32 delta_usecs;
				5024	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5025
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5026	if (!bfq_bfqq_sync(bfqq))
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5027	return false;
				5028
				5029	if (compensate)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5030	delta_ktime = bfqd->last_idling_start;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5031	else
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5032	delta_ktime = ktime_get();
				5033	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
				5034	delta_usecs = ktime_to_us(delta_ktime);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5035
				5036	/* don't use too short time intervals */
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5037	if (delta_usecs < 1000) {
				5038	if (blk_queue_nonrot(bfqd->queue))
				5039	/*
				5040	* give same worst-case guarantees as idling
				5041	* for seeky
				5042	*/
				5043	*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
				5044	else /* charge at least one seek */
				5045	*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5046
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5047	return slow;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5048	}
				5049
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5050	*delta_ms = delta_usecs / USEC_PER_MSEC;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5051
				5052	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5053	* Use only long (> 20ms) intervals to filter out excessive
				5054	* spikes in service rate estimation.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5055	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5056	if (delta_usecs > 20000) {
				5057	/*
				5058	* Caveat for rotational devices: processes doing I/O
				5059	* in the slower disk zones tend to be slow(er) even
				5060	* if not seeky. In this respect, the estimated peak
				5061	* rate is likely to be an average over the disk
				5062	* surface. Accordingly, to not be too harsh with
				5063	* unlucky processes, a process is deemed slow only if
				5064	* its rate has been lower than half of the estimated
				5065	* peak rate.
				5066	*/
				5067	slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
				5068	}
				5069
				5070	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
				5071
				5072	return slow;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5073	}
				5074
				5075	/*
				5076	* Return the farthest past time instant according to jiffies
				5077	* macros.
				5078	*/
				5079	static unsigned long bfq_smallest_from_now(void)
				5080	{
				5081	return jiffies - MAX_JIFFY_OFFSET;
				5082	}
				5083
				5084	/**
				5085	* bfq_bfqq_expire - expire a queue.
				5086	* @bfqd: device owning the queue.
				5087	* @bfqq: the queue to expire.
				5088	* @compensate: if true, compensate for the time spent idling.
				5089	* @reason: the reason causing the expiration.
				5090	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	5091	* If the process associated with bfqq does slow I/O (e.g., because it
				5092	* issues random requests), we charge bfqq with the time it has been
				5093	* in service instead of the service it has received (see
				5094	* bfq_bfqq_charge_time for details on how this goal is achieved). As
				5095	* a consequence, bfqq will typically get higher timestamps upon
				5096	* reactivation, and hence it will be rescheduled as if it had
				5097	* received more service than what it has actually received. In the
				5098	* end, bfqq receives less service in proportion to how slowly its
				5099	* associated process consumes its budgets (and hence how seriously it
				5100	* tends to lower the throughput). In addition, this time-charging
				5101	* strategy guarantees time fairness among slow processes. In
				5102	* contrast, if the process associated with bfqq is not slow, we
				5103	* charge bfqq exactly with the service it has received.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5104	*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	5105	* Charging time to the first type of queues and the exact service to
				5106	* the other has the effect of using the WF2Q+ policy to schedule the
				5107	* former on a timeslice basis, without violating service domain
				5108	* guarantees among the latter.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5109	*/
				5110	static void bfq_bfqq_expire(struct bfq_data *bfqd,
				5111	struct bfq_queue *bfqq,
				5112	bool compensate,
				5113	enum bfqq_expiration reason)
				5114	{
				5115	bool slow;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5116	unsigned long delta = 0;
				5117	struct bfq_entity *entity = &bfqq->entity;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5118	int ref;
				5119
				5120	/*
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5121	* Check whether the process is slow (see bfq_bfqq_is_slow).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5122	*/
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5123	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5124
				5125	/*
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	5126	* As above explained, charge slow (typically seeky) and
				5127	* timed-out queues with the time and not the service
				5128	* received, to favor sequential workloads.
				5129	*
				5130	* Processes doing I/O in the slower disk zones will tend to
				5131	* be slow(er) even if not seeky. Therefore, since the
				5132	* estimated peak rate is actually an average over the disk
				5133	* surface, these processes may timeout just for bad luck. To
				5134	* avoid punishing them, do not charge time to processes that
				5135	* succeeded in consuming at least 2/3 of their budget. This
				5136	* allows BFQ to preserve enough elasticity to still perform
				5137	* bandwidth, and not time, distribution with little unlucky
				5138	* or quasi-sequential processes.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5139	*/
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5140	if (bfqq->wr_coeff == 1 &&
				5141	(slow \|\|
				5142	(reason == BFQQE_BUDGET_TIMEOUT &&
				5143	bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
Paolo Valente	c074170e	2017-04-12 18:23:11 +0200	[diff] [blame]	5144	bfq_bfqq_charge_time(bfqd, bfqq, delta);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5145
				5146	if (reason == BFQQE_TOO_IDLE &&
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5147	entity->service <= 2 * entity->budget / 10)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5148	bfq_clear_bfqq_IO_bound(bfqq);
				5149
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5150	if (bfqd->low_latency && bfqq->wr_coeff == 1)
				5151	bfqq->last_wr_start_finish = jiffies;
				5152
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5153	bfq_log_bfqq(bfqd, bfqq,
				5154	"expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
				5155	slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
				5156
				5157	/*
				5158	* Increase, decrease or leave budget unchanged according to
				5159	* reason.
				5160	*/
				5161	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
				5162	ref = bfqq->ref;
				5163	__bfq_bfqq_expire(bfqd, bfqq);
				5164
				5165	/* mark bfqq as waiting a request only if a bic still points to it */
				5166	if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
				5167	reason != BFQQE_BUDGET_TIMEOUT &&
				5168	reason != BFQQE_BUDGET_EXHAUSTED)
				5169	bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
				5170	}
				5171
				5172	/*
				5173	* Budget timeout is not implemented through a dedicated timer, but
				5174	* just checked on request arrivals and completions, as well as on
				5175	* idle timer expirations.
				5176	*/
				5177	static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
				5178	{
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5179	return time_is_before_eq_jiffies(bfqq->budget_timeout);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5180	}
				5181
				5182	/*
				5183	* If we expire a queue that is actively waiting (i.e., with the
				5184	* device idled) for the arrival of a new request, then we may incur
				5185	* the timestamp misalignment problem described in the body of the
				5186	* function __bfq_activate_entity. Hence we return true only if this
				5187	* condition does not hold, or if the queue is slow enough to deserve
				5188	* only to be kicked off for preserving a high throughput.
				5189	*/
				5190	static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
				5191	{
				5192	bfq_log_bfqq(bfqq->bfqd, bfqq,
				5193	"may_budget_timeout: wait_request %d left %d timeout %d",
				5194	bfq_bfqq_wait_request(bfqq),
				5195	bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
				5196	bfq_bfqq_budget_timeout(bfqq));
				5197
				5198	return (!bfq_bfqq_wait_request(bfqq) \|\|
				5199	bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)
				5200	&&
				5201	bfq_bfqq_budget_timeout(bfqq);
				5202	}
				5203
				5204	/*
				5205	* For a queue that becomes empty, device idling is allowed only if
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5206	* this function returns true for the queue. As a consequence, since
				5207	* device idling plays a critical role in both throughput boosting and
				5208	* service guarantees, the return value of this function plays a
				5209	* critical role in both these aspects as well.
				5210	*
				5211	* In a nutshell, this function returns true only if idling is
				5212	* beneficial for throughput or, even if detrimental for throughput,
				5213	* idling is however necessary to preserve service guarantees (low
				5214	* latency, desired throughput distribution, ...). In particular, on
				5215	* NCQ-capable devices, this function tries to return false, so as to
				5216	* help keep the drives' internal queues full, whenever this helps the
				5217	* device boost the throughput without causing any service-guarantee
				5218	* issue.
				5219	*
				5220	* In more detail, the return value of this function is obtained by,
				5221	* first, computing a number of boolean variables that take into
				5222	* account throughput and service-guarantee issues, and, then,
				5223	* combining these variables in a logical expression. Most of the
				5224	* issues taken into account are not trivial. We discuss these issues
				5225	* individually while introducing the variables.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5226	*/
				5227	static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
				5228	{
				5229	struct bfq_data *bfqd = bfqq->bfqd;
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5230	bool idling_boosts_thr, asymmetric_scenario;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5231
				5232	if (bfqd->strict_guarantees)
				5233	return true;
				5234
				5235	/*
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5236	* The next variable takes into account the cases where idling
				5237	* boosts the throughput.
				5238	*
				5239	* The value of the variable is computed considering that
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5240	* idling is usually beneficial for the throughput if:
				5241	* (a) the device is not NCQ-capable, or
				5242	* (b) regardless of the presence of NCQ, the request pattern
				5243	* for bfqq is I/O-bound (possible throughput losses
				5244	* caused by granting idling to seeky queues are mitigated
				5245	* by the fact that, in all scenarios where boosting
				5246	* throughput is the best thing to do, i.e., in all
				5247	* symmetric scenarios, only a minimal idle time is
				5248	* allowed to seeky queues).
				5249	*/
				5250	idling_boosts_thr = !bfqd->hw_tag \|\| bfq_bfqq_IO_bound(bfqq);
				5251
				5252	/*
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5253	* There is then a case where idling must be performed not for
				5254	* throughput concerns, but to preserve service guarantees. To
				5255	* introduce it, we can note that allowing the drive to
				5256	* enqueue more than one request at a time, and hence
				5257	* delegating de facto final scheduling decisions to the
				5258	* drive's internal scheduler, causes loss of control on the
				5259	* actual request service order. In particular, the critical
				5260	* situation is when requests from different processes happens
				5261	* to be present, at the same time, in the internal queue(s)
				5262	* of the drive. In such a situation, the drive, by deciding
				5263	* the service order of the internally-queued requests, does
				5264	* determine also the actual throughput distribution among
				5265	* these processes. But the drive typically has no notion or
				5266	* concern about per-process throughput distribution, and
				5267	* makes its decisions only on a per-request basis. Therefore,
				5268	* the service distribution enforced by the drive's internal
				5269	* scheduler is likely to coincide with the desired
				5270	* device-throughput distribution only in a completely
				5271	* symmetric scenario where: (i) each of these processes must
				5272	* get the same throughput as the others; (ii) all these
				5273	* processes have the same I/O pattern (either sequential or
				5274	* random). In fact, in such a scenario, the drive will tend
				5275	* to treat the requests of each of these processes in about
				5276	* the same way as the requests of the others, and thus to
				5277	* provide each of these processes with about the same
				5278	* throughput (which is exactly the desired throughput
				5279	* distribution). In contrast, in any asymmetric scenario,
				5280	* device idling is certainly needed to guarantee that bfqq
				5281	* receives its assigned fraction of the device throughput
				5282	* (see [1] for details).
				5283	*
				5284	* As for sub-condition (i), actually we check only whether
				5285	* bfqq is being weight-raised. In fact, if bfqq is not being
				5286	* weight-raised, we have that:
				5287	* - if the process associated with bfqq is not I/O-bound, then
				5288	* it is not either latency- or throughput-critical; therefore
				5289	* idling is not needed for bfqq;
				5290	* - if the process asociated with bfqq is I/O-bound, then
				5291	* idling is already granted with bfqq (see the comments on
				5292	* idling_boosts_thr).
				5293	*
				5294	* We do not check sub-condition (ii) at all, i.e., the next
				5295	* variable is true if and only if bfqq is being
				5296	* weight-raised. We do not need to control sub-condition (ii)
				5297	* for the following reason:
				5298	* - if bfqq is being weight-raised, then idling is already
				5299	* guaranteed to bfqq by sub-condition (i);
				5300	* - if bfqq is not being weight-raised, then idling is
				5301	* already guaranteed to bfqq (only) if it matters, i.e., if
				5302	* bfqq is associated to a currently I/O-bound process (see
				5303	* the above comment on sub-condition (i)).
				5304	*
				5305	* As a side note, it is worth considering that the above
				5306	* device-idling countermeasures may however fail in the
				5307	* following unlucky scenario: if idling is (correctly)
				5308	* disabled in a time period during which the symmetry
				5309	* sub-condition holds, and hence the device is allowed to
				5310	* enqueue many requests, but at some later point in time some
				5311	* sub-condition stops to hold, then it may become impossible
				5312	* to let requests be served in the desired order until all
				5313	* the requests already queued in the device have been served.
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5314	*/
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5315	asymmetric_scenario = bfqq->wr_coeff > 1;
				5316
				5317	/*
				5318	* We have now all the components we need to compute the return
				5319	* value of the function, which is true only if both the following
				5320	* conditions hold:
				5321	* 1) bfqq is sync, because idling make sense only for sync queues;
				5322	* 2) idling either boosts the throughput (without issues), or
				5323	* is necessary to preserve service guarantees.
				5324	*/
				5325	return bfq_bfqq_sync(bfqq) &&
				5326	(idling_boosts_thr \|\| asymmetric_scenario);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5327	}
				5328
				5329	/*
				5330	* If the in-service queue is empty but the function bfq_bfqq_may_idle
				5331	* returns true, then:
				5332	* 1) the queue must remain in service and cannot be expired, and
				5333	* 2) the device must be idled to wait for the possible arrival of a new
				5334	* request for the queue.
				5335	* See the comments on the function bfq_bfqq_may_idle for the reasons
				5336	* why performing device idling is the best choice to boost the throughput
				5337	* and preserve service guarantees when bfq_bfqq_may_idle itself
				5338	* returns true.
				5339	*/
				5340	static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
				5341	{
				5342	struct bfq_data *bfqd = bfqq->bfqd;
				5343
				5344	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
				5345	bfq_bfqq_may_idle(bfqq);
				5346	}
				5347
				5348	/*
				5349	* Select a queue for service. If we have a current queue in service,
				5350	* check whether to continue servicing it, or retrieve and set a new one.
				5351	*/
				5352	static struct bfq_queue bfq_select_queue(struct bfq_data bfqd)
				5353	{
				5354	struct bfq_queue *bfqq;
				5355	struct request *next_rq;
				5356	enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
				5357
				5358	bfqq = bfqd->in_service_queue;
				5359	if (!bfqq)
				5360	goto new_queue;
				5361
				5362	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
				5363
				5364	if (bfq_may_expire_for_budg_timeout(bfqq) &&
				5365	!bfq_bfqq_wait_request(bfqq) &&
				5366	!bfq_bfqq_must_idle(bfqq))
				5367	goto expire;
				5368
				5369	check_queue:
				5370	/*
				5371	* This loop is rarely executed more than once. Even when it
				5372	* happens, it is much more convenient to re-execute this loop
				5373	* than to return NULL and trigger a new dispatch to get a
				5374	* request served.
				5375	*/
				5376	next_rq = bfqq->next_rq;
				5377	/*
				5378	* If bfqq has requests queued and it has enough budget left to
				5379	* serve them, keep the queue, otherwise expire it.
				5380	*/
				5381	if (next_rq) {
				5382	if (bfq_serv_to_charge(next_rq, bfqq) >
				5383	bfq_bfqq_budget_left(bfqq)) {
				5384	/*
				5385	* Expire the queue for budget exhaustion,
				5386	* which makes sure that the next budget is
				5387	* enough to serve the next request, even if
				5388	* it comes from the fifo expired path.
				5389	*/
				5390	reason = BFQQE_BUDGET_EXHAUSTED;
				5391	goto expire;
				5392	} else {
				5393	/*
				5394	* The idle timer may be pending because we may
				5395	* not disable disk idling even when a new request
				5396	* arrives.
				5397	*/
				5398	if (bfq_bfqq_wait_request(bfqq)) {
				5399	/*
				5400	* If we get here: 1) at least a new request
				5401	* has arrived but we have not disabled the
				5402	* timer because the request was too small,
				5403	* 2) then the block layer has unplugged
				5404	* the device, causing the dispatch to be
				5405	* invoked.
				5406	*
				5407	* Since the device is unplugged, now the
				5408	* requests are probably large enough to
				5409	* provide a reasonable throughput.
				5410	* So we disable idling.
				5411	*/
				5412	bfq_clear_bfqq_wait_request(bfqq);
				5413	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5414	bfqg_stats_update_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5415	}
				5416	goto keep_queue;
				5417	}
				5418	}
				5419
				5420	/*
				5421	* No requests pending. However, if the in-service queue is idling
				5422	* for a new request, or has requests waiting for a completion and
				5423	* may idle after their completion, then keep it anyway.
				5424	*/
				5425	if (bfq_bfqq_wait_request(bfqq) \|\|
				5426	(bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
				5427	bfqq = NULL;
				5428	goto keep_queue;
				5429	}
				5430
				5431	reason = BFQQE_NO_MORE_REQUESTS;
				5432	expire:
				5433	bfq_bfqq_expire(bfqd, bfqq, false, reason);
				5434	new_queue:
				5435	bfqq = bfq_set_in_service_queue(bfqd);
				5436	if (bfqq) {
				5437	bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
				5438	goto check_queue;
				5439	}
				5440	keep_queue:
				5441	if (bfqq)
				5442	bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
				5443	else
				5444	bfq_log(bfqd, "select_queue: no queue returned");
				5445
				5446	return bfqq;
				5447	}
				5448
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5449	static void bfq_update_wr_data(struct bfq_data bfqd, struct bfq_queue bfqq)
				5450	{
				5451	struct bfq_entity *entity = &bfqq->entity;
				5452
				5453	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
				5454	bfq_log_bfqq(bfqd, bfqq,
				5455	"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
				5456	jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
				5457	jiffies_to_msecs(bfqq->wr_cur_max_time),
				5458	bfqq->wr_coeff,
				5459	bfqq->entity.weight, bfqq->entity.orig_weight);
				5460
				5461	if (entity->prio_changed)
				5462	bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
				5463
				5464	/*
				5465	* If too much time has elapsed from the beginning of
				5466	* this weight-raising period, then end weight
				5467	* raising.
				5468	*/
				5469	if (time_is_before_jiffies(bfqq->last_wr_start_finish +
				5470	bfqq->wr_cur_max_time)) {
				5471	bfqq->last_wr_start_finish = jiffies;
				5472	bfq_log_bfqq(bfqd, bfqq,
				5473	"wrais ending at %lu, rais_max_time %u",
				5474	bfqq->last_wr_start_finish,
				5475	jiffies_to_msecs(bfqq->wr_cur_max_time));
				5476	bfq_bfqq_end_wr(bfqq);
				5477	}
				5478	}
				5479	/* Update weight both if it must be raised and if it must be lowered */
				5480	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
				5481	__bfq_entity_update_weight_prio(
				5482	bfq_entity_service_tree(entity),
				5483	entity);
				5484	}
				5485
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5486	/*
				5487	* Dispatch next request from bfqq.
				5488	*/
				5489	static struct request bfq_dispatch_rq_from_bfqq(struct bfq_data bfqd,
				5490	struct bfq_queue *bfqq)
				5491	{
				5492	struct request *rq = bfqq->next_rq;
				5493	unsigned long service_to_charge;
				5494
				5495	service_to_charge = bfq_serv_to_charge(rq, bfqq);
				5496
				5497	bfq_bfqq_served(bfqq, service_to_charge);
				5498
				5499	bfq_dispatch_remove(bfqd->queue, rq);
				5500
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5501	/*
				5502	* If weight raising has to terminate for bfqq, then next
				5503	* function causes an immediate update of bfqq's weight,
				5504	* without waiting for next activation. As a consequence, on
				5505	* expiration, bfqq will be timestamped as if has never been
				5506	* weight-raised during this service slot, even if it has
				5507	* received part or even most of the service as a
				5508	* weight-raised queue. This inflates bfqq's timestamps, which
				5509	* is beneficial, as bfqq is then more willing to leave the
				5510	* device immediately to possible other weight-raised queues.
				5511	*/
				5512	bfq_update_wr_data(bfqd, bfqq);
				5513
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5514	if (!bfqd->in_service_bic) {
				5515	atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
				5516	bfqd->in_service_bic = RQ_BIC(rq);
				5517	}
				5518
				5519	/*
				5520	* Expire bfqq, pretending that its budget expired, if bfqq
				5521	* belongs to CLASS_IDLE and other queues are waiting for
				5522	* service.
				5523	*/
				5524	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
				5525	goto expire;
				5526
				5527	return rq;
				5528
				5529	expire:
				5530	bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
				5531	return rq;
				5532	}
				5533
				5534	static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
				5535	{
				5536	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5537
				5538	/*
				5539	* Avoiding lock: a race on bfqd->busy_queues should cause at
				5540	* most a call to dispatch for nothing
				5541	*/
				5542	return !list_empty_careful(&bfqd->dispatch) \|\|
				5543	bfqd->busy_queues > 0;
				5544	}
				5545
				5546	static struct request __bfq_dispatch_request(struct blk_mq_hw_ctx hctx)
				5547	{
				5548	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5549	struct request *rq = NULL;
				5550	struct bfq_queue *bfqq = NULL;
				5551
				5552	if (!list_empty(&bfqd->dispatch)) {
				5553	rq = list_first_entry(&bfqd->dispatch, struct request,
				5554	queuelist);
				5555	list_del_init(&rq->queuelist);
				5556
				5557	bfqq = RQ_BFQQ(rq);
				5558
				5559	if (bfqq) {
				5560	/*
				5561	* Increment counters here, because this
				5562	* dispatch does not follow the standard
				5563	* dispatch flow (where counters are
				5564	* incremented)
				5565	*/
				5566	bfqq->dispatched++;
				5567
				5568	goto inc_in_driver_start_rq;
				5569	}
				5570
				5571	/*
				5572	* We exploit the put_rq_private hook to decrement
				5573	* rq_in_driver, but put_rq_private will not be
				5574	* invoked on this request. So, to avoid unbalance,
				5575	* just start this request, without incrementing
				5576	* rq_in_driver. As a negative consequence,
				5577	* rq_in_driver is deceptively lower than it should be
				5578	* while this request is in service. This may cause
				5579	* bfq_schedule_dispatch to be invoked uselessly.
				5580	*
				5581	* As for implementing an exact solution, the
				5582	* put_request hook, if defined, is probably invoked
				5583	* also on this request. So, by exploiting this hook,
				5584	* we could 1) increment rq_in_driver here, and 2)
				5585	* decrement it in put_request. Such a solution would
				5586	* let the value of the counter be always accurate,
				5587	* but it would entail using an extra interface
				5588	* function. This cost seems higher than the benefit,
				5589	* being the frequency of non-elevator-private
				5590	* requests very low.
				5591	*/
				5592	goto start_rq;
				5593	}
				5594
				5595	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
				5596
				5597	if (bfqd->busy_queues == 0)
				5598	goto exit;
				5599
				5600	/*
				5601	* Force device to serve one request at a time if
				5602	* strict_guarantees is true. Forcing this service scheme is
				5603	* currently the ONLY way to guarantee that the request
				5604	* service order enforced by the scheduler is respected by a
				5605	* queueing device. Otherwise the device is free even to make
				5606	* some unlucky request wait for as long as the device
				5607	* wishes.
				5608	*
				5609	* Of course, serving one request at at time may cause loss of
				5610	* throughput.
				5611	*/
				5612	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
				5613	goto exit;
				5614
				5615	bfqq = bfq_select_queue(bfqd);
				5616	if (!bfqq)
				5617	goto exit;
				5618
				5619	rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
				5620
				5621	if (rq) {
				5622	inc_in_driver_start_rq:
				5623	bfqd->rq_in_driver++;
				5624	start_rq:
				5625	rq->rq_flags \|= RQF_STARTED;
				5626	}
				5627	exit:
				5628	return rq;
				5629	}
				5630
				5631	static struct request bfq_dispatch_request(struct blk_mq_hw_ctx hctx)
				5632	{
				5633	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
				5634	struct request *rq;
				5635
				5636	spin_lock_irq(&bfqd->lock);
				5637	rq = __bfq_dispatch_request(hctx);
				5638	spin_unlock_irq(&bfqd->lock);
				5639
				5640	return rq;
				5641	}
				5642
				5643	/*
				5644	* Task holds one reference to the queue, dropped when task exits. Each rq
				5645	* in-flight on this queue also holds a reference, dropped when rq is freed.
				5646	*
				5647	* Scheduler lock must be held here. Recall not to use bfqq after calling
				5648	* this function on it.
				5649	*/
				5650	static void bfq_put_queue(struct bfq_queue *bfqq)
				5651	{
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5652	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5653	struct bfq_group *bfqg = bfqq_group(bfqq);
				5654	#endif
				5655
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5656	if (bfqq->bfqd)
				5657	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
				5658	bfqq, bfqq->ref);
				5659
				5660	bfqq->ref--;
				5661	if (bfqq->ref)
				5662	return;
				5663
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5664	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
				5665
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5666	kmem_cache_free(bfq_pool, bfqq);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5667	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				5668	bfqg_put(bfqg);
				5669	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5670	}
				5671
				5672	static void bfq_exit_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq)
				5673	{
				5674	if (bfqq == bfqd->in_service_queue) {
				5675	__bfq_bfqq_expire(bfqd, bfqq);
				5676	bfq_schedule_dispatch(bfqd);
				5677	}
				5678
				5679	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
				5680
				5681	bfq_put_queue(bfqq); /* release process reference */
				5682	}
				5683
				5684	static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
				5685	{
				5686	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
				5687	struct bfq_data *bfqd;
				5688
				5689	if (bfqq)
				5690	bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
				5691
				5692	if (bfqq && bfqd) {
				5693	unsigned long flags;
				5694
				5695	spin_lock_irqsave(&bfqd->lock, flags);
				5696	bfq_exit_bfqq(bfqd, bfqq);
				5697	bic_set_bfqq(bic, NULL, is_sync);
				5698	spin_unlock_irq(&bfqd->lock);
				5699	}
				5700	}
				5701
				5702	static void bfq_exit_icq(struct io_cq *icq)
				5703	{
				5704	struct bfq_io_cq *bic = icq_to_bic(icq);
				5705
				5706	bfq_exit_icq_bfqq(bic, true);
				5707	bfq_exit_icq_bfqq(bic, false);
				5708	}
				5709
				5710	/*
				5711	* Update the entity prio values; note that the new values will not
				5712	* be used until the next (re)activation.
				5713	*/
				5714	static void
				5715	bfq_set_next_ioprio_data(struct bfq_queue bfqq, struct bfq_io_cq bic)
				5716	{
				5717	struct task_struct *tsk = current;
				5718	int ioprio_class;
				5719	struct bfq_data *bfqd = bfqq->bfqd;
				5720
				5721	if (!bfqd)
				5722	return;
				5723
				5724	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
				5725	switch (ioprio_class) {
				5726	default:
				5727	dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
				5728	"bfq: bad prio class %d\n", ioprio_class);
				5729	case IOPRIO_CLASS_NONE:
				5730	/*
				5731	* No prio set, inherit CPU scheduling settings.
				5732	*/
				5733	bfqq->new_ioprio = task_nice_ioprio(tsk);
				5734	bfqq->new_ioprio_class = task_nice_ioclass(tsk);
				5735	break;
				5736	case IOPRIO_CLASS_RT:
				5737	bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5738	bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
				5739	break;
				5740	case IOPRIO_CLASS_BE:
				5741	bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5742	bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
				5743	break;
				5744	case IOPRIO_CLASS_IDLE:
				5745	bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
				5746	bfqq->new_ioprio = 7;
				5747	bfq_clear_bfqq_idle_window(bfqq);
				5748	break;
				5749	}
				5750
				5751	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
				5752	pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
				5753	bfqq->new_ioprio);
				5754	bfqq->new_ioprio = IOPRIO_BE_NR;
				5755	}
				5756
				5757	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
				5758	bfqq->entity.prio_changed = 1;
				5759	}
				5760
				5761	static void bfq_check_ioprio_change(struct bfq_io_cq bic, struct bio bio)
				5762	{
				5763	struct bfq_data *bfqd = bic_to_bfqd(bic);
				5764	struct bfq_queue *bfqq;
				5765	int ioprio = bic->icq.ioc->ioprio;
				5766
				5767	/*
				5768	* This condition may trigger on a newly created bic, be sure to
				5769	* drop the lock before returning.
				5770	*/
				5771	if (unlikely(!bfqd) \|\| likely(bic->ioprio == ioprio))
				5772	return;
				5773
				5774	bic->ioprio = ioprio;
				5775
				5776	bfqq = bic_to_bfqq(bic, false);
				5777	if (bfqq) {
				5778	/* release process reference on this queue */
				5779	bfq_put_queue(bfqq);
				5780	bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
				5781	bic_set_bfqq(bic, bfqq, false);
				5782	}
				5783
				5784	bfqq = bic_to_bfqq(bic, true);
				5785	if (bfqq)
				5786	bfq_set_next_ioprio_data(bfqq, bic);
				5787	}
				5788
				5789	static void bfq_init_bfqq(struct bfq_data bfqd, struct bfq_queue bfqq,
				5790	struct bfq_io_cq *bic, pid_t pid, int is_sync)
				5791	{
				5792	RB_CLEAR_NODE(&bfqq->entity.rb_node);
				5793	INIT_LIST_HEAD(&bfqq->fifo);
				5794
				5795	bfqq->ref = 0;
				5796	bfqq->bfqd = bfqd;
				5797
				5798	if (bic)
				5799	bfq_set_next_ioprio_data(bfqq, bic);
				5800
				5801	if (is_sync) {
				5802	if (!bfq_class_idle(bfqq))
				5803	bfq_mark_bfqq_idle_window(bfqq);
				5804	bfq_mark_bfqq_sync(bfqq);
				5805	} else
				5806	bfq_clear_bfqq_sync(bfqq);
				5807
				5808	/* set end request to minus infinity from now */
				5809	bfqq->ttime.last_end_request = ktime_get_ns() + 1;
				5810
				5811	bfq_mark_bfqq_IO_bound(bfqq);
				5812
				5813	bfqq->pid = pid;
				5814
				5815	/* Tentative initial value to trade off between thr and lat */
Paolo Valente	54b6045	2017-04-12 18:23:09 +0200	[diff] [blame]	5816	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5817	bfqq->budget_timeout = bfq_smallest_from_now();
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5818
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5819	bfqq->wr_coeff = 1;
				5820	bfqq->last_wr_start_finish = bfq_smallest_from_now();
				5821
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5822	/* first request is almost certainly seeky */
				5823	bfqq->seek_history = 1;
				5824	}
				5825
				5826	static struct bfq_queue *bfq_async_queue_prio(struct bfq_data bfqd,
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5827	struct bfq_group *bfqg,
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5828	int ioprio_class, int ioprio)
				5829	{
				5830	switch (ioprio_class) {
				5831	case IOPRIO_CLASS_RT:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5832	return &bfqg->async_bfqq[0][ioprio];
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5833	case IOPRIO_CLASS_NONE:
				5834	ioprio = IOPRIO_NORM;
				5835	/* fall through */
				5836	case IOPRIO_CLASS_BE:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5837	return &bfqg->async_bfqq[1][ioprio];
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5838	case IOPRIO_CLASS_IDLE:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5839	return &bfqg->async_idle_bfqq;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5840	default:
				5841	return NULL;
				5842	}
				5843	}
				5844
				5845	static struct bfq_queue bfq_get_queue(struct bfq_data bfqd,
				5846	struct bio *bio, bool is_sync,
				5847	struct bfq_io_cq *bic)
				5848	{
				5849	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
				5850	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
				5851	struct bfq_queue **async_bfqq = NULL;
				5852	struct bfq_queue *bfqq;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5853	struct bfq_group *bfqg;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5854
				5855	rcu_read_lock();
				5856
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5857	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
				5858	if (!bfqg) {
				5859	bfqq = &bfqd->oom_bfqq;
				5860	goto out;
				5861	}
				5862
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5863	if (!is_sync) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5864	async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5865	ioprio);
				5866	bfqq = *async_bfqq;
				5867	if (bfqq)
				5868	goto out;
				5869	}
				5870
				5871	bfqq = kmem_cache_alloc_node(bfq_pool,
				5872	GFP_NOWAIT \| __GFP_ZERO \| __GFP_NOWARN,
				5873	bfqd->queue->node);
				5874
				5875	if (bfqq) {
				5876	bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
				5877	is_sync);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5878	bfq_init_entity(&bfqq->entity, bfqg);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5879	bfq_log_bfqq(bfqd, bfqq, "allocated");
				5880	} else {
				5881	bfqq = &bfqd->oom_bfqq;
				5882	bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
				5883	goto out;
				5884	}
				5885
				5886	/*
				5887	* Pin the queue now that it's allocated, scheduler exit will
				5888	* prune it.
				5889	*/
				5890	if (async_bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	5891	bfqq->ref++; /*
				5892	* Extra group reference, w.r.t. sync
				5893	* queue. This extra reference is removed
				5894	* only if bfqq->bfqg disappears, to
				5895	* guarantee that this queue is not freed
				5896	* until its group goes away.
				5897	*/
				5898	bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5899	bfqq, bfqq->ref);
				5900	*async_bfqq = bfqq;
				5901	}
				5902
				5903	out:
				5904	bfqq->ref++; /* get a process reference to this queue */
				5905	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
				5906	rcu_read_unlock();
				5907	return bfqq;
				5908	}
				5909
				5910	static void bfq_update_io_thinktime(struct bfq_data *bfqd,
				5911	struct bfq_queue *bfqq)
				5912	{
				5913	struct bfq_ttime *ttime = &bfqq->ttime;
				5914	u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
				5915
				5916	elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
				5917
				5918	ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
				5919	ttime->ttime_total = div_u64(7ttime->ttime_total + 256elapsed, 8);
				5920	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
				5921	ttime->ttime_samples);
				5922	}
				5923
				5924	static void
				5925	bfq_update_io_seektime(struct bfq_data bfqd, struct bfq_queue bfqq,
				5926	struct request *rq)
				5927	{
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5928	bfqq->seek_history <<= 1;
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	5929	bfqq->seek_history \|=
				5930	get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR &&
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5931	(!blk_queue_nonrot(bfqd->queue) \|\|
				5932	blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT);
				5933	}
				5934
				5935	/*
				5936	* Disable idle window if the process thinks too long or seeks so much that
				5937	* it doesn't matter.
				5938	*/
				5939	static void bfq_update_idle_window(struct bfq_data *bfqd,
				5940	struct bfq_queue *bfqq,
				5941	struct bfq_io_cq *bic)
				5942	{
				5943	int enable_idle;
				5944
				5945	/* Don't idle for async or idle io prio class. */
				5946	if (!bfq_bfqq_sync(bfqq) \|\| bfq_class_idle(bfqq))
				5947	return;
				5948
				5949	enable_idle = bfq_bfqq_idle_window(bfqq);
				5950
				5951	if (atomic_read(&bic->icq.ioc->active_ref) == 0 \|\|
				5952	bfqd->bfq_slice_idle == 0 \|\|
				5953	(bfqd->hw_tag && BFQQ_SEEKY(bfqq)))
				5954	enable_idle = 0;
				5955	else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	5956	if (bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
				5957	bfqq->wr_coeff == 1)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	5958	enable_idle = 0;
				5959	else
				5960	enable_idle = 1;
				5961	}
				5962	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
				5963	enable_idle);
				5964
				5965	if (enable_idle)
				5966	bfq_mark_bfqq_idle_window(bfqq);
				5967	else
				5968	bfq_clear_bfqq_idle_window(bfqq);
				5969	}
				5970
				5971	/*
				5972	* Called when a new fs request (rq) is added to bfqq. Check if there's
				5973	* something we should do about it.
				5974	*/
				5975	static void bfq_rq_enqueued(struct bfq_data bfqd, struct bfq_queue bfqq,
				5976	struct request *rq)
				5977	{
				5978	struct bfq_io_cq *bic = RQ_BIC(rq);
				5979
				5980	if (rq->cmd_flags & REQ_META)
				5981	bfqq->meta_pending++;
				5982
				5983	bfq_update_io_thinktime(bfqd, bfqq);
				5984	bfq_update_io_seektime(bfqd, bfqq, rq);
				5985	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 \|\|
				5986	!BFQQ_SEEKY(bfqq))
				5987	bfq_update_idle_window(bfqd, bfqq, bic);
				5988
				5989	bfq_log_bfqq(bfqd, bfqq,
				5990	"rq_enqueued: idle_window=%d (seeky %d)",
				5991	bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
				5992
				5993	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
				5994
				5995	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
				5996	bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
				5997	blk_rq_sectors(rq) < 32;
				5998	bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
				5999
				6000	/*
				6001	* There is just this request queued: if the request
				6002	* is small and the queue is not to be expired, then
				6003	* just exit.
				6004	*
				6005	* In this way, if the device is being idled to wait
				6006	* for a new request from the in-service queue, we
				6007	* avoid unplugging the device and committing the
				6008	* device to serve just a small request. On the
				6009	* contrary, we wait for the block layer to decide
				6010	* when to unplug the device: hopefully, new requests
				6011	* will be merged to this one quickly, then the device
				6012	* will be unplugged and larger requests will be
				6013	* dispatched.
				6014	*/
				6015	if (small_req && !budget_timeout)
				6016	return;
				6017
				6018	/*
				6019	* A large enough request arrived, or the queue is to
				6020	* be expired: in both cases disk idling is to be
				6021	* stopped, so clear wait_request flag and reset
				6022	* timer.
				6023	*/
				6024	bfq_clear_bfqq_wait_request(bfqq);
				6025	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6026	bfqg_stats_update_idle_time(bfqq_group(bfqq));
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6027
				6028	/*
				6029	* The queue is not empty, because a new request just
				6030	* arrived. Hence we can safely expire the queue, in
				6031	* case of budget timeout, without risking that the
				6032	* timestamps of the queue are not updated correctly.
				6033	* See [1] for more details.
				6034	*/
				6035	if (budget_timeout)
				6036	bfq_bfqq_expire(bfqd, bfqq, false,
				6037	BFQQE_BUDGET_TIMEOUT);
				6038	}
				6039	}
				6040
				6041	static void __bfq_insert_request(struct bfq_data bfqd, struct request rq)
				6042	{
				6043	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				6044
				6045	bfq_add_request(rq);
				6046
				6047	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
				6048	list_add_tail(&rq->queuelist, &bfqq->fifo);
				6049
				6050	bfq_rq_enqueued(bfqd, bfqq, rq);
				6051	}
				6052
				6053	static void bfq_insert_request(struct blk_mq_hw_ctx hctx, struct request rq,
				6054	bool at_head)
				6055	{
				6056	struct request_queue *q = hctx->queue;
				6057	struct bfq_data *bfqd = q->elevator->elevator_data;
				6058
				6059	spin_lock_irq(&bfqd->lock);
				6060	if (blk_mq_sched_try_insert_merge(q, rq)) {
				6061	spin_unlock_irq(&bfqd->lock);
				6062	return;
				6063	}
				6064
				6065	spin_unlock_irq(&bfqd->lock);
				6066
				6067	blk_mq_sched_request_inserted(rq);
				6068
				6069	spin_lock_irq(&bfqd->lock);
				6070	if (at_head \|\| blk_rq_is_passthrough(rq)) {
				6071	if (at_head)
				6072	list_add(&rq->queuelist, &bfqd->dispatch);
				6073	else
				6074	list_add_tail(&rq->queuelist, &bfqd->dispatch);
				6075	} else {
				6076	__bfq_insert_request(bfqd, rq);
				6077
				6078	if (rq_mergeable(rq)) {
				6079	elv_rqhash_add(q, rq);
				6080	if (!q->last_merge)
				6081	q->last_merge = rq;
				6082	}
				6083	}
				6084
				6085	spin_unlock_irq(&bfqd->lock);
				6086	}
				6087
				6088	static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
				6089	struct list_head *list, bool at_head)
				6090	{
				6091	while (!list_empty(list)) {
				6092	struct request *rq;
				6093
				6094	rq = list_first_entry(list, struct request, queuelist);
				6095	list_del_init(&rq->queuelist);
				6096	bfq_insert_request(hctx, rq, at_head);
				6097	}
				6098	}
				6099
				6100	static void bfq_update_hw_tag(struct bfq_data *bfqd)
				6101	{
				6102	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
				6103	bfqd->rq_in_driver);
				6104
				6105	if (bfqd->hw_tag == 1)
				6106	return;
				6107
				6108	/*
				6109	* This sample is valid if the number of outstanding requests
				6110	* is large enough to allow a queueing behavior. Note that the
				6111	* sum is not exact, as it's not taking into account deactivated
				6112	* requests.
				6113	*/
				6114	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
				6115	return;
				6116
				6117	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
				6118	return;
				6119
				6120	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
				6121	bfqd->max_rq_in_driver = 0;
				6122	bfqd->hw_tag_samples = 0;
				6123	}
				6124
				6125	static void bfq_completed_request(struct bfq_queue bfqq, struct bfq_data bfqd)
				6126	{
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6127	u64 now_ns;
				6128	u32 delta_us;
				6129
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6130	bfq_update_hw_tag(bfqd);
				6131
				6132	bfqd->rq_in_driver--;
				6133	bfqq->dispatched--;
				6134
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6135	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
				6136	/*
				6137	* Set budget_timeout (which we overload to store the
				6138	* time at which the queue remains with no backlog and
				6139	* no outstanding request; used by the weight-raising
				6140	* mechanism).
				6141	*/
				6142	bfqq->budget_timeout = jiffies;
				6143	}
				6144
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6145	now_ns = ktime_get_ns();
				6146
				6147	bfqq->ttime.last_end_request = now_ns;
				6148
				6149	/*
				6150	* Using us instead of ns, to get a reasonable precision in
				6151	* computing rate in next check.
				6152	*/
				6153	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
				6154
				6155	/*
				6156	* If the request took rather long to complete, and, according
				6157	* to the maximum request size recorded, this completion latency
				6158	* implies that the request was certainly served at a very low
				6159	* rate (less than 1M sectors/sec), then the whole observation
				6160	* interval that lasts up to this time instant cannot be a
				6161	* valid time interval for computing a new peak rate. Invoke
				6162	* bfq_update_rate_reset to have the following three steps
				6163	* taken:
				6164	* - close the observation interval at the last (previous)
				6165	* request dispatch or completion
				6166	* - compute rate, if possible, for that observation interval
				6167	* - reset to zero samples, which will trigger a proper
				6168	* re-initialization of the observation interval on next
				6169	* dispatch
				6170	*/
				6171	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
				6172	(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
				6173	1UL<<(BFQ_RATE_SHIFT - 10))
				6174	bfq_update_rate_reset(bfqd, NULL);
				6175	bfqd->last_completion = now_ns;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6176
				6177	/*
				6178	* If this is the in-service queue, check if it needs to be expired,
				6179	* or if we want to idle in case it has no pending requests.
				6180	*/
				6181	if (bfqd->in_service_queue == bfqq) {
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6182	if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6183	bfq_arm_slice_timer(bfqd);
				6184	return;
				6185	} else if (bfq_may_expire_for_budg_timeout(bfqq))
				6186	bfq_bfqq_expire(bfqd, bfqq, false,
				6187	BFQQE_BUDGET_TIMEOUT);
				6188	else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
				6189	(bfqq->dispatched == 0 \|\|
				6190	!bfq_bfqq_may_idle(bfqq)))
				6191	bfq_bfqq_expire(bfqd, bfqq, false,
				6192	BFQQE_NO_MORE_REQUESTS);
				6193	}
				6194	}
				6195
				6196	static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
				6197	{
				6198	bfqq->allocated--;
				6199
				6200	bfq_put_queue(bfqq);
				6201	}
				6202
				6203	static void bfq_put_rq_private(struct request_queue q, struct request rq)
				6204	{
				6205	struct bfq_queue *bfqq = RQ_BFQQ(rq);
				6206	struct bfq_data *bfqd = bfqq->bfqd;
				6207
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6208	if (rq->rq_flags & RQF_STARTED)
				6209	bfqg_stats_update_completion(bfqq_group(bfqq),
				6210	rq_start_time_ns(rq),
				6211	rq_io_start_time_ns(rq),
				6212	rq->cmd_flags);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6213
				6214	if (likely(rq->rq_flags & RQF_STARTED)) {
				6215	unsigned long flags;
				6216
				6217	spin_lock_irqsave(&bfqd->lock, flags);
				6218
				6219	bfq_completed_request(bfqq, bfqd);
				6220	bfq_put_rq_priv_body(bfqq);
				6221
				6222	spin_unlock_irqrestore(&bfqd->lock, flags);
				6223	} else {
				6224	/*
				6225	* Request rq may be still/already in the scheduler,
				6226	* in which case we need to remove it. And we cannot
				6227	* defer such a check and removal, to avoid
				6228	* inconsistencies in the time interval from the end
				6229	* of this function to the start of the deferred work.
				6230	* This situation seems to occur only in process
				6231	* context, as a consequence of a merge. In the
				6232	* current version of the code, this implies that the
				6233	* lock is held.
				6234	*/
				6235
				6236	if (!RB_EMPTY_NODE(&rq->rb_node))
				6237	bfq_remove_request(q, rq);
				6238	bfq_put_rq_priv_body(bfqq);
				6239	}
				6240
				6241	rq->elv.priv[0] = NULL;
				6242	rq->elv.priv[1] = NULL;
				6243	}
				6244
				6245	/*
				6246	* Allocate bfq data structures associated with this request.
				6247	*/
				6248	static int bfq_get_rq_private(struct request_queue q, struct request rq,
				6249	struct bio *bio)
				6250	{
				6251	struct bfq_data *bfqd = q->elevator->elevator_data;
				6252	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
				6253	const int is_sync = rq_is_sync(rq);
				6254	struct bfq_queue *bfqq;
				6255
				6256	spin_lock_irq(&bfqd->lock);
				6257
				6258	bfq_check_ioprio_change(bic, bio);
				6259
				6260	if (!bic)
				6261	goto queue_fail;
				6262
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6263	bfq_bic_update_cgroup(bic, bio);
				6264
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6265	bfqq = bic_to_bfqq(bic, is_sync);
				6266	if (!bfqq \|\| bfqq == &bfqd->oom_bfqq) {
				6267	if (bfqq)
				6268	bfq_put_queue(bfqq);
				6269	bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
				6270	bic_set_bfqq(bic, bfqq, is_sync);
				6271	}
				6272
				6273	bfqq->allocated++;
				6274	bfqq->ref++;
				6275	bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
				6276	rq, bfqq, bfqq->ref);
				6277
				6278	rq->elv.priv[0] = bic;
				6279	rq->elv.priv[1] = bfqq;
				6280
				6281	spin_unlock_irq(&bfqd->lock);
				6282
				6283	return 0;
				6284
				6285	queue_fail:
				6286	spin_unlock_irq(&bfqd->lock);
				6287
				6288	return 1;
				6289	}
				6290
				6291	static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
				6292	{
				6293	struct bfq_data *bfqd = bfqq->bfqd;
				6294	enum bfqq_expiration reason;
				6295	unsigned long flags;
				6296
				6297	spin_lock_irqsave(&bfqd->lock, flags);
				6298	bfq_clear_bfqq_wait_request(bfqq);
				6299
				6300	if (bfqq != bfqd->in_service_queue) {
				6301	spin_unlock_irqrestore(&bfqd->lock, flags);
				6302	return;
				6303	}
				6304
				6305	if (bfq_bfqq_budget_timeout(bfqq))
				6306	/*
				6307	* Also here the queue can be safely expired
				6308	* for budget timeout without wasting
				6309	* guarantees
				6310	*/
				6311	reason = BFQQE_BUDGET_TIMEOUT;
				6312	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
				6313	/*
				6314	* The queue may not be empty upon timer expiration,
				6315	* because we may not disable the timer when the
				6316	* first request of the in-service queue arrives
				6317	* during disk idling.
				6318	*/
				6319	reason = BFQQE_TOO_IDLE;
				6320	else
				6321	goto schedule_dispatch;
				6322
				6323	bfq_bfqq_expire(bfqd, bfqq, true, reason);
				6324
				6325	schedule_dispatch:
				6326	spin_unlock_irqrestore(&bfqd->lock, flags);
				6327	bfq_schedule_dispatch(bfqd);
				6328	}
				6329
				6330	/*
				6331	* Handler of the expiration of the timer running if the in-service queue
				6332	* is idling inside its time slice.
				6333	*/
				6334	static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
				6335	{
				6336	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
				6337	idle_slice_timer);
				6338	struct bfq_queue *bfqq = bfqd->in_service_queue;
				6339
				6340	/*
				6341	* Theoretical race here: the in-service queue can be NULL or
				6342	* different from the queue that was idling if a new request
				6343	* arrives for the current queue and there is a full dispatch
				6344	* cycle that changes the in-service queue. This can hardly
				6345	* happen, but in the worst case we just expire a queue too
				6346	* early.
				6347	*/
				6348	if (bfqq)
				6349	bfq_idle_slice_timer_body(bfqq);
				6350
				6351	return HRTIMER_NORESTART;
				6352	}
				6353
				6354	static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
				6355	struct bfq_queue **bfqq_ptr)
				6356	{
				6357	struct bfq_queue bfqq = bfqq_ptr;
				6358
				6359	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
				6360	if (bfqq) {
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6361	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
				6362
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6363	bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
				6364	bfqq, bfqq->ref);
				6365	bfq_put_queue(bfqq);
				6366	*bfqq_ptr = NULL;
				6367	}
				6368	}
				6369
				6370	/*
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6371	* Release all the bfqg references to its async queues. If we are
				6372	* deallocating the group these queues may still contain requests, so
				6373	* we reparent them to the root cgroup (i.e., the only one that will
				6374	* exist for sure until all the requests on a device are gone).
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6375	*/
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6376	static void bfq_put_async_queues(struct bfq_data bfqd, struct bfq_group bfqg)
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6377	{
				6378	int i, j;
				6379
				6380	for (i = 0; i < 2; i++)
				6381	for (j = 0; j < IOPRIO_BE_NR; j++)
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6382	__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6383
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6384	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6385	}
				6386
				6387	static void bfq_exit_queue(struct elevator_queue *e)
				6388	{
				6389	struct bfq_data *bfqd = e->elevator_data;
				6390	struct bfq_queue bfqq, n;
				6391
				6392	hrtimer_cancel(&bfqd->idle_slice_timer);
				6393
				6394	spin_lock_irq(&bfqd->lock);
				6395	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6396	bfq_deactivate_bfqq(bfqd, bfqq, false, false);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6397	spin_unlock_irq(&bfqd->lock);
				6398
				6399	hrtimer_cancel(&bfqd->idle_slice_timer);
				6400
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6401	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6402	blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
				6403	#else
				6404	spin_lock_irq(&bfqd->lock);
				6405	bfq_put_async_queues(bfqd, bfqd->root_group);
				6406	kfree(bfqd->root_group);
				6407	spin_unlock_irq(&bfqd->lock);
				6408	#endif
				6409
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6410	kfree(bfqd);
				6411	}
				6412
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6413	static void bfq_init_root_group(struct bfq_group *root_group,
				6414	struct bfq_data *bfqd)
				6415	{
				6416	int i;
				6417
				6418	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6419	root_group->entity.parent = NULL;
				6420	root_group->my_entity = NULL;
				6421	root_group->bfqd = bfqd;
				6422	#endif
				6423	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
				6424	root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
				6425	root_group->sched_data.bfq_class_idle_last_service = jiffies;
				6426	}
				6427
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6428	static int bfq_init_queue(struct request_queue q, struct elevator_type e)
				6429	{
				6430	struct bfq_data *bfqd;
				6431	struct elevator_queue *eq;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6432
				6433	eq = elevator_alloc(q, e);
				6434	if (!eq)
				6435	return -ENOMEM;
				6436
				6437	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
				6438	if (!bfqd) {
				6439	kobject_put(&eq->kobj);
				6440	return -ENOMEM;
				6441	}
				6442	eq->elevator_data = bfqd;
				6443
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6444	spin_lock_irq(q->queue_lock);
				6445	q->elevator = eq;
				6446	spin_unlock_irq(q->queue_lock);
				6447
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6448	/*
				6449	* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
				6450	* Grab a permanent reference to it, so that the normal code flow
				6451	* will not attempt to free it.
				6452	*/
				6453	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
				6454	bfqd->oom_bfqq.ref++;
				6455	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
				6456	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
				6457	bfqd->oom_bfqq.entity.new_weight =
				6458	bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
				6459	/*
				6460	* Trigger weight initialization, according to ioprio, at the
				6461	* oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
				6462	* class won't be changed any more.
				6463	*/
				6464	bfqd->oom_bfqq.entity.prio_changed = 1;
				6465
				6466	bfqd->queue = q;
				6467
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6468	INIT_LIST_HEAD(&bfqd->dispatch);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6469
				6470	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
				6471	HRTIMER_MODE_REL);
				6472	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
				6473
				6474	INIT_LIST_HEAD(&bfqd->active_list);
				6475	INIT_LIST_HEAD(&bfqd->idle_list);
				6476
				6477	bfqd->hw_tag = -1;
				6478
				6479	bfqd->bfq_max_budget = bfq_default_max_budget;
				6480
				6481	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
				6482	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
				6483	bfqd->bfq_back_max = bfq_back_max;
				6484	bfqd->bfq_back_penalty = bfq_back_penalty;
				6485	bfqd->bfq_slice_idle = bfq_slice_idle;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6486	bfqd->bfq_timeout = bfq_timeout;
				6487
				6488	bfqd->bfq_requests_within_timer = 120;
				6489
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6490	bfqd->low_latency = true;
				6491
				6492	/*
				6493	* Trade-off between responsiveness and fairness.
				6494	*/
				6495	bfqd->bfq_wr_coeff = 30;
				6496	bfqd->bfq_wr_max_time = 0;
				6497	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
				6498	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
				6499
				6500	/*
				6501	* Begin by assuming, optimistically, that the device is a
				6502	* high-speed one, and that its peak rate is equal to 2/3 of
				6503	* the highest reference rate.
				6504	*/
				6505	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
				6506	T_fast[blk_queue_nonrot(bfqd->queue)];
				6507	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
				6508	bfqd->device_speed = BFQ_BFQD_FAST;
				6509
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6510	spin_lock_init(&bfqd->lock);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6511
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6512	/*
				6513	* The invocation of the next bfq_create_group_hierarchy
				6514	* function is the head of a chain of function calls
				6515	* (bfq_create_group_hierarchy->blkcg_activate_policy->
				6516	* blk_mq_freeze_queue) that may lead to the invocation of the
				6517	* has_work hook function. For this reason,
				6518	* bfq_create_group_hierarchy is invoked only after all
				6519	* scheduler data has been initialized, apart from the fields
				6520	* that can be initialized only after invoking
				6521	* bfq_create_group_hierarchy. This, in particular, enables
				6522	* has_work to correctly return false. Of course, to avoid
				6523	* other inconsistencies, the blk-mq stack must then refrain
				6524	* from invoking further scheduler hooks before this init
				6525	* function is finished.
				6526	*/
				6527	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
				6528	if (!bfqd->root_group)
				6529	goto out_free;
				6530	bfq_init_root_group(bfqd->root_group, bfqd);
				6531	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
				6532
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6533
				6534	return 0;
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6535
				6536	out_free:
				6537	kfree(bfqd);
				6538	kobject_put(&eq->kobj);
				6539	return -ENOMEM;
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6540	}
				6541
				6542	static void bfq_slab_kill(void)
				6543	{
				6544	kmem_cache_destroy(bfq_pool);
				6545	}
				6546
				6547	static int __init bfq_slab_setup(void)
				6548	{
				6549	bfq_pool = KMEM_CACHE(bfq_queue, 0);
				6550	if (!bfq_pool)
				6551	return -ENOMEM;
				6552	return 0;
				6553	}
				6554
				6555	static ssize_t bfq_var_show(unsigned int var, char *page)
				6556	{
				6557	return sprintf(page, "%u\n", var);
				6558	}
				6559
				6560	static ssize_t bfq_var_store(unsigned long var, const char page,
				6561	size_t count)
				6562	{
				6563	unsigned long new_val;
				6564	int ret = kstrtoul(page, 10, &new_val);
				6565
				6566	if (ret == 0)
				6567	*var = new_val;
				6568
				6569	return count;
				6570	}
				6571
				6572	#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
				6573	static ssize_t __FUNC(struct elevator_queue e, char page) \
				6574	{ \
				6575	struct bfq_data *bfqd = e->elevator_data; \
				6576	u64 __data = __VAR; \
				6577	if (__CONV == 1) \
				6578	__data = jiffies_to_msecs(__data); \
				6579	else if (__CONV == 2) \
				6580	__data = div_u64(__data, NSEC_PER_MSEC); \
				6581	return bfq_var_show(__data, (page)); \
				6582	}
				6583	SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
				6584	SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
				6585	SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
				6586	SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
				6587	SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
				6588	SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
				6589	SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
				6590	SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6591	SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6592	#undef SHOW_FUNCTION
				6593
				6594	#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \
				6595	static ssize_t __FUNC(struct elevator_queue e, char page) \
				6596	{ \
				6597	struct bfq_data *bfqd = e->elevator_data; \
				6598	u64 __data = __VAR; \
				6599	__data = div_u64(__data, NSEC_PER_USEC); \
				6600	return bfq_var_show(__data, (page)); \
				6601	}
				6602	USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
				6603	#undef USEC_SHOW_FUNCTION
				6604
				6605	#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
				6606	static ssize_t \
				6607	__FUNC(struct elevator_queue e, const char page, size_t count) \
				6608	{ \
				6609	struct bfq_data *bfqd = e->elevator_data; \
				6610	unsigned long uninitialized_var(__data); \
				6611	int ret = bfq_var_store(&__data, (page), count); \
				6612	if (__data < (MIN)) \
				6613	__data = (MIN); \
				6614	else if (__data > (MAX)) \
				6615	__data = (MAX); \
				6616	if (__CONV == 1) \
				6617	*(__PTR) = msecs_to_jiffies(__data); \
				6618	else if (__CONV == 2) \
				6619	(__PTR) = (u64)__data NSEC_PER_MSEC; \
				6620	else \
				6621	*(__PTR) = __data; \
				6622	return ret; \
				6623	}
				6624	STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
				6625	INT_MAX, 2);
				6626	STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
				6627	INT_MAX, 2);
				6628	STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
				6629	STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
				6630	INT_MAX, 0);
				6631	STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
				6632	#undef STORE_FUNCTION
				6633
				6634	#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \
				6635	static ssize_t __FUNC(struct elevator_queue e, const char page, size_t count)\
				6636	{ \
				6637	struct bfq_data *bfqd = e->elevator_data; \
				6638	unsigned long uninitialized_var(__data); \
				6639	int ret = bfq_var_store(&__data, (page), count); \
				6640	if (__data < (MIN)) \
				6641	__data = (MIN); \
				6642	else if (__data > (MAX)) \
				6643	__data = (MAX); \
				6644	(__PTR) = (u64)__data NSEC_PER_USEC; \
				6645	return ret; \
				6646	}
				6647	USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
				6648	UINT_MAX);
				6649	#undef USEC_STORE_FUNCTION
				6650
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6651	static ssize_t bfq_max_budget_store(struct elevator_queue *e,
				6652	const char *page, size_t count)
				6653	{
				6654	struct bfq_data *bfqd = e->elevator_data;
				6655	unsigned long uninitialized_var(__data);
				6656	int ret = bfq_var_store(&__data, (page), count);
				6657
				6658	if (__data == 0)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6659	bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6660	else {
				6661	if (__data > INT_MAX)
				6662	__data = INT_MAX;
				6663	bfqd->bfq_max_budget = __data;
				6664	}
				6665
				6666	bfqd->bfq_user_max_budget = __data;
				6667
				6668	return ret;
				6669	}
				6670
				6671	/*
				6672	* Leaving this name to preserve name compatibility with cfq
				6673	* parameters, but this timeout is used for both sync and async.
				6674	*/
				6675	static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
				6676	const char *page, size_t count)
				6677	{
				6678	struct bfq_data *bfqd = e->elevator_data;
				6679	unsigned long uninitialized_var(__data);
				6680	int ret = bfq_var_store(&__data, (page), count);
				6681
				6682	if (__data < 1)
				6683	__data = 1;
				6684	else if (__data > INT_MAX)
				6685	__data = INT_MAX;
				6686
				6687	bfqd->bfq_timeout = msecs_to_jiffies(__data);
				6688	if (bfqd->bfq_user_max_budget == 0)
Paolo Valente	ab0e43e	2017-04-12 18:23:10 +0200	[diff] [blame]	6689	bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6690
				6691	return ret;
				6692	}
				6693
				6694	static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
				6695	const char *page, size_t count)
				6696	{
				6697	struct bfq_data *bfqd = e->elevator_data;
				6698	unsigned long uninitialized_var(__data);
				6699	int ret = bfq_var_store(&__data, (page), count);
				6700
				6701	if (__data > 1)
				6702	__data = 1;
				6703	if (!bfqd->strict_guarantees && __data == 1
				6704	&& bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
				6705	bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
				6706
				6707	bfqd->strict_guarantees = __data;
				6708
				6709	return ret;
				6710	}
				6711
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6712	static ssize_t bfq_low_latency_store(struct elevator_queue *e,
				6713	const char *page, size_t count)
				6714	{
				6715	struct bfq_data *bfqd = e->elevator_data;
				6716	unsigned long uninitialized_var(__data);
				6717	int ret = bfq_var_store(&__data, (page), count);
				6718
				6719	if (__data > 1)
				6720	__data = 1;
				6721	if (__data == 0 && bfqd->low_latency != 0)
				6722	bfq_end_wr(bfqd);
				6723	bfqd->low_latency = __data;
				6724
				6725	return ret;
				6726	}
				6727
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6728	#define BFQ_ATTR(name) \
				6729	__ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
				6730
				6731	static struct elv_fs_entry bfq_attrs[] = {
				6732	BFQ_ATTR(fifo_expire_sync),
				6733	BFQ_ATTR(fifo_expire_async),
				6734	BFQ_ATTR(back_seek_max),
				6735	BFQ_ATTR(back_seek_penalty),
				6736	BFQ_ATTR(slice_idle),
				6737	BFQ_ATTR(slice_idle_us),
				6738	BFQ_ATTR(max_budget),
				6739	BFQ_ATTR(timeout_sync),
				6740	BFQ_ATTR(strict_guarantees),
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6741	BFQ_ATTR(low_latency),
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6742	__ATTR_NULL
				6743	};
				6744
				6745	static struct elevator_type iosched_bfq_mq = {
				6746	.ops.mq = {
				6747	.get_rq_priv = bfq_get_rq_private,
				6748	.put_rq_priv = bfq_put_rq_private,
				6749	.exit_icq = bfq_exit_icq,
				6750	.insert_requests = bfq_insert_requests,
				6751	.dispatch_request = bfq_dispatch_request,
				6752	.next_request = elv_rb_latter_request,
				6753	.former_request = elv_rb_former_request,
				6754	.allow_merge = bfq_allow_bio_merge,
				6755	.bio_merge = bfq_bio_merge,
				6756	.request_merge = bfq_request_merge,
				6757	.requests_merged = bfq_requests_merged,
				6758	.request_merged = bfq_request_merged,
				6759	.has_work = bfq_has_work,
				6760	.init_sched = bfq_init_queue,
				6761	.exit_sched = bfq_exit_queue,
				6762	},
				6763
				6764	.uses_mq = true,
				6765	.icq_size = sizeof(struct bfq_io_cq),
				6766	.icq_align = __alignof__(struct bfq_io_cq),
				6767	.elevator_attrs = bfq_attrs,
				6768	.elevator_name = "bfq",
				6769	.elevator_owner = THIS_MODULE,
				6770	};
				6771
#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* blkcg policy descriptor for BFQ; built only with cgroups support. */
static struct blkcg_policy blkcg_policy_bfq = {
	/* control files */
	.dfl_cftypes		= bfq_blkg_files,
	.legacy_cftypes		= bfq_blkcg_legacy_files,

	/* policy-data (pd_*) callbacks */
	.pd_alloc_fn		= bfq_pd_alloc,
	.pd_init_fn		= bfq_pd_init,
	.pd_offline_fn		= bfq_pd_offline,
	.pd_free_fn		= bfq_pd_free,
	.pd_reset_stats_fn	= bfq_pd_reset_stats,

	/* cpd_* callbacks; init and bind share bfq_cpd_init */
	.cpd_alloc_fn		= bfq_cpd_alloc,
	.cpd_init_fn		= bfq_cpd_init,
	.cpd_bind_fn		= bfq_cpd_init,
	.cpd_free_fn		= bfq_cpd_free,
};
#endif
				6789
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6790	static int __init bfq_init(void)
				6791	{
				6792	int ret;
				6793
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6794	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6795	ret = blkcg_policy_register(&blkcg_policy_bfq);
				6796	if (ret)
				6797	return ret;
				6798	#endif
				6799
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6800	ret = -ENOMEM;
				6801	if (bfq_slab_setup())
				6802	goto err_pol_unreg;
				6803
Paolo Valente	44e44a1	2017-04-12 18:23:12 +0200	[diff] [blame^]	6804	/*
				6805	* Times to load large popular applications for the typical
				6806	* systems installed on the reference devices (see the
				6807	* comments before the definitions of the next two
				6808	* arrays). Actually, we use slightly slower values, as the
				6809	* estimated peak rate tends to be smaller than the actual
				6810	* peak rate. The reason for this last fact is that estimates
				6811	* are computed over much shorter time intervals than the long
				6812	* intervals typically used for benchmarking. Why? First, to
				6813	* adapt more quickly to variations. Second, because an I/O
				6814	* scheduler cannot rely on a peak-rate-evaluation workload to
				6815	* be run for a long time.
				6816	*/
				6817	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
				6818	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
				6819	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
				6820	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
				6821
				6822	/*
				6823	* Thresholds that determine the switch between speed classes
				6824	* (see the comments before the definition of the array
				6825	* device_speed_thresh). These thresholds are biased towards
				6826	* transitions to the fast class. This is safer than the
				6827	* opposite bias. In fact, a wrong transition to the slow
				6828	* class results in short weight-raising periods, because the
				6829	* speed of the device then tends to be higher that the
				6830	* reference peak rate. On the opposite end, a wrong
				6831	* transition to the fast class tends to increase
				6832	* weight-raising periods, because of the opposite reason.
				6833	*/
				6834	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
				6835	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
				6836
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6837	ret = elv_register(&iosched_bfq_mq);
				6838	if (ret)
				6839	goto err_pol_unreg;
				6840
				6841	return 0;
				6842
				6843	err_pol_unreg:
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6844	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6845	blkcg_policy_unregister(&blkcg_policy_bfq);
				6846	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6847	return ret;
				6848	}
				6849
				6850	static void __exit bfq_exit(void)
				6851	{
				6852	elv_unregister(&iosched_bfq_mq);
Arianna Avanzini	e21b7a0	2017-04-12 18:23:08 +0200	[diff] [blame]	6853	#ifdef CONFIG_BFQ_GROUP_IOSCHED
				6854	blkcg_policy_unregister(&blkcg_policy_bfq);
				6855	#endif
Paolo Valente	aee69d7	2017-04-19 08:29:02 -0600	[diff] [blame]	6856	bfq_slab_kill();
				6857	}
				6858
				6859	module_init(bfq_init);
				6860	module_exit(bfq_exit);
				6861
				6862	MODULE_AUTHOR("Paolo Valente");
				6863	MODULE_LICENSE("GPL");
				6864	MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");