Revert "block/mq-deadline: Prioritize high-priority requests" This reverts commit fb926032b3209300f9dc454a36b8299582ae545c. Zhen reports that this commit slows down mq-deadline on a 128 thread box, going from 258K IOPS to 170-180K. My testing shows that Optane gen2 IOPS goes from 2.3M IOPS to 1.2M IOPS on a 64 thread box. Looking in detail at the code, the main culprit here is needing to sum percpu counters in the dispatch hot path, leading to very high CPU utilization there. To make matters worse, the code currently needs to sum 2 percpu counters, and it does so in the most naive way of iterating possible CPUs _twice_. Since we're close to release, revert this commit and we can re-do it with regular per-priority counters instead for the 5.15 kernel. Link: https://lore.kernel.org/linux-block/20210826144039.2143-1-thunder.leizhen@huawei.com/ Reported-by: Zhen Lei <thunder.leizhen@huawei.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>

commit: 7b05bf771084ff788243b78f51bc2c820730951c [log] [tgz]
author: Jens Axboe <axboe@kernel.dk> Thu Aug 26 12:59:44 2021 -0600
committer: Jens Axboe <axboe@kernel.dk> Thu Aug 26 12:59:44 2021 -0600
tree: 7220ad3f581eaa5fbad9ad0d86cd23dc2c5878c4
parent: b6d2b054e8baaee53fd2d4854c63cbf0f2c6262a [diff] [blame]
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 18dc8ef..3692067 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c

@@ -31,11 +31,6 @@
  */
 static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
 static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-/*
- * Time after which to dispatch lower priority requests even if higher
- * priority requests are pending.
- */
-static const int aging_expire = 10 * HZ;
 static const int writes_starved = 2;    /* max times reads can starve a write */
 static const int fifo_batch = 16;       /* # of sequential requests treated as one
 				     by the above parameters. For throughput. */
@@ -103,7 +98,6 @@ struct deadline_data {
 	int writes_starved;
 	int front_merges;
 	u32 async_depth;
-	int aging_expire;
 
 	spinlock_t lock;
 	spinlock_t zone_lock;
@@ -369,11 +363,10 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
 
 /*
  * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc and with a start time <= @latest.
+ * read/write expire, fifo_batch, etc
  */
 static struct request *__dd_dispatch_request(struct deadline_data *dd,
-					     struct dd_per_prio *per_prio,
-					     u64 latest_start_ns)
+					     struct dd_per_prio *per_prio)
 {
 	struct request *rq, *next_rq;
 	enum dd_data_dir data_dir;
@@ -385,8 +378,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	if (!list_empty(&per_prio->dispatch)) {
 		rq = list_first_entry(&per_prio->dispatch, struct request,
 				      queuelist);
-		if (rq->start_time_ns > latest_start_ns)
-			return NULL;
 		list_del_init(&rq->queuelist);
 		goto done;
 	}
@@ -464,8 +455,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 	dd->batching = 0;
 
 dispatch_request:
-	if (rq->start_time_ns > latest_start_ns)
-		return NULL;
 	/*
 	 * rq is the selected appropriate request.
 	 */
@@ -494,32 +483,15 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
 	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-	const u64 now_ns = ktime_get_ns();
-	struct request *rq = NULL;
+	struct request *rq;
 	enum dd_prio prio;
 
 	spin_lock(&dd->lock);
-	/*
-	 * Start with dispatching requests whose deadline expired more than
-	 * aging_expire jiffies ago.
-	 */
-	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
-		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns -
-					   jiffies_to_nsecs(dd->aging_expire));
-		if (rq)
-			goto unlock;
-	}
-	/*
-	 * Next, dispatch requests in priority order. Ignore lower priority
-	 * requests if any higher priority requests are pending.
-	 */
 	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
-		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns);
-		if (rq || dd_queued(dd, prio))
+		rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
+		if (rq)
 			break;
 	}
-
-unlock:
 	spin_unlock(&dd->lock);
 
 	return rq;
@@ -620,7 +592,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
 	dd->front_merges = 1;
 	dd->last_dir = DD_WRITE;
 	dd->fifo_batch = fifo_batch;
-	dd->aging_expire = aging_expire;
 	spin_lock_init(&dd->lock);
 	spin_lock_init(&dd->zone_lock);
 
@@ -842,7 +813,6 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
 SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
 SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
-SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire);
 SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
 SHOW_INT(deadline_front_merges_show, dd->front_merges);
 SHOW_INT(deadline_async_depth_show, dd->front_merges);
@@ -872,7 +842,6 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
 	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
 STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
 STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
-STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX);
 STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
 STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
 STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX);
@@ -891,7 +860,6 @@ static struct elv_fs_entry deadline_attrs[] = {
 	DD_ATTR(front_merges),
 	DD_ATTR(async_depth),
 	DD_ATTR(fifo_batch),
-	DD_ATTR(aging_expire),
 	__ATTR_NULL
 };
commit	7b05bf771084ff788243b78f51bc2c820730951c	[log] [tgz]
author	Jens Axboe <axboe@kernel.dk>	Thu Aug 26 12:59:44 2021 -0600
committer	Jens Axboe <axboe@kernel.dk>	Thu Aug 26 12:59:44 2021 -0600
tree	7220ad3f581eaa5fbad9ad0d86cd23dc2c5878c4
parent	b6d2b054e8baaee53fd2d4854c63cbf0f2c6262a [diff] [blame]