sched/core: Fix task and run queue sched_info::run_delay inconsistencies Mike Meyer reported the following bug: > During evaluation of some performance data, it was discovered thread > and run queue run_delay accounting data was inconsistent with the other > accounting data that was collected. Further investigation found under > certain circumstances execution time was leaking into the task and > run queue accounting of run_delay. > > Consider the following sequence: > > a. thread is running. > b. thread moves beween cgroups, changes scheduling class or priority. > c. thread sleeps OR > d. thread involuntarily gives up cpu. > > a. implies: > > thread->sched_info.last_queued = 0 > > a. and b. results in the following: > > 1. dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > delta = 0 > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > 2. enqueue_task(rq, thread) > > sched_info_queued(rq, thread) > > /* thread is still on cpu at this point. */ > thread->sched_info.last_queued = task_rq(thread)->clock; > > c. results in: > > dequeue_task(rq, thread) > > sched_info_dequeued(rq, thread) > > /* delta is execution time not run_delay. */ > delta = task_rq(thread)->clock - thread->sched_info.last_queued > > sched_info_reset_dequeued(thread) > thread->sched_info.last_queued = 0 > > thread->sched_info.run_delay += delta > > Since thread was running between enqueue_task(rq, thread) and > dequeue_task(rq, thread), the delta above is really execution > time and not run_delay. > > d. results in: > > __sched_info_switch(thread, next_thread) > > sched_info_depart(rq, thread) > > sched_info_queued(rq, thread) > > /* last_queued not updated due to being non-zero */ > return > > Since thread was running between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread), the execution time > between enqueue_task(rq, thread) and > __sched_info_switch(thread, next_thread) now will become > associated with run_delay due to when last_queued was last updated. > This alternative patch solves the problem by not calling sched_info_{de,}queued() in {de,en}queue_task(). Therefore the sched_info state is preserved and things work as expected. By inlining the {de,en}queue_task() functions the new condition becomes (mostly) a compile-time constant and we'll not emit any new branch instructions. It even shrinks the code (due to inlining {en,de}queue_task()): $ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig text data bss dec hex filename 64019 23378 2344 89741 15e8d defconfig-build/kernel/sched/core.o 64149 23378 2344 89871 15f0f defconfig-build/kernel/sched/core.o.orig Reported-by: Mike Meyer <Mike.Meyer@Teradata.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Mike Galbraith <efault@gmx.de> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar <mingo@kernel.org>

commit: 1de64443d755f83af8ba8b558fded0c61afaef47 [log] [tgz]
author: Peter Zijlstra <peterz@infradead.org> Wed Sep 30 17:44:13 2015 +0200
committer: Ingo Molnar <mingo@kernel.org> Tue Oct 06 17:08:22 2015 +0200
tree: 4a5c5f83384367e26ffbf73fd5023bf58e2dfe59
parent: b52da86e0ad58f096710977fcda856fd84da9233 [diff]
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4554cde..fb14a01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c

@@ -827,17 +827,19 @@
 	load->inv_weight = prio_to_wmult[prio];
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_queued(rq, p);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
 	p->sched_class->enqueue_task(rq, p, flags);
 }
 
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	update_rq_clock(rq);
-	sched_info_dequeued(rq, p);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(rq, p);
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -1178,7 +1180,7 @@
 		 * holding rq->lock.
 		 */
 		lockdep_assert_held(&rq->lock);
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	}
 	if (running)
 		put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 }
 
 /*
@@ -1692,7 +1694,7 @@
 #endif /* CONFIG_SCHEDSTATS */
 }
 
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
 {
 	activate_task(rq, p, en_flags);
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3325,7 +3327,7 @@
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, queued, running, enqueue_flag = 0;
+	int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
@@ -3357,7 +3359,7 @@
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3375,7 +3377,7 @@
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
 			p->dl.dl_boosted = 1;
-			enqueue_flag = ENQUEUE_REPLENISH;
+			enqueue_flag |= ENQUEUE_REPLENISH;
 		} else
 			p->dl.dl_boosted = 0;
 		p->sched_class = &dl_sched_class;
@@ -3383,7 +3385,7 @@
 		if (dl_prio(oldprio))
 			p->dl.dl_boosted = 0;
 		if (oldprio < prio)
-			enqueue_flag = ENQUEUE_HEAD;
+			enqueue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
@@ -3435,7 +3437,7 @@
 	}
 	queued = task_on_rq_queued(p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -3444,7 +3446,7 @@
 	delta = p->prio - old_prio;
 
 	if (queued) {
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -3946,7 +3948,7 @@
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -3956,11 +3958,15 @@
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued) {
+		int enqueue_flags = ENQUEUE_RESTORE;
 		/*
 		 * We enqueue to tail when the priority of a task is
 		 * increased (user space view).
 		 */
-		enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+		if (oldprio <= p->prio)
+			enqueue_flags |= ENQUEUE_HEAD;
+
+		enqueue_task(rq, p, enqueue_flags);
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
@@ -5109,7 +5115,7 @@
 	running = task_current(rq, p);
 
 	if (queued)
-		dequeue_task(rq, p, 0);
+		dequeue_task(rq, p, DEQUEUE_SAVE);
 	if (running)
 		put_prev_task(rq, p);
 
@@ -5118,7 +5124,7 @@
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, p, 0);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
 	task_rq_unlock(rq, p, &flags);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -7737,7 +7743,7 @@
 	queued = task_on_rq_queued(tsk);
 
 	if (queued)
-		dequeue_task(rq, tsk, 0);
+		dequeue_task(rq, tsk, DEQUEUE_SAVE);
 	if (unlikely(running))
 		put_prev_task(rq, tsk);
 
@@ -7761,7 +7767,7 @@
 	if (unlikely(running))
 		tsk->sched_class->set_curr_task(rq);
 	if (queued)
-		enqueue_task(rq, tsk, 0);
+		enqueue_task(rq, tsk, ENQUEUE_RESTORE);
 
 	task_rq_unlock(rq, tsk, &flags);
 }

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 046242f..e08cc4c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h

@@ -1151,16 +1151,18 @@
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-#define ENQUEUE_WAKEUP		1
-#define ENQUEUE_HEAD		2
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_HEAD		0x02
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		4	/* sched_class::task_waking was called */
+#define ENQUEUE_WAKING		0x04	/* sched_class::task_waking was called */
 #else
-#define ENQUEUE_WAKING		0
+#define ENQUEUE_WAKING		0x00
 #endif
-#define ENQUEUE_REPLENISH	8
+#define ENQUEUE_REPLENISH	0x08
+#define ENQUEUE_RESTORE	0x10
 
-#define DEQUEUE_SLEEP		1
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02
 
 #define RETRY_TASK		((void *)-1UL)
commit	1de64443d755f83af8ba8b558fded0c61afaef47	[log] [tgz]
author	Peter Zijlstra <peterz@infradead.org>	Wed Sep 30 17:44:13 2015 +0200
committer	Ingo Molnar <mingo@kernel.org>	Tue Oct 06 17:08:22 2015 +0200
tree	4a5c5f83384367e26ffbf73fd5023bf58e2dfe59
parent	b52da86e0ad58f096710977fcda856fd84da9233 [diff]