sched/core: Fix task and run queue sched_info::run_delay inconsistencies
Mike Meyer reported the following bug:
> During evaluation of some performance data, it was discovered thread
> and run queue run_delay accounting data was inconsistent with the other
> accounting data that was collected. Further investigation found under
> certain circumstances execution time was leaking into the task and
> run queue accounting of run_delay.
>
> Consider the following sequence:
>
> a. thread is running.
> b. thread moves between cgroups, changes scheduling class or priority.
> c. thread sleeps OR
> d. thread involuntarily gives up cpu.
>
> a. implies:
>
> thread->sched_info.last_queued = 0
>
> a. and b. result in the following:
>
> 1. dequeue_task(rq, thread)
>
> sched_info_dequeued(rq, thread)
> delta = 0
>
> sched_info_reset_dequeued(thread)
> thread->sched_info.last_queued = 0
>
> thread->sched_info.run_delay += delta
>
> 2. enqueue_task(rq, thread)
>
> sched_info_queued(rq, thread)
>
> /* thread is still on cpu at this point. */
> thread->sched_info.last_queued = task_rq(thread)->clock;
>
> c. results in:
>
> dequeue_task(rq, thread)
>
> sched_info_dequeued(rq, thread)
>
> /* delta is execution time not run_delay. */
> delta = task_rq(thread)->clock - thread->sched_info.last_queued
>
> sched_info_reset_dequeued(thread)
> thread->sched_info.last_queued = 0
>
> thread->sched_info.run_delay += delta
>
> Since thread was running between enqueue_task(rq, thread) and
> dequeue_task(rq, thread), the delta above is really execution
> time and not run_delay.
>
> d. results in:
>
> __sched_info_switch(thread, next_thread)
>
> sched_info_depart(rq, thread)
>
> sched_info_queued(rq, thread)
>
> /* last_queued not updated due to being non-zero */
> return
>
> Since thread was running between enqueue_task(rq, thread) and
> __sched_info_switch(thread, next_thread), the execution time
> between enqueue_task(rq, thread) and
> __sched_info_switch(thread, next_thread) now will become
> associated with run_delay due to when last_queued was last updated.
>
This alternative patch solves the problem by not calling
sched_info_{de,}queued() in {de,en}queue_task(). Therefore the
sched_info state is preserved and things work as expected.
By inlining the {de,en}queue_task() functions the new condition
becomes (mostly) a compile-time constant and we'll not emit any new
branch instructions.
It even shrinks the code (due to inlining {en,de}queue_task()):
$ size defconfig-build/kernel/sched/core.o defconfig-build/kernel/sched/core.o.orig
text data bss dec hex filename
64019 23378 2344 89741 15e8d defconfig-build/kernel/sched/core.o
64149 23378 2344 89871 15f0f defconfig-build/kernel/sched/core.o.orig
Reported-by: Mike Meyer <Mike.Meyer@Teradata.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150930154413.GO3604@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4554cde..fb14a01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -827,17 +827,19 @@
load->inv_weight = prio_to_wmult[prio];
}
-static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_queued(rq, p);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
}
-static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
update_rq_clock(rq);
- sched_info_dequeued(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(rq, p);
p->sched_class->dequeue_task(rq, p, flags);
}
@@ -1178,7 +1180,7 @@
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
}
if (running)
put_prev_task(rq, p);
@@ -1188,7 +1190,7 @@
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
}
/*
@@ -1692,7 +1694,7 @@
#endif /* CONFIG_SCHEDSTATS */
}
-static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
activate_task(rq, p, en_flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3325,7 +3327,7 @@
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- int oldprio, queued, running, enqueue_flag = 0;
+ int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3357,7 +3359,7 @@
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -3375,7 +3377,7 @@
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- enqueue_flag = ENQUEUE_REPLENISH;
+ enqueue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3383,7 +3385,7 @@
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
- enqueue_flag = ENQUEUE_HEAD;
+ enqueue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3435,7 +3437,7 @@
}
queued = task_on_rq_queued(p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
@@ -3444,7 +3446,7 @@
delta = p->prio - old_prio;
if (queued) {
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
@@ -3946,7 +3948,7 @@
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -3956,11 +3958,15 @@
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
+ int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
- enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ if (oldprio <= p->prio)
+ enqueue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, enqueue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -5109,7 +5115,7 @@
running = task_current(rq, p);
if (queued)
- dequeue_task(rq, p, 0);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
if (running)
put_prev_task(rq, p);
@@ -5118,7 +5124,7 @@
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, p, 0);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
task_rq_unlock(rq, p, &flags);
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -7737,7 +7743,7 @@
queued = task_on_rq_queued(tsk);
if (queued)
- dequeue_task(rq, tsk, 0);
+ dequeue_task(rq, tsk, DEQUEUE_SAVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7761,7 +7767,7 @@
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
- enqueue_task(rq, tsk, 0);
+ enqueue_task(rq, tsk, ENQUEUE_RESTORE);
task_rq_unlock(rq, tsk, &flags);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 046242f..e08cc4c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1151,16 +1151,18 @@
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-#define ENQUEUE_WAKEUP 1
-#define ENQUEUE_HEAD 2
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_HEAD 0x02
#ifdef CONFIG_SMP
-#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
#else
-#define ENQUEUE_WAKING 0
+#define ENQUEUE_WAKING 0x00
#endif
-#define ENQUEUE_REPLENISH 8
+#define ENQUEUE_REPLENISH 0x08
+#define ENQUEUE_RESTORE 0x10
-#define DEQUEUE_SLEEP 1
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL)