perfcounters: generalize the counter scheduler
Impact: clean up and refactor code
refactor the counter scheduler: separate out in/out functions and
introduce a counter-rotation function as well.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 48e1dbc..d7a79f3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,11 +111,12 @@
spin_lock(&ctx->lock);
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
- counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_ops->disable(counter);
ctx->nr_active--;
cpuctx->active_oncpu--;
counter->task = NULL;
+ counter->oncpu = -1;
}
ctx->nr_counters--;
@@ -192,8 +193,36 @@
spin_unlock_irq(&ctx->lock);
}
+static int
+counter_sched_in(struct perf_counter *counter,
+ struct perf_cpu_context *cpuctx,
+ struct perf_counter_context *ctx,
+ int cpu)
+{
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ return 0;
+
+ counter->state = PERF_COUNTER_STATE_ACTIVE;
+ counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
+ /*
+ * The new state must be visible before we turn it on in the hardware:
+ */
+ smp_wmb();
+
+ if (counter->hw_ops->enable(counter)) {
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->oncpu = -1;
+ return -EAGAIN;
+ }
+
+ cpuctx->active_oncpu++;
+ ctx->nr_active++;
+
+ return 0;
+}
+
/*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to install and enable a performance counter
*/
static void __perf_install_in_context(void *info)
{
@@ -220,22 +249,17 @@
* counters on a global level. NOP for non NMI based counters.
*/
perf_flags = hw_perf_save_disable();
- list_add_counter(counter, ctx);
- hw_perf_restore(perf_flags);
+ list_add_counter(counter, ctx);
ctx->nr_counters++;
- if (cpuctx->active_oncpu < perf_max_counters) {
- counter->state = PERF_COUNTER_STATE_ACTIVE;
- counter->oncpu = cpu;
- ctx->nr_active++;
- cpuctx->active_oncpu++;
- counter->hw_ops->enable(counter);
- }
+ counter_sched_in(counter, cpuctx, ctx, cpu);
if (!ctx->task && cpuctx->max_pertask)
cpuctx->max_pertask--;
+ hw_perf_restore(perf_flags);
+
spin_unlock(&ctx->lock);
curr_rq_unlock_irq_restore(&flags);
}
@@ -302,8 +326,8 @@
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
return;
- counter->hw_ops->disable(counter);
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->hw_ops->disable(counter);
counter->oncpu = -1;
cpuctx->active_oncpu--;
@@ -326,6 +350,22 @@
counter_sched_out(counter, cpuctx, ctx);
}
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx)
+{
+ struct perf_counter *counter;
+
+ if (likely(!ctx->nr_counters))
+ return;
+
+ spin_lock(&ctx->lock);
+ if (ctx->nr_active) {
+ list_for_each_entry(counter, &ctx->counter_list, list_entry)
+ group_sched_out(counter, cpuctx, ctx);
+ }
+ spin_unlock(&ctx->lock);
+}
+
/*
* Called from scheduler to remove the counters of the current task,
* with interrupts disabled.
@@ -341,39 +381,18 @@
{
struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
struct perf_counter_context *ctx = &task->perf_counter_ctx;
- struct perf_counter *counter;
if (likely(!cpuctx->task_ctx))
return;
- spin_lock(&ctx->lock);
- if (ctx->nr_active) {
- list_for_each_entry(counter, &ctx->counter_list, list_entry)
- group_sched_out(counter, cpuctx, ctx);
- }
- spin_unlock(&ctx->lock);
+ __perf_counter_sched_out(ctx, cpuctx);
+
cpuctx->task_ctx = NULL;
}
-static int
-counter_sched_in(struct perf_counter *counter,
- struct perf_cpu_context *cpuctx,
- struct perf_counter_context *ctx,
- int cpu)
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
{
- if (counter->state == PERF_COUNTER_STATE_OFF)
- return 0;
-
- if (counter->hw_ops->enable(counter))
- return -EAGAIN;
-
- counter->state = PERF_COUNTER_STATE_ACTIVE;
- counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
-
- cpuctx->active_oncpu++;
- ctx->nr_active++;
-
- return 0;
+ __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
}
static int
@@ -416,21 +435,10 @@
return -EAGAIN;
}
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+ struct perf_cpu_context *cpuctx, int cpu)
{
- struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
- struct perf_counter_context *ctx = &task->perf_counter_ctx;
struct perf_counter *counter;
if (likely(!ctx->nr_counters))
@@ -453,10 +461,35 @@
break;
}
spin_unlock(&ctx->lock);
+}
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
cpuctx->task_ctx = ctx;
}
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+ struct perf_counter_context *ctx = &cpuctx->ctx;
+
+ __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
int perf_counter_task_disable(void)
{
struct task_struct *curr = current;
@@ -514,6 +547,8 @@
/* force the update of the task clock: */
__task_delta_exec(curr, 1);
+ perf_counter_task_sched_out(curr, cpu);
+
spin_lock(&ctx->lock);
/*
@@ -538,19 +573,18 @@
return 0;
}
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
{
- struct perf_counter_context *ctx = &curr->perf_counter_ctx;
struct perf_counter *counter;
u64 perf_flags;
- if (likely(!ctx->nr_counters))
+ if (!ctx->nr_counters)
return;
- perf_counter_task_sched_out(curr, cpu);
-
spin_lock(&ctx->lock);
-
/*
* Rotate the first entry last (works just fine for group counters too):
*/
@@ -563,7 +597,24 @@
hw_perf_restore(perf_flags);
spin_unlock(&ctx->lock);
+}
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+ struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+ struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+ const int rotate_percpu = 0;
+
+ if (rotate_percpu)
+ perf_counter_cpu_sched_out(cpuctx);
+ perf_counter_task_sched_out(curr, cpu);
+
+ if (rotate_percpu)
+ rotate_ctx(&cpuctx->ctx);
+ rotate_ctx(ctx);
+
+ if (rotate_percpu)
+ perf_counter_cpu_sched_in(cpuctx, cpu);
perf_counter_task_sched_in(curr, cpu);
}
@@ -905,8 +956,6 @@
struct task_struct *curr = counter->task;
u64 delta;
- WARN_ON_ONCE(counter->task != current);
-
delta = __task_delta_exec(curr, update);
return curr->se.sum_exec_runtime + delta;
@@ -1160,6 +1209,7 @@
counter->group_leader = group_leader;
counter->hw_ops = NULL;
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
if (hw_event->disabled)
counter->state = PERF_COUNTER_STATE_OFF;
@@ -1331,36 +1381,50 @@
{
struct perf_counter *parent_counter;
u64 parent_val, child_val;
- unsigned long flags;
- u64 perf_flags;
/*
- * Disable and unlink this counter.
- *
- * Be careful about zapping the list - IRQ/NMI context
- * could still be processing it:
+ * If we do not self-reap then we have to wait for the
+ * child task to unschedule (it will happen for sure),
+ * so that its counter is at its final count. (This
+ * condition triggers rarely - child tasks usually get
+ * off their CPU before the parent has a chance to
+ * get this far into the reaping action)
*/
- curr_rq_lock_irq_save(&flags);
- perf_flags = hw_perf_save_disable();
-
- if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ if (child != current) {
+ wait_task_inactive(child, 0);
+ list_del_init(&child_counter->list_entry);
+ } else {
struct perf_cpu_context *cpuctx;
+ unsigned long flags;
+ u64 perf_flags;
+
+ /*
+ * Disable and unlink this counter.
+ *
+ * Be careful about zapping the list - IRQ/NMI context
+ * could still be processing it:
+ */
+ curr_rq_lock_irq_save(&flags);
+ perf_flags = hw_perf_save_disable();
cpuctx = &__get_cpu_var(perf_cpu_context);
- child_counter->hw_ops->disable(child_counter);
- child_counter->state = PERF_COUNTER_STATE_INACTIVE;
- child_counter->oncpu = -1;
+ if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+ child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+ child_counter->hw_ops->disable(child_counter);
+ cpuctx->active_oncpu--;
+ child_ctx->nr_active--;
+ child_counter->oncpu = -1;
+ }
- cpuctx->active_oncpu--;
- child_ctx->nr_active--;
+ list_del_init(&child_counter->list_entry);
+
+ child_ctx->nr_counters--;
+
+ hw_perf_restore(perf_flags);
+ curr_rq_unlock_irq_restore(&flags);
}
- list_del_init(&child_counter->list_entry);
-
- hw_perf_restore(perf_flags);
- curr_rq_unlock_irq_restore(&flags);
-
parent_counter = child_counter->parent;
/*
* It can happen that parent exits first, and has counters