Merge branch 'perf/urgent' into perf/core, to fix up fixes before queueing up new changes
Signed-off-by: Ingo Molnar <mingo@kernel.org>
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 3ea25c3..feb90f6 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -28,10 +28,46 @@
#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
+
+/*
+ * IBS states:
+ *
+ * ENABLED; tracks the pmu::add(), pmu::del() state, when set the counter is taken
+ * and any further add()s must fail.
+ *
+ * STARTED/STOPPING/STOPPED; deal with pmu::start(), pmu::stop() state but are
+ * complicated by the fact that the IBS hardware can send late NMIs (ie. after
+ * we've cleared the EN bit).
+ *
+ * In order to consume these late NMIs we have the STOPPED state, any NMI that
+ * happens after we've cleared the EN state will clear this bit and report the
+ * NMI handled (this is fundamentally racy in the face or multiple NMI sources,
+ * someone else can consume our BIT and our NMI will go unhandled).
+ *
+ * And since we cannot set/clear this separate bit together with the EN bit,
+ * there are races; if we cleared STARTED early, an NMI could land in
+ * between clearing STARTED and clearing the EN bit (in fact multiple NMIs
+ * could happen if the period is small enough), and consume our STOPPED bit
+ * and trigger streams of unhandled NMIs.
+ *
+ * If, however, we clear STARTED late, an NMI can hit between clearing the
+ * EN bit and clearing STARTED, still see STARTED set and process the event.
+ * If this event will have the VALID bit clear, we bail properly, but this
+ * is not a given. With VALID set we can end up calling pmu::stop() again
+ * (the throttle logic) and trigger the WARNs in there.
+ *
+ * So what we do is set STOPPING before clearing EN to avoid the pmu::stop()
+ * nesting, and clear STARTED late, so that we have a well defined state over
+ * the clearing of the EN bit.
+ *
+ * XXX: we could probably be using !atomic bitops for all this.
+ */
+
enum ibs_states {
IBS_ENABLED = 0,
IBS_STARTED = 1,
IBS_STOPPING = 2,
+ IBS_STOPPED = 3,
IBS_MAX_STATES,
};
@@ -377,11 +413,10 @@
perf_ibs_set_period(perf_ibs, hwc, &period);
/*
- * Set STARTED before enabling the hardware, such that
- * a subsequent NMI must observe it. Then clear STOPPING
- * such that we don't consume NMIs by accident.
+ * Set STARTED before enabling the hardware, such that a subsequent NMI
+ * must observe it.
*/
- set_bit(IBS_STARTED, pcpu->state);
+ set_bit(IBS_STARTED, pcpu->state);
clear_bit(IBS_STOPPING, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
@@ -396,6 +431,9 @@
u64 config;
int stopping;
+ if (test_and_set_bit(IBS_STOPPING, pcpu->state))
+ return;
+
stopping = test_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
@@ -405,12 +443,12 @@
if (stopping) {
/*
- * Set STOPPING before disabling the hardware, such that it
+ * Set STOPPED before disabling the hardware, such that it
* must be visible to NMIs the moment we clear the EN bit,
* at which point we can generate an !VALID sample which
* we need to consume.
*/
- set_bit(IBS_STOPPING, pcpu->state);
+ set_bit(IBS_STOPPED, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
/*
* Clear STARTED after disabling the hardware; if it were
@@ -556,7 +594,7 @@
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
- if (test_and_clear_bit(IBS_STOPPING, pcpu->state))
+ if (test_and_clear_bit(IBS_STOPPED, pcpu->state))
return 1;
return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbc..52bedc5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2417,14 +2417,24 @@
cpuctx->task_ctx = NULL;
}
- is_active ^= ctx->is_active; /* changed bits */
-
+ /*
+ * Always update time if it was set; not only when it changes.
+ * Otherwise we can 'forget' to update time for any but the last
+ * context we sched out. For example:
+ *
+ * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
+ * ctx_sched_out(.event_type = EVENT_PINNED)
+ *
+ * would only update time for the pinned events.
+ */
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx);
}
+ is_active ^= ctx->is_active; /* changed bits */
+
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
@@ -8532,6 +8542,7 @@
f_flags);
if (IS_ERR(event_file)) {
err = PTR_ERR(event_file);
+ event_file = NULL;
goto err_context;
}