timerfd: Manage cancelable timers in timerfd

Peter is concerned about the extra scan of CLOCK_REALTIME_COS in the
timer interrupt. Yes, I did not think about it, because the solution
was so elegant. I didn't like the extra list in timerfd when it was
proposed some time ago, but with a rcu based list the list walk it's
less horrible than the original global lock, which was held over the
list iteration.

Requested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index eabcbd7..26dd32f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -78,11 +78,6 @@
 			.get_time = &ktime_get_boottime,
 			.resolution = KTIME_LOW_RES,
 		},
-		{
-			.index = CLOCK_REALTIME_COS,
-			.get_time = &ktime_get_real,
-			.resolution = KTIME_LOW_RES,
-		},
 	}
 };
 
@@ -90,7 +85,6 @@
 	[CLOCK_REALTIME]	= HRTIMER_BASE_REALTIME,
 	[CLOCK_MONOTONIC]	= HRTIMER_BASE_MONOTONIC,
 	[CLOCK_BOOTTIME]	= HRTIMER_BASE_BOOTTIME,
-	[CLOCK_REALTIME_COS]	= HRTIMER_BASE_REALTIME_COS,
 };
 
 static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -116,7 +110,6 @@
 	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
 	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
 	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-	base->clock_base[HRTIMER_BASE_REALTIME_COS].softirq_time = xtim;
 }
 
 /*
@@ -486,8 +479,6 @@
 	trace_hrtimer_cancel(timer);
 }
 
-static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base);
-
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -663,7 +654,33 @@
 	return 0;
 }
 
-static void retrigger_next_event(void *arg);
+/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+	struct timespec realtime_offset, xtim, wtm, sleep;
+
+	if (!hrtimer_hres_active())
+		return;
+
+	/* Optimized out for !HIGH_RES */
+	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
+	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
+
+	/* Adjust CLOCK_REALTIME offset */
+	raw_spin_lock(&base->lock);
+	base->clock_base[HRTIMER_BASE_REALTIME].offset =
+		timespec_to_ktime(realtime_offset);
+	base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+		timespec_to_ktime(sleep);
+
+	hrtimer_force_reprogram(base, 0);
+	raw_spin_unlock(&base->lock);
+}
 
 /*
  * Switch to high resolution mode
@@ -711,46 +728,11 @@
 	return 0;
 }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void retrigger_next_event(void *arg) { }
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
-	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
-	struct timespec realtime_offset, xtim, wtm, sleep;
-
-	if (!hrtimer_hres_active()) {
-		raw_spin_lock(&base->lock);
-		hrtimer_expire_cancelable(base);
-		raw_spin_unlock(&base->lock);
-		return;
-	}
-
-	/* Optimized out for !HIGH_RES */
-	get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
-	set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
-	/* Adjust CLOCK_REALTIME offset */
-	raw_spin_lock(&base->lock);
-	base->clock_base[HRTIMER_BASE_REALTIME].offset =
-		timespec_to_ktime(realtime_offset);
-	base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
-		timespec_to_ktime(sleep);
-	base->clock_base[HRTIMER_BASE_REALTIME_COS].offset =
-		timespec_to_ktime(realtime_offset);
-
-	hrtimer_expire_cancelable(base);
-
-	hrtimer_force_reprogram(base, 0);
-	raw_spin_unlock(&base->lock);
-}
-
-/*
  * Clock realtime was set
  *
  * Change the offset of the realtime clock vs. the monotonic
@@ -763,8 +745,11 @@
  */
 void clock_was_set(void)
 {
+#ifdef CONFIG_HIGHRES_TIMERS
 	/* Retrigger the CPU local events everywhere */
 	on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+	timerfd_clock_was_set();
 }
 
 /*
@@ -777,6 +762,7 @@
 		  KERN_INFO "hrtimers_resume() called with IRQs enabled!");
 
 	retrigger_next_event(NULL);
+	timerfd_clock_was_set();
 }
 
 static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1240,22 +1226,6 @@
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
-static void hrtimer_expire_cancelable(struct hrtimer_cpu_base *cpu_base)
-{
-	struct timerqueue_node *node;
-	struct hrtimer_clock_base *base;
-	ktime_t now = ktime_get_real();
-
-	base = &cpu_base->clock_base[HRTIMER_BASE_REALTIME_COS];
-
-	while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
-			__run_hrtimer(timer, &now);
-	}
-}
-
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*