/*
 * Detect hard lockups on a system
 *
 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
 *
 * Note: Most of this code is borrowed heavily from the original softlockup
 * detector, so thanks to Ingo for the initial implementation.
 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
 * to those contributors as well.
 */

#define pr_fmt(fmt) "NMI watchdog: " fmt

#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/sched/debug.h>

#include <asm/irq_regs.h>
#include <linux/perf_event.h>

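/*
 * Per-CPU state for the hard lockup detector:
 *
 * hard_watchdog_warn  - set once a hard lockup has been reported on this
 *                       CPU, so the same lockup is not warned about twice.
 * watchdog_nmi_touch  - set by touch_nmi_watchdog(); makes the next
 *                       overflow callback on this CPU skip its check.
 * watchdog_ev         - the perf event backing the detector on this CPU.
 */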
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);

/* boot commands */
/*
 * Should we panic when a soft-lockup or hard-lockup occurs:
 */
unsigned int __read_mostly hardlockup_panic =
                        CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
/*
 * We may not want to enable hard lockup detection by default in all cases,
 * for example when running the kernel as a guest on a hypervisor. In these
 * cases this function can be called to disable hard lockup detection. This
 * function should only be executed once by the boot processor before the
 * kernel command line parameters are parsed, because otherwise it is not
 * possible to override this in hardlockup_panic_setup().
 */
void hardlockup_detector_disable(void)
{
        watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
}

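/*
 * Parse the "nmi_watchdog=" boot parameter: "panic"/"nopanic" select
 * whether a detected hard lockup panics the machine, while "0"/"1"
 * disable/enable the NMI watchdog altogether.
 */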
static int __init hardlockup_panic_setup(char *str)
{
        if (!strncmp(str, "panic", 5))
                hardlockup_panic = 1;
        else if (!strncmp(str, "nopanic", 7))
                hardlockup_panic = 0;
        else if (!strncmp(str, "0", 1))
                watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
        else if (!strncmp(str, "1", 1))
                watchdog_enabled |= NMI_WATCHDOG_ENABLED;
        return 1;
}
__setup("nmi_watchdog=", hardlockup_panic_setup);

void touch_nmi_watchdog(void)
{
        /*
         * Using __raw here because some code paths have
         * preemption enabled.  If preemption is enabled
         * then interrupts should be enabled too, in which
         * case we shouldn't have to worry about the watchdog
         * going off.
         */
        raw_cpu_write(watchdog_nmi_touch, true);
        touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

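/*
 * Template for the per-CPU perf event: count CPU cycles, pinned so the
 * event always stays on the PMU.  The sample period is filled in from
 * watchdog_thresh by watchdog_nmi_enable(); the event is created
 * disabled and enabled explicitly once registration succeeds.
 */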
static struct perf_event_attr wd_hw_attr = {
        .type           = PERF_TYPE_HARDWARE,
        .config         = PERF_COUNT_HW_CPU_CYCLES,
        .size           = sizeof(struct perf_event_attr),
        .pinned         = 1,
        .disabled       = 1,
};

/* Callback function for perf event subsystem */
static void watchdog_overflow_callback(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        /* Ensure the watchdog never gets throttled */
        event->hw.interrupts = 0;

        if (atomic_read(&watchdog_park_in_progress) != 0)
                return;

        if (__this_cpu_read(watchdog_nmi_touch) == true) {
                __this_cpu_write(watchdog_nmi_touch, false);
                return;
        }

        /*
         * Check for a hardlockup by making sure our timer interrupt is
         * still incrementing.  The timer interrupt should have fired
         * multiple times before the perf event overflowed; if it hasn't,
         * that is a good indication the CPU is stuck.
         */
        if (is_hardlockup()) {
                int this_cpu = smp_processor_id();

                /* only print hardlockups once */
                if (__this_cpu_read(hard_watchdog_warn) == true)
                        return;

                pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", this_cpu);
                print_modules();
                print_irqtrace_events(current);
                if (regs)
                        show_regs(regs);
                else
                        dump_stack();

                /*
                 * Perform all-CPU dump only once to avoid multiple hardlockups
                 * generating interleaving traces
                 */
                if (sysctl_hardlockup_all_cpu_backtrace &&
                                !test_and_set_bit(0, &hardlockup_allcpu_dumped))
                        trigger_allbutself_cpu_backtrace();

                if (hardlockup_panic)
                        nmi_panic(regs, "Hard LOCKUP");

                __this_cpu_write(hard_watchdog_warn, true);
                return;
        }

        __this_cpu_write(hard_watchdog_warn, false);
        return;
}

/*
 * People like the simple clean cpu node info on boot.
 * Reduce the watchdog noise by only printing messages
 * that are different from what cpu0 displayed.
 */
static unsigned long firstcpu_err;
static atomic_t watchdog_cpus;

int watchdog_nmi_enable(unsigned int cpu)
{
        struct perf_event_attr *wd_attr;
        struct perf_event *event = per_cpu(watchdog_ev, cpu);
        int firstcpu = 0;

        /* nothing to do if the hard lockup detector is disabled */
        if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
                goto out;

        /* is it already setup and enabled? */
        if (event && event->state > PERF_EVENT_STATE_OFF)
                goto out;

        /* it is setup but not enabled */
        if (event != NULL)
                goto out_enable;

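        /*
         * The first CPU to get here reports setup errors on behalf of
         * everyone; later CPUs only report errors that differ from the
         * first CPU's (see firstcpu_err below).
         */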
        if (atomic_inc_return(&watchdog_cpus) == 1)
                firstcpu = 1;

        wd_attr = &wd_hw_attr;
        wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);

        /* Try to register using hardware perf events */
        event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);

        /* save the first cpu's error for future comparison */
        if (firstcpu && IS_ERR(event))
                firstcpu_err = PTR_ERR(event);

        if (!IS_ERR(event)) {
                /* only print for the first cpu initialized */
                if (firstcpu || firstcpu_err)
                        pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
                goto out_save;
        }

        /*
         * Disable the hard lockup detector if _any_ CPU fails to set up
         * the hardware perf event.  The watchdog() function checks the
         * NMI_WATCHDOG_ENABLED bit periodically.
         *
         * The barriers are for syncing up watchdog_enabled across all the
         * cpus, as clear_bit() does not use barriers.
         */
        smp_mb__before_atomic();
        clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
        smp_mb__after_atomic();

        /* skip displaying the same error again */
        if (!firstcpu && (PTR_ERR(event) == firstcpu_err))
                return PTR_ERR(event);

        /* vary the KERN level based on the returned errno */
        if (PTR_ERR(event) == -EOPNOTSUPP)
                pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
        else if (PTR_ERR(event) == -ENOENT)
                pr_warn("disabled (cpu%i): hardware events not enabled\n",
                        cpu);
        else
                pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
                       cpu, PTR_ERR(event));

        pr_info("Shutting down hard lockup detector on all cpus\n");

        return PTR_ERR(event);

        /* success path */
out_save:
        per_cpu(watchdog_ev, cpu) = event;
out_enable:
        perf_event_enable(per_cpu(watchdog_ev, cpu));
out:
        return 0;
}

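/*
 * Tear down the per-CPU perf event.  Once the last CPU has gone away,
 * firstcpu_err is reset so a later re-enable starts with a clean slate.
 */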
void watchdog_nmi_disable(unsigned int cpu)
{
        struct perf_event *event = per_cpu(watchdog_ev, cpu);

        if (event) {
                perf_event_disable(event);
                per_cpu(watchdog_ev, cpu) = NULL;

                /* should be in cleanup, but blocks oprofile */
                perf_event_release_kernel(event);

                /* watchdog_nmi_enable() expects this to be zero initially. */
                if (atomic_dec_and_test(&watchdog_cpus))
                        firstcpu_err = 0;
        }
}