/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */

#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/sysdev.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/poll.h>
#include <linux/thread_info.h>
#include <linux/ctype.h>
#include <linux/kmod.h>
#include <linux/kdebug.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
#include <asm/uaccess.h>
#include <asm/smp.h>
#include <asm/idle.h>

#define MISC_MCELOG_MINOR 227
#define NR_BANKS 6

atomic_t mce_entry;

static int mce_dont_init;

/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
   3: never panic or exit (for testing only) */
static int tolerant = 1;
static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
static int mce_bootlog = 1;
static atomic_t mce_events;

static char trigger[128];
static char *trigger_argv[2] = { trigger, NULL };

static DECLARE_WAIT_QUEUE_HEAD(mce_wait);

/*
 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. Also
 * separate MCEs from kernel messages to avoid bogus bug reports.
 */

struct mce_log mcelog = {
        MCE_LOG_SIGNATURE,
        MCE_LOG_LEN,
};

void mce_log(struct mce *mce)
{
        unsigned next, entry;
        atomic_inc(&mce_events);
        mce->finished = 0;
        wmb();
        for (;;) {
                entry = rcu_dereference(mcelog.next);
                /* The rmb forces the compiler to reload next in each
                   iteration */
                rmb();
                for (;;) {
                        /* When the buffer fills up discard new entries. Assume
                           that the earlier errors are the more interesting. */
                        if (entry >= MCE_LOG_LEN) {
                                set_bit(MCE_OVERFLOW, &mcelog.flags);
                                return;
                        }
                        /* Old left over entry. Skip. */
                        if (mcelog.entry[entry].finished) {
                                entry++;
                                continue;
                        }
                        break;
                }
                smp_rmb();
                next = entry + 1;
                if (cmpxchg(&mcelog.next, entry, next) == entry)
                        break;
        }
        memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
        wmb();
        mcelog.entry[entry].finished = 1;
        wmb();

        set_bit(0, &notify_user);
}

static void print_mce(struct mce *m)
{
        printk(KERN_EMERG "\n"
               KERN_EMERG "HARDWARE ERROR\n"
               KERN_EMERG
               "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
               m->cpu, m->mcgstatus, m->bank, m->status);
        if (m->rip) {
                printk(KERN_EMERG
                       "RIP%s %02x:<%016Lx> ",
                       !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
                       m->cs, m->rip);
                if (m->cs == __KERNEL_CS)
                        print_symbol("{%s}", m->rip);
                printk("\n");
        }
        printk(KERN_EMERG "TSC %Lx ", m->tsc);
        if (m->addr)
                printk("ADDR %Lx ", m->addr);
        if (m->misc)
                printk("MISC %Lx ", m->misc);
        printk("\n");
        printk(KERN_EMERG "This is not a software problem!\n");
        printk(KERN_EMERG
               "Run through mcelog --ascii to decode and contact your hardware vendor\n");
}

static void mce_panic(char *msg, struct mce *backup, unsigned long start)
{
        int i;

        if (tolerant >= 3)
                return;

        oops_begin();
        for (i = 0; i < MCE_LOG_LEN; i++) {
                unsigned long tsc = mcelog.entry[i].tsc;
                if (time_before(tsc, start))
                        continue;
                print_mce(&mcelog.entry[i]);
                if (backup && mcelog.entry[i].tsc == backup->tsc)
                        backup = NULL;
        }
        if (backup)
                print_mce(backup);
        panic(msg);
}

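/* Descriptive note (not in the original source): machine check handling needs
   both CPUID feature bits - MCE for the exception itself and MCA for the
   per-bank status registers read below. */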
static int mce_available(struct cpuinfo_x86 *c)
{
        return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}

static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
        if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
                m->rip = regs->rip;
                m->cs = regs->cs;
        } else {
                m->rip = 0;
                m->cs = 0;
        }
        if (rip_msr) {
                /* Assume the RIP in the MSR is exact. Is this true? */
                m->mcgstatus |= MCG_STATUS_EIPV;
                rdmsrl(rip_msr, m->rip);
                m->cs = 0;
        }
}

/*
 * The actual machine check handler
 */

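/*
 * Calling convention, derived from the call sites in this file:
 *   regs != NULL       real #MC exception coming in via entry.S
 *   regs == NULL       called from the polling timer or from mce_init()
 *   error_code >= 0    record the TSC and log any errors found
 *   error_code == -1   boot-time scan of leftover errors, still logged
 *   error_code == -2   boot-time scan, clear the banks but do not log
 * With regs == NULL nothing fatal is ever done; we only log and clear.
 */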
void do_machine_check(struct pt_regs * regs, long error_code)
{
        struct mce m, panicm;
        int nowayout = (tolerant < 1);
        int kill_it = 0;
        u64 mcestart = 0;
        int i;
        int panicm_found = 0;

        atomic_inc(&mce_entry);

        if (regs)
                notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
        if (!banks)
                goto out2;

        memset(&m, 0, sizeof(struct mce));
        m.cpu = smp_processor_id();
        rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
        if (!(m.mcgstatus & MCG_STATUS_RIPV))
                kill_it = 1;

        rdtscll(mcestart);
        barrier();

        for (i = 0; i < banks; i++) {
                if (!bank[i])
                        continue;

                m.misc = 0;
                m.addr = 0;
                m.bank = i;
                m.tsc = 0;

                rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
                if ((m.status & MCI_STATUS_VAL) == 0)
                        continue;

                if (m.status & MCI_STATUS_EN) {
                        /* In theory _OVER could be a nowayout too, but
                           assume any overflowed errors were not fatal. */
                        nowayout |= !!(m.status & MCI_STATUS_PCC);
                        kill_it |= !!(m.status & MCI_STATUS_UC);
                }

                if (m.status & MCI_STATUS_MISCV)
                        rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
                if (m.status & MCI_STATUS_ADDRV)
                        rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);

                mce_get_rip(&m, regs);
                if (error_code >= 0)
                        rdtscll(m.tsc);
                wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
                if (error_code != -2)
                        mce_log(&m);

                /* Did this bank cause the exception? */
                /* Assume that the bank with uncorrectable errors did it,
                   and that there is only a single one. */
                if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
                        panicm = m;
                        panicm_found = 1;
                }

                add_taint(TAINT_MACHINE_CHECK);
        }

        /* Never do anything final in the polling timer */
        if (!regs)
                goto out;

        /* If we didn't find an uncorrectable error, pick
           the last one (shouldn't happen, just being safe). */
        if (!panicm_found)
                panicm = m;
        if (nowayout)
                mce_panic("Machine check", &panicm, mcestart);
        if (kill_it) {
                int user_space = 0;

                if (m.mcgstatus & MCG_STATUS_RIPV)
                        user_space = panicm.rip && (panicm.cs & 3);

                /* When the machine was in user space and the CPU didn't get
                   confused it's normally not necessary to panic, unless you
                   are paranoid (tolerant == 0)

                   RED-PEN could be more tolerant for MCEs in idle,
                   but most likely they occur at boot anyways, where
                   it is best to just halt the machine. */
                if ((!user_space && (panic_on_oops || tolerant < 2)) ||
                    (unsigned)current->pid <= 1)
                        mce_panic("Uncorrected machine check", &panicm, mcestart);

                /* do_exit takes an awful lot of locks and has a
                   slight risk of deadlocking. If you don't want that
                   don't set tolerant >= 2 */
                if (tolerant < 3)
                        do_exit(SIGBUS);
        }

        /* notify userspace ASAP */
        set_thread_flag(TIF_MCE_NOTIFY);

 out:
        /* Last thing done in the machine check exception to clear state. */
        wrmsrl(MSR_IA32_MCG_STATUS, 0);
 out2:
        atomic_dec(&mce_entry);
}

#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
{
        struct mce m;

        memset(&m, 0, sizeof(m));
        m.cpu = cpu;
        m.bank = MCE_THERMAL_BANK;
        m.status = status;
        rdtscll(m.tsc);
        mce_log(&m);
}
#endif /* CONFIG_X86_MCE_INTEL */

/*
 * Periodic polling timer for "silent" machine check errors. If the
 * poller finds an MCE, poll 2x faster. When the poller finds no more
 * errors, poll 2x slower (up to check_interval seconds).
 */

static int check_interval = 5 * 60; /* 5 minutes */
static int next_interval; /* in jiffies */
static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);

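/* Runs on each CPU via on_each_cpu(): poll this CPU's banks for silent errors. */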
static void mcheck_check_cpu(void *info)
{
        if (mce_available(&current_cpu_data))
                do_machine_check(NULL, 0);
}

static void mcheck_timer(struct work_struct *work)
{
        on_each_cpu(mcheck_check_cpu, NULL, 1, 1);

        /*
         * Alert userspace if needed. If we logged an MCE, reduce the
         * polling interval, otherwise increase the polling interval.
         */
        if (mce_notify_user()) {
                next_interval = max(next_interval/2, HZ/100);
        } else {
                next_interval = min(next_interval*2, check_interval*HZ);
        }

        schedule_delayed_work(&mcheck_work, next_interval);
}

/*
 * This is only called from process context. This is where we do
 * anything we need to alert userspace about new MCEs. This is called
 * directly from the poller and also from entry.S and idle, thanks to
 * TIF_MCE_NOTIFY.
 */
int mce_notify_user(void)
{
        clear_thread_flag(TIF_MCE_NOTIFY);
        if (test_and_clear_bit(0, &notify_user)) {
                static unsigned long last_print;
                unsigned long now = jiffies;

                wake_up_interruptible(&mce_wait);
                if (trigger[0])
                        call_usermodehelper(trigger, trigger_argv, NULL,
                                            UMH_NO_WAIT);

                if (time_after_eq(now, last_print + (check_interval*HZ))) {
                        last_print = now;
                        printk(KERN_INFO "Machine check events logged\n");
                }

                return 1;
        }
        return 0;
}

/* see if the idle task needs to notify userspace */
static int
mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk)
{
        /* IDLE_END should be safe - interrupts are back on */
        if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY))
                mce_notify_user();

        return NOTIFY_OK;
}

static struct notifier_block mce_idle_notifier = {
        .notifier_call = mce_idle_callback,
};

static __init int periodic_mcheck_init(void)
{
        next_interval = check_interval * HZ;
        if (next_interval)
                schedule_delayed_work(&mcheck_work, next_interval);
        idle_notifier_register(&mce_idle_notifier);
        return 0;
}
__initcall(periodic_mcheck_init);


/*
 * Initialize Machine Checks for a CPU.
 */
static void mce_init(void *dummy)
{
        u64 cap;
        int i;

        rdmsrl(MSR_IA32_MCG_CAP, cap);
        banks = cap & 0xff;
        if (banks > NR_BANKS) {
                printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
                banks = NR_BANKS;
        }
        /* Use accurate RIP reporting if available. */
        if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
                rip_msr = MSR_IA32_MCG_EIP;

        /* Log the machine checks left over from the previous reset.
           This also clears all registers */
        do_machine_check(NULL, mce_bootlog ? -1 : -2);

        set_in_cr4(X86_CR4_MCE);

        if (cap & MCG_CTL_P)
                wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);

        for (i = 0; i < banks; i++) {
                wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
                wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
        }
}

/* Add per CPU specific workarounds here */
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
        /* This should be disabled by the BIOS, but isn't always */
        if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
                /* disable GART TBL walk error reporting, which trips off
                   incorrectly with the IOMMU & 3ware & Cerberus. */
                clear_bit(10, &bank[4]);
                /* Lots of broken BIOSes around that don't clear them
                   by default and leave crap in there. Don't log. */
                mce_bootlog = 0;
        }

}

static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
        switch (c->x86_vendor) {
        case X86_VENDOR_INTEL:
                mce_intel_feature_init(c);
                break;
        case X86_VENDOR_AMD:
                mce_amd_feature_init(c);
                break;
        default:
                break;
        }
}

/*
 * Called for each booted CPU to set up machine checks.
 * Must be called with preempt off.
 */
void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
        static cpumask_t mce_cpus = CPU_MASK_NONE;

        mce_cpu_quirks(c);

        if (mce_dont_init ||
            cpu_test_and_set(smp_processor_id(), mce_cpus) ||
            !mce_available(c))
                return;

        mce_init(NULL);
        mce_cpu_features(c);
}

/*
 * Character device to read and clear the MCE log.
 */

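/* mce_state_lock protects open_count/open_exclu so O_EXCL opens stay exclusive. */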
static DEFINE_SPINLOCK(mce_state_lock);
static int open_count;	/* #times opened */
static int open_exclu;	/* already open exclusive? */

static int mce_open(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
                spin_unlock(&mce_state_lock);
                return -EBUSY;
        }

        if (file->f_flags & O_EXCL)
                open_exclu = 1;
        open_count++;

        spin_unlock(&mce_state_lock);

        return 0;
}

static int mce_release(struct inode *inode, struct file *file)
{
        spin_lock(&mce_state_lock);

        open_count--;
        open_exclu = 0;

        spin_unlock(&mce_state_lock);

        return 0;
}

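/*
 * Per-CPU helper for mce_read(): record each CPU's current TSC so that
 * entries written concurrently with the read can be told apart below.
 */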
static void collect_tscs(void *data)
{
        unsigned long *cpu_tsc = (unsigned long *)data;
        rdtscll(cpu_tsc[smp_processor_id()]);
}

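/*
 * Read side of /dev/mcelog: copy out all finished entries, clear the log,
 * wait for in-flight writers with synchronize_sched(), then pick up any
 * entries that were still being written when the read started.
 * Only whole-buffer reads are supported.
 */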
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
{
        unsigned long *cpu_tsc;
        static DECLARE_MUTEX(mce_read_sem);
        unsigned next;
        char __user *buf = ubuf;
        int i, err;

        cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
        if (!cpu_tsc)
                return -ENOMEM;

        down(&mce_read_sem);
        next = rcu_dereference(mcelog.next);

        /* Only supports full reads right now */
        if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
                up(&mce_read_sem);
                kfree(cpu_tsc);
                return -EINVAL;
        }

        err = 0;
        for (i = 0; i < next; i++) {
                unsigned long start = jiffies;
                while (!mcelog.entry[i].finished) {
                        if (time_after_eq(jiffies, start + 2)) {
                                memset(mcelog.entry + i, 0, sizeof(struct mce));
                                goto timeout;
                        }
                        cpu_relax();
                }
                smp_rmb();
                err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
                buf += sizeof(struct mce);
 timeout:
                ;
        }

        memset(mcelog.entry, 0, next * sizeof(struct mce));
        mcelog.next = 0;

        synchronize_sched();

        /* Collect entries that were still getting written before the synchronize. */

        on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
        for (i = next; i < MCE_LOG_LEN; i++) {
                if (mcelog.entry[i].finished &&
                    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
                        err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
                        smp_rmb();
                        buf += sizeof(struct mce);
                        memset(&mcelog.entry[i], 0, sizeof(struct mce));
                }
        }
        up(&mce_read_sem);
        kfree(cpu_tsc);
        return err ? -EFAULT : buf - ubuf;
}

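/* poll() support: /dev/mcelog becomes readable as soon as unread entries exist. */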
static unsigned int mce_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &mce_wait, wait);
        if (rcu_dereference(mcelog.next))
                return POLLIN | POLLRDNORM;
        return 0;
}

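/*
 * ioctls: report the record and log sizes so mcelog can size its reads,
 * and atomically fetch-and-clear the overflow flags.
 */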
static int mce_ioctl(struct inode *i, struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        switch (cmd) {
        case MCE_GET_RECORD_LEN:
                return put_user(sizeof(struct mce), p);
        case MCE_GET_LOG_LEN:
                return put_user(MCE_LOG_LEN, p);
        case MCE_GETCLEAR_FLAGS: {
                unsigned flags;
                do {
                        flags = mcelog.flags;
                } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
                return put_user(flags, p);
        }
        default:
                return -ENOTTY;
        }
}

static const struct file_operations mce_chrdev_ops = {
        .open = mce_open,
        .release = mce_release,
        .read = mce_read,
        .poll = mce_poll,
        .ioctl = mce_ioctl,
};

static struct miscdevice mce_log_device = {
        MISC_MCELOG_MINOR,
        "mcelog",
        &mce_chrdev_ops,
};

/*
 * Old style boot options parsing. Only for compatibility.
 */

static int __init mcheck_disable(char *str)
{
        mce_dont_init = 1;
        return 1;
}

/* mce=off disables machine check. Note you can re-enable it later
   using sysfs.
   mce=TOLERANCELEVEL (number, see above)
   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
   mce=nobootlog Don't log MCEs from before booting. */
static int __init mcheck_enable(char *str)
{
        if (*str == '=')
                str++;
        if (!strcmp(str, "off"))
                mce_dont_init = 1;
        else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
                mce_bootlog = str[0] == 'b';
        else if (isdigit(str[0]))
                get_option(&str, &tolerant);
        else
                printk("mce= argument %s ignored. Please use /sys", str);
        return 1;
}

__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);

/*
 * Sysfs support
 */

/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
   Only one CPU is active at this time, the others get re-added later using
   CPU hotplug. */
static int mce_resume(struct sys_device *dev)
{
        mce_init(NULL);
        return 0;
}

/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
        if (next_interval)
                cancel_delayed_work(&mcheck_work);
        /* Timer race is harmless here */
        on_each_cpu(mce_init, NULL, 1, 1);
        next_interval = check_interval * HZ;
        if (next_interval)
                schedule_delayed_work(&mcheck_work, next_interval);
}

static struct sysdev_class mce_sysclass = {
        .resume = mce_resume,
        set_kset_name("machinecheck"),
};

DEFINE_PER_CPU(struct sys_device, device_mce);

/* Why are there no generic functions for this? */
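/* ACCESSOR(name, var, start) expands to show_<name>/set_<name> sysfs handlers
   for 'var' plus the matching SYSDEV_ATTR; 'start' (e.g. mce_restart()) is
   run after a successful write. */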
#define ACCESSOR(name, var, start) \
        static ssize_t show_ ## name(struct sys_device *s, char *buf) { \
                return sprintf(buf, "%lx\n", (unsigned long)var); \
        } \
        static ssize_t set_ ## name(struct sys_device *s, const char *buf, size_t siz) { \
                char *end; \
                unsigned long new = simple_strtoul(buf, &end, 0); \
                if (end == buf) return -EINVAL; \
                var = new; \
                start; \
                return end-buf; \
        } \
        static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);

/* TBD should generate these dynamically based on number of available banks */
ACCESSOR(bank0ctl,bank[0],mce_restart())
ACCESSOR(bank1ctl,bank[1],mce_restart())
ACCESSOR(bank2ctl,bank[2],mce_restart())
ACCESSOR(bank3ctl,bank[3],mce_restart())
ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(bank5ctl,bank[5],mce_restart())

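/*
 * The "trigger" attribute holds the path of a usermode helper that
 * mce_notify_user() runs whenever new events have been logged.
 */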
static ssize_t show_trigger(struct sys_device *s, char *buf)
{
        strcpy(buf, trigger);
        strcat(buf, "\n");
        return strlen(trigger) + 1;
}

static ssize_t set_trigger(struct sys_device *s, const char *buf, size_t siz)
{
        char *p;
        int len;
        strncpy(trigger, buf, sizeof(trigger));
        trigger[sizeof(trigger)-1] = 0;
        len = strlen(trigger);
        p = strchr(trigger, '\n');
        if (p) *p = 0;	/* strchr() may return NULL; don't dereference it */
        return len;
}

static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
static struct sysdev_attribute *mce_attributes[] = {
        &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
        &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
        &attr_tolerant, &attr_check_interval, &attr_trigger,
        NULL
};

/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
static __cpuinit int mce_create_device(unsigned int cpu)
{
        int err;
        int i;
        if (!mce_available(&cpu_data[cpu]))
                return -EIO;

        per_cpu(device_mce,cpu).id = cpu;
        per_cpu(device_mce,cpu).cls = &mce_sysclass;

        err = sysdev_register(&per_cpu(device_mce,cpu));

        if (!err) {
                for (i = 0; mce_attributes[i]; i++)
                        sysdev_create_file(&per_cpu(device_mce,cpu),
                                           mce_attributes[i]);
        }
        return err;
}

static void mce_remove_device(unsigned int cpu)
{
        int i;

        for (i = 0; mce_attributes[i]; i++)
                sysdev_remove_file(&per_cpu(device_mce,cpu),
                                   mce_attributes[i]);
        sysdev_unregister(&per_cpu(device_mce,cpu));
        memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                mce_create_device(cpu);
                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                mce_remove_device(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};

static __init int mce_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data))
                return -EIO;
        err = sysdev_class_register(&mce_sysclass);

        for_each_online_cpu(i) {
                mce_create_device(i);
        }

        register_hotcpu_notifier(&mce_cpu_notifier);
        misc_register(&mce_log_device);
        return err;
}

device_initcall(mce_init_device);