blob: fa26726824775e774682476f072836e5a29cb691 [file] [log] [blame]
/*
 * Machine check handler.
 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
 * Rest from unknown author(s).
 * 2004 Andi Kleen. Rewrote most of it.
 */
7
8#include <linux/init.h>
9#include <linux/types.h>
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/string.h>
13#include <linux/rcupdate.h>
14#include <linux/kallsyms.h>
15#include <linux/sysdev.h>
16#include <linux/miscdevice.h>
17#include <linux/fs.h>
Randy Dunlapa9415642006-01-11 12:17:48 -080018#include <linux/capability.h>
Andi Kleen91c6d402005-07-28 21:15:39 -070019#include <linux/cpu.h>
20#include <linux/percpu.h>
Andi Kleen8c566ef2005-09-12 18:49:24 +020021#include <linux/ctype.h>
Andi Kleena98f0dd2007-02-13 13:26:23 +010022#include <linux/kmod.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070023#include <asm/processor.h>
24#include <asm/msr.h>
25#include <asm/mce.h>
26#include <asm/kdebug.h>
27#include <asm/uaccess.h>
Andi Kleen0a9c3ee2006-01-11 22:46:54 +010028#include <asm/smp.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070029
30#define MISC_MCELOG_MINOR 227
Shaohua Li73ca5352006-01-11 22:43:06 +010031#define NR_BANKS 6
Linus Torvalds1da177e2005-04-16 15:20:36 -070032
Andi Kleen553f2652006-04-07 19:49:57 +020033atomic_t mce_entry;
34
Linus Torvalds1da177e2005-04-16 15:20:36 -070035static int mce_dont_init;
36
37/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic,
38 3: never panic or exit (for testing only) */
39static int tolerant = 1;
40static int banks;
41static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
42static unsigned long console_logged;
43static int notify_user;
Andi Kleen94ad8472005-04-16 15:25:09 -070044static int rip_msr;
Andi Kleene5835382005-11-05 17:25:54 +010045static int mce_bootlog = 1;
Andi Kleena98f0dd2007-02-13 13:26:23 +010046static atomic_t mce_events;
47
48static char trigger[128];
49static char *trigger_argv[2] = { trigger, NULL };
Linus Torvalds1da177e2005-04-16 15:20:36 -070050
51/*
52 * Lockless MCE logging infrastructure.
53 * This avoids deadlocks on printk locks without having to break locks. Also
54 * separate MCEs from kernel messages to avoid bogus bug reports.
55 */
56
57struct mce_log mcelog = {
58 MCE_LOG_SIGNATURE,
59 MCE_LOG_LEN,
60};
61
62void mce_log(struct mce *mce)
63{
64 unsigned next, entry;
Andi Kleena98f0dd2007-02-13 13:26:23 +010065 atomic_inc(&mce_events);
Linus Torvalds1da177e2005-04-16 15:20:36 -070066 mce->finished = 0;
Mike Waychison76441432005-09-30 00:01:27 +020067 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070068 for (;;) {
69 entry = rcu_dereference(mcelog.next);
Mike Waychison76441432005-09-30 00:01:27 +020070 /* The rmb forces the compiler to reload next in each
71 iteration */
72 rmb();
Andi Kleen673242c2005-09-12 18:49:24 +020073 for (;;) {
74 /* When the buffer fills up discard new entries. Assume
75 that the earlier errors are the more interesting. */
76 if (entry >= MCE_LOG_LEN) {
77 set_bit(MCE_OVERFLOW, &mcelog.flags);
78 return;
79 }
80 /* Old left over entry. Skip. */
81 if (mcelog.entry[entry].finished) {
82 entry++;
83 continue;
84 }
Mike Waychison76441432005-09-30 00:01:27 +020085 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -070086 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070087 smp_rmb();
88 next = entry + 1;
89 if (cmpxchg(&mcelog.next, entry, next) == entry)
90 break;
91 }
92 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
Mike Waychison76441432005-09-30 00:01:27 +020093 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070094 mcelog.entry[entry].finished = 1;
Mike Waychison76441432005-09-30 00:01:27 +020095 wmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -070096
97 if (!test_and_set_bit(0, &console_logged))
98 notify_user = 1;
99}
100
101static void print_mce(struct mce *m)
102{
103 printk(KERN_EMERG "\n"
Andi Kleen48551702006-01-11 22:44:48 +0100104 KERN_EMERG "HARDWARE ERROR\n"
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105 KERN_EMERG
106 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
107 m->cpu, m->mcgstatus, m->bank, m->status);
108 if (m->rip) {
109 printk(KERN_EMERG
110 "RIP%s %02x:<%016Lx> ",
111 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
112 m->cs, m->rip);
113 if (m->cs == __KERNEL_CS)
114 print_symbol("{%s}", m->rip);
115 printk("\n");
116 }
117 printk(KERN_EMERG "TSC %Lx ", m->tsc);
118 if (m->addr)
119 printk("ADDR %Lx ", m->addr);
120 if (m->misc)
121 printk("MISC %Lx ", m->misc);
122 printk("\n");
Andi Kleen48551702006-01-11 22:44:48 +0100123 printk(KERN_EMERG "This is not a software problem!\n");
124 printk(KERN_EMERG
125 "Run through mcelog --ascii to decode and contact your hardware vendor\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -0700126}
127
128static void mce_panic(char *msg, struct mce *backup, unsigned long start)
129{
130 int i;
131 oops_begin();
132 for (i = 0; i < MCE_LOG_LEN; i++) {
133 unsigned long tsc = mcelog.entry[i].tsc;
134 if (time_before(tsc, start))
135 continue;
136 print_mce(&mcelog.entry[i]);
137 if (backup && mcelog.entry[i].tsc == backup->tsc)
138 backup = NULL;
139 }
140 if (backup)
141 print_mce(backup);
142 if (tolerant >= 3)
143 printk("Fake panic: %s\n", msg);
144 else
145 panic(msg);
146}
147
148static int mce_available(struct cpuinfo_x86 *c)
149{
Akinobu Mita3d1712c2006-03-24 03:15:11 -0800150 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151}
152
Andi Kleen94ad8472005-04-16 15:25:09 -0700153static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
154{
155 if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
156 m->rip = regs->rip;
157 m->cs = regs->cs;
158 } else {
159 m->rip = 0;
160 m->cs = 0;
161 }
162 if (rip_msr) {
163 /* Assume the RIP in the MSR is exact. Is this true? */
164 m->mcgstatus |= MCG_STATUS_EIPV;
165 rdmsrl(rip_msr, m->rip);
166 m->cs = 0;
167 }
168}
169
Andi Kleena98f0dd2007-02-13 13:26:23 +0100170static void do_mce_trigger(void)
171{
172 static atomic_t mce_logged;
173 int events = atomic_read(&mce_events);
174 if (events != atomic_read(&mce_logged) && trigger[0]) {
175 /* Small race window, but should be harmless. */
176 atomic_set(&mce_logged, events);
177 call_usermodehelper(trigger, trigger_argv, NULL, -1);
178 }
179}
180
Linus Torvalds1da177e2005-04-16 15:20:36 -0700181/*
182 * The actual machine check handler
183 */
184
185void do_machine_check(struct pt_regs * regs, long error_code)
186{
187 struct mce m, panicm;
188 int nowayout = (tolerant < 1);
189 int kill_it = 0;
190 u64 mcestart = 0;
191 int i;
192 int panicm_found = 0;
193
Andi Kleen553f2652006-04-07 19:49:57 +0200194 atomic_inc(&mce_entry);
195
Linus Torvalds1da177e2005-04-16 15:20:36 -0700196 if (regs)
Jan Beulich6e3f3612006-01-11 22:42:14 +0100197 notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198 if (!banks)
Andi Kleen553f2652006-04-07 19:49:57 +0200199 goto out2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200
201 memset(&m, 0, sizeof(struct mce));
Andi Kleen151f8cc2006-09-26 10:52:37 +0200202 m.cpu = smp_processor_id();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700203 rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
204 if (!(m.mcgstatus & MCG_STATUS_RIPV))
205 kill_it = 1;
206
207 rdtscll(mcestart);
208 barrier();
209
210 for (i = 0; i < banks; i++) {
211 if (!bank[i])
212 continue;
213
214 m.misc = 0;
215 m.addr = 0;
216 m.bank = i;
217 m.tsc = 0;
218
219 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
220 if ((m.status & MCI_STATUS_VAL) == 0)
221 continue;
222
223 if (m.status & MCI_STATUS_EN) {
224 /* In theory _OVER could be a nowayout too, but
225 assume any overflowed errors were no fatal. */
226 nowayout |= !!(m.status & MCI_STATUS_PCC);
227 kill_it |= !!(m.status & MCI_STATUS_UC);
228 }
229
230 if (m.status & MCI_STATUS_MISCV)
231 rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
232 if (m.status & MCI_STATUS_ADDRV)
233 rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
234
Andi Kleen94ad8472005-04-16 15:25:09 -0700235 mce_get_rip(&m, regs);
Andi Kleend5172f22005-08-07 09:42:07 -0700236 if (error_code >= 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 rdtscll(m.tsc);
238 wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0);
Andi Kleend5172f22005-08-07 09:42:07 -0700239 if (error_code != -2)
240 mce_log(&m);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241
242 /* Did this bank cause the exception? */
243 /* Assume that the bank with uncorrectable errors did it,
244 and that there is only a single one. */
245 if ((m.status & MCI_STATUS_UC) && (m.status & MCI_STATUS_EN)) {
246 panicm = m;
247 panicm_found = 1;
248 }
249
Randy Dunlap9f158332005-09-13 01:25:16 -0700250 add_taint(TAINT_MACHINE_CHECK);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700251 }
252
253 /* Never do anything final in the polling timer */
Andi Kleena98f0dd2007-02-13 13:26:23 +0100254 if (!regs) {
255 /* Normal interrupt context here. Call trigger for any new
256 events. */
257 do_mce_trigger();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700258 goto out;
Andi Kleena98f0dd2007-02-13 13:26:23 +0100259 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260
261 /* If we didn't find an uncorrectable error, pick
262 the last one (shouldn't happen, just being safe). */
263 if (!panicm_found)
264 panicm = m;
265 if (nowayout)
266 mce_panic("Machine check", &panicm, mcestart);
267 if (kill_it) {
268 int user_space = 0;
269
270 if (m.mcgstatus & MCG_STATUS_RIPV)
271 user_space = panicm.rip && (panicm.cs & 3);
272
273 /* When the machine was in user space and the CPU didn't get
274 confused it's normally not necessary to panic, unless you
275 are paranoid (tolerant == 0)
276
277 RED-PEN could be more tolerant for MCEs in idle,
278 but most likely they occur at boot anyways, where
279 it is best to just halt the machine. */
280 if ((!user_space && (panic_on_oops || tolerant < 2)) ||
281 (unsigned)current->pid <= 1)
282 mce_panic("Uncorrected machine check", &panicm, mcestart);
283
284 /* do_exit takes an awful lot of locks and has as
285 slight risk of deadlocking. If you don't want that
286 don't set tolerant >= 2 */
287 if (tolerant < 3)
288 do_exit(SIGBUS);
289 }
290
291 out:
292 /* Last thing done in the machine check exception to clear state. */
293 wrmsrl(MSR_IA32_MCG_STATUS, 0);
Andi Kleen553f2652006-04-07 19:49:57 +0200294 out2:
295 atomic_dec(&mce_entry);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700296}
297
Dmitriy Zavin15d5f832006-09-26 10:52:42 +0200298#ifdef CONFIG_X86_MCE_INTEL
/***
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @cpu: The CPU on which the event occurred.
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) msr.
 */
312void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
313{
314 struct mce m;
315
316 memset(&m, 0, sizeof(m));
317 m.cpu = cpu;
318 m.bank = MCE_THERMAL_BANK;
319 m.status = status;
320 rdtscll(m.tsc);
321 mce_log(&m);
322}
323#endif /* CONFIG_X86_MCE_INTEL */
324
Linus Torvalds1da177e2005-04-16 15:20:36 -0700325/*
Tim Hockin8a336b02007-05-02 19:27:19 +0200326 * Periodic polling timer for "silent" machine check errors. If the
327 * poller finds an MCE, poll 2x faster. When the poller finds no more
328 * errors, poll 2x slower (up to check_interval seconds).
Linus Torvalds1da177e2005-04-16 15:20:36 -0700329 */
330
/* Longest polling period, in seconds (default: 5 minutes). */
static int check_interval = 5 * 60;
/* Delay until the next poll, in jiffies. */
static int next_interval;

static void mcheck_timer(struct work_struct *work);
static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335
336static void mcheck_check_cpu(void *info)
337{
338 if (mce_available(&current_cpu_data))
339 do_machine_check(NULL, 0);
340}
341
David Howells65f27f32006-11-22 14:55:48 +0000342static void mcheck_timer(struct work_struct *work)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700343{
344 on_each_cpu(mcheck_check_cpu, NULL, 1, 1);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700345
346 /*
347 * It's ok to read stale data here for notify_user and
348 * console_logged as we'll simply get the updated versions
349 * on the next mcheck_timer execution and atomic operations
350 * on console_logged act as synchronization for notify_user
351 * writes.
352 */
353 if (notify_user && console_logged) {
Tim Hockin8a336b02007-05-02 19:27:19 +0200354 static unsigned long last_print;
355 unsigned long now = jiffies;
356
357 /* if we logged an MCE, reduce the polling interval */
358 next_interval = max(next_interval/2, HZ/100);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700359 notify_user = 0;
360 clear_bit(0, &console_logged);
Tim Hockin8a336b02007-05-02 19:27:19 +0200361 if (time_after_eq(now, last_print + (check_interval*HZ))) {
362 last_print = now;
363 printk(KERN_INFO "Machine check events logged\n");
364 }
365 } else {
366 next_interval = min(next_interval*2, check_interval*HZ);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700367 }
Tim Hockin8a336b02007-05-02 19:27:19 +0200368
369 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700370}
371
372
373static __init int periodic_mcheck_init(void)
374{
Tim Hockin8a336b02007-05-02 19:27:19 +0200375 next_interval = check_interval * HZ;
376 if (next_interval)
377 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700378 return 0;
379}
380__initcall(periodic_mcheck_init);
381
382
383/*
384 * Initialize Machine Checks for a CPU.
385 */
386static void mce_init(void *dummy)
387{
388 u64 cap;
389 int i;
390
391 rdmsrl(MSR_IA32_MCG_CAP, cap);
392 banks = cap & 0xff;
393 if (banks > NR_BANKS) {
394 printk(KERN_INFO "MCE: warning: using only %d banks\n", banks);
395 banks = NR_BANKS;
396 }
Andi Kleen94ad8472005-04-16 15:25:09 -0700397 /* Use accurate RIP reporting if available. */
398 if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
399 rip_msr = MSR_IA32_MCG_EIP;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700400
401 /* Log the machine checks left over from the previous reset.
402 This also clears all registers */
Andi Kleend5172f22005-08-07 09:42:07 -0700403 do_machine_check(NULL, mce_bootlog ? -1 : -2);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700404
405 set_in_cr4(X86_CR4_MCE);
406
407 if (cap & MCG_CTL_P)
408 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
409
410 for (i = 0; i < banks; i++) {
411 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
412 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
413 }
414}
415
416/* Add per CPU specific workarounds here */
Ashok Raje6982c62005-06-25 14:54:58 -0700417static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700418{
419 /* This should be disabled by the BIOS, but isn't always */
420 if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
421 /* disable GART TBL walk error reporting, which trips off
422 incorrectly with the IOMMU & 3ware & Cerberus. */
423 clear_bit(10, &bank[4]);
Andi Kleene5835382005-11-05 17:25:54 +0100424 /* Lots of broken BIOS around that don't clear them
425 by default and leave crap in there. Don't log. */
426 mce_bootlog = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700427 }
Andi Kleene5835382005-11-05 17:25:54 +0100428
Linus Torvalds1da177e2005-04-16 15:20:36 -0700429}
430
Ashok Raje6982c62005-06-25 14:54:58 -0700431static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700432{
433 switch (c->x86_vendor) {
434 case X86_VENDOR_INTEL:
435 mce_intel_feature_init(c);
436 break;
Jacob Shin89b831e2005-11-05 17:25:53 +0100437 case X86_VENDOR_AMD:
438 mce_amd_feature_init(c);
439 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700440 default:
441 break;
442 }
443}
444
445/*
446 * Called for each booted CPU to set up machine checks.
447 * Must be called with preempt off.
448 */
Ashok Raje6982c62005-06-25 14:54:58 -0700449void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700450{
Ashok Raj7ded5682006-02-03 21:51:23 +0100451 static cpumask_t mce_cpus = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700452
453 mce_cpu_quirks(c);
454
455 if (mce_dont_init ||
456 cpu_test_and_set(smp_processor_id(), mce_cpus) ||
457 !mce_available(c))
458 return;
459
460 mce_init(NULL);
461 mce_cpu_features(c);
462}
463
464/*
465 * Character device to read and clear the MCE log.
466 */
467
/* Store the calling CPU's TSC in the slot of 'data' indexed by its CPU id. */
static void collect_tscs(void *data)
{
	unsigned long *tsc_by_cpu = data;

	rdtscll(tsc_by_cpu[smp_processor_id()]);
}
473
474static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off)
475{
Andi Kleenf0de53b2005-04-16 15:25:10 -0700476 unsigned long *cpu_tsc;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700477 static DECLARE_MUTEX(mce_read_sem);
478 unsigned next;
479 char __user *buf = ubuf;
480 int i, err;
481
Andi Kleenf0de53b2005-04-16 15:25:10 -0700482 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
483 if (!cpu_tsc)
484 return -ENOMEM;
485
Linus Torvalds1da177e2005-04-16 15:20:36 -0700486 down(&mce_read_sem);
487 next = rcu_dereference(mcelog.next);
488
489 /* Only supports full reads right now */
490 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
491 up(&mce_read_sem);
Andi Kleenf0de53b2005-04-16 15:25:10 -0700492 kfree(cpu_tsc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700493 return -EINVAL;
494 }
495
496 err = 0;
Andi Kleen673242c2005-09-12 18:49:24 +0200497 for (i = 0; i < next; i++) {
498 unsigned long start = jiffies;
499 while (!mcelog.entry[i].finished) {
500 if (!time_before(jiffies, start + 2)) {
501 memset(mcelog.entry + i,0, sizeof(struct mce));
502 continue;
503 }
504 cpu_relax();
505 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700506 smp_rmb();
507 err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
508 buf += sizeof(struct mce);
509 }
510
511 memset(mcelog.entry, 0, next * sizeof(struct mce));
512 mcelog.next = 0;
513
Paul E. McKenneyb2b18662005-06-25 14:55:38 -0700514 synchronize_sched();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700515
516 /* Collect entries that were still getting written before the synchronize. */
517
518 on_each_cpu(collect_tscs, cpu_tsc, 1, 1);
519 for (i = next; i < MCE_LOG_LEN; i++) {
520 if (mcelog.entry[i].finished &&
521 mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
522 err |= copy_to_user(buf, mcelog.entry+i, sizeof(struct mce));
523 smp_rmb();
524 buf += sizeof(struct mce);
525 memset(&mcelog.entry[i], 0, sizeof(struct mce));
526 }
527 }
528 up(&mce_read_sem);
Andi Kleenf0de53b2005-04-16 15:25:10 -0700529 kfree(cpu_tsc);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700530 return err ? -EFAULT : buf - ubuf;
531}
532
533static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg)
534{
535 int __user *p = (int __user *)arg;
536 if (!capable(CAP_SYS_ADMIN))
537 return -EPERM;
538 switch (cmd) {
539 case MCE_GET_RECORD_LEN:
540 return put_user(sizeof(struct mce), p);
541 case MCE_GET_LOG_LEN:
542 return put_user(MCE_LOG_LEN, p);
543 case MCE_GETCLEAR_FLAGS: {
544 unsigned flags;
545 do {
546 flags = mcelog.flags;
547 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
548 return put_user(flags, p);
549 }
550 default:
551 return -ENOTTY;
552 }
553}
554
Arjan van de Ven5dfe4c92007-02-12 00:55:31 -0800555static const struct file_operations mce_chrdev_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700556 .read = mce_read,
557 .ioctl = mce_ioctl,
558};
559
560static struct miscdevice mce_log_device = {
561 MISC_MCELOG_MINOR,
562 "mcelog",
563 &mce_chrdev_ops,
564};
565
566/*
567 * Old style boot options parsing. Only for compatibility.
568 */
569
570static int __init mcheck_disable(char *str)
571{
572 mce_dont_init = 1;
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800573 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700574}
575
576/* mce=off disables machine check. Note you can reenable it later
Andi Kleend5172f22005-08-07 09:42:07 -0700577 using sysfs.
Andi Kleen8c566ef2005-09-12 18:49:24 +0200578 mce=TOLERANCELEVEL (number, see above)
Andi Kleene5835382005-11-05 17:25:54 +0100579 mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
580 mce=nobootlog Don't log MCEs from before booting. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700581static int __init mcheck_enable(char *str)
582{
Andi Kleend5172f22005-08-07 09:42:07 -0700583 if (*str == '=')
584 str++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700585 if (!strcmp(str, "off"))
586 mce_dont_init = 1;
Andi Kleene5835382005-11-05 17:25:54 +0100587 else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
588 mce_bootlog = str[0] == 'b';
Andi Kleen8c566ef2005-09-12 18:49:24 +0200589 else if (isdigit(str[0]))
590 get_option(&str, &tolerant);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700591 else
592 printk("mce= argument %s ignored. Please use /sys", str);
OGAWA Hirofumi9b410462006-03-31 02:30:33 -0800593 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594}
595
596__setup("nomce", mcheck_disable);
597__setup("mce", mcheck_enable);
598
599/*
600 * Sysfs support
601 */
602
Andi Kleen413588c2005-09-12 18:49:24 +0200603/* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
604 Only one CPU is active at this time, the others get readded later using
605 CPU hotplug. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700606static int mce_resume(struct sys_device *dev)
607{
Andi Kleen413588c2005-09-12 18:49:24 +0200608 mce_init(NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609 return 0;
610}
611
612/* Reinit MCEs after user configuration changes */
613static void mce_restart(void)
614{
Tim Hockin8a336b02007-05-02 19:27:19 +0200615 if (next_interval)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700616 cancel_delayed_work(&mcheck_work);
617 /* Timer race is harmless here */
618 on_each_cpu(mce_init, NULL, 1, 1);
Tim Hockin8a336b02007-05-02 19:27:19 +0200619 next_interval = check_interval * HZ;
620 if (next_interval)
621 schedule_delayed_work(&mcheck_work, next_interval);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622}
623
624static struct sysdev_class mce_sysclass = {
625 .resume = mce_resume,
626 set_kset_name("machinecheck"),
627};
628
Jacob Shinfff2e892006-06-26 13:58:50 +0200629DEFINE_PER_CPU(struct sys_device, device_mce);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700630
631/* Why are there no generic functions for this? */
/*
 * Define a sysdev attribute 'name' backed by the variable 'var'.
 * Reading prints var as hex; writing parses a number (any base accepted
 * by simple_strtoul with base 0), stores it in var and then runs 'start'.
 * A write without any digits fails with -EINVAL.
 */
#define ACCESSOR(name, var, start)					\
	static ssize_t show_ ## name(struct sys_device *s, char *buf)	\
	{								\
		return sprintf(buf, "%lx\n", (unsigned long)var);	\
	}								\
	static ssize_t set_ ## name(struct sys_device *s,		\
				    const char *buf, size_t siz)	\
	{								\
		char *endp;						\
		unsigned long val = simple_strtoul(buf, &endp, 0);	\
									\
		if (endp == buf)					\
			return -EINVAL;					\
		var = val;						\
		start;							\
		return endp - buf;					\
	}								\
	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
645
Andi Kleena98f0dd2007-02-13 13:26:23 +0100646/* TBD should generate these dynamically based on number of available banks */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700647ACCESSOR(bank0ctl,bank[0],mce_restart())
648ACCESSOR(bank1ctl,bank[1],mce_restart())
649ACCESSOR(bank2ctl,bank[2],mce_restart())
650ACCESSOR(bank3ctl,bank[3],mce_restart())
651ACCESSOR(bank4ctl,bank[4],mce_restart())
Shaohua Li73ca5352006-01-11 22:43:06 +0100652ACCESSOR(bank5ctl,bank[5],mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100653
654static ssize_t show_trigger(struct sys_device *s, char *buf)
655{
656 strcpy(buf, trigger);
657 strcat(buf, "\n");
658 return strlen(trigger) + 1;
659}
660
661static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
662{
663 char *p;
664 int len;
665 strncpy(trigger, buf, sizeof(trigger));
666 trigger[sizeof(trigger)-1] = 0;
667 len = strlen(trigger);
668 p = strchr(trigger, '\n');
669 if (*p) *p = 0;
670 return len;
671}
672
673static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700674ACCESSOR(tolerant,tolerant,)
675ACCESSOR(check_interval,check_interval,mce_restart())
Andi Kleena98f0dd2007-02-13 13:26:23 +0100676static struct sysdev_attribute *mce_attributes[] = {
677 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
678 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
679 &attr_tolerant, &attr_check_interval, &attr_trigger,
680 NULL
681};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700682
Andi Kleen91c6d402005-07-28 21:15:39 -0700683/* Per cpu sysdev init. All of the cpus still share the same ctl bank */
684static __cpuinit int mce_create_device(unsigned int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700685{
686 int err;
Shaohua Li73ca5352006-01-11 22:43:06 +0100687 int i;
Andi Kleen91c6d402005-07-28 21:15:39 -0700688 if (!mce_available(&cpu_data[cpu]))
689 return -EIO;
690
691 per_cpu(device_mce,cpu).id = cpu;
692 per_cpu(device_mce,cpu).cls = &mce_sysclass;
693
694 err = sysdev_register(&per_cpu(device_mce,cpu));
695
696 if (!err) {
Andi Kleena98f0dd2007-02-13 13:26:23 +0100697 for (i = 0; mce_attributes[i]; i++)
Shaohua Li73ca5352006-01-11 22:43:06 +0100698 sysdev_create_file(&per_cpu(device_mce,cpu),
Andi Kleena98f0dd2007-02-13 13:26:23 +0100699 mce_attributes[i]);
Andi Kleen91c6d402005-07-28 21:15:39 -0700700 }
701 return err;
702}
703
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700704static void mce_remove_device(unsigned int cpu)
Andi Kleen91c6d402005-07-28 21:15:39 -0700705{
Shaohua Li73ca5352006-01-11 22:43:06 +0100706 int i;
707
Andi Kleena98f0dd2007-02-13 13:26:23 +0100708 for (i = 0; mce_attributes[i]; i++)
Shaohua Li73ca5352006-01-11 22:43:06 +0100709 sysdev_remove_file(&per_cpu(device_mce,cpu),
Andi Kleena98f0dd2007-02-13 13:26:23 +0100710 mce_attributes[i]);
Andi Kleen91c6d402005-07-28 21:15:39 -0700711 sysdev_unregister(&per_cpu(device_mce,cpu));
Rafael J. Wysockid4c45712006-12-07 02:14:12 +0100712 memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject));
Andi Kleen91c6d402005-07-28 21:15:39 -0700713}
Andi Kleen91c6d402005-07-28 21:15:39 -0700714
715/* Get notified when a cpu comes on/off. Be hotplug friendly. */
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700716static int
Andi Kleen91c6d402005-07-28 21:15:39 -0700717mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
718{
719 unsigned int cpu = (unsigned long)hcpu;
720
721 switch (action) {
722 case CPU_ONLINE:
723 mce_create_device(cpu);
724 break;
Andi Kleen91c6d402005-07-28 21:15:39 -0700725 case CPU_DEAD:
726 mce_remove_device(cpu);
727 break;
Andi Kleen91c6d402005-07-28 21:15:39 -0700728 }
729 return NOTIFY_OK;
730}
731
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700732static struct notifier_block mce_cpu_notifier = {
Andi Kleen91c6d402005-07-28 21:15:39 -0700733 .notifier_call = mce_cpu_callback,
734};
735
736static __init int mce_init_device(void)
737{
738 int err;
739 int i = 0;
740
Linus Torvalds1da177e2005-04-16 15:20:36 -0700741 if (!mce_available(&boot_cpu_data))
742 return -EIO;
743 err = sysdev_class_register(&mce_sysclass);
Andi Kleen91c6d402005-07-28 21:15:39 -0700744
745 for_each_online_cpu(i) {
746 mce_create_device(i);
747 }
748
Chandra Seetharamanbe6b5a32006-07-30 03:03:37 -0700749 register_hotcpu_notifier(&mce_cpu_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700750 misc_register(&mce_log_device);
751 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700752}
Andi Kleen91c6d402005-07-28 21:15:39 -0700753
Linus Torvalds1da177e2005-04-16 15:20:36 -0700754device_initcall(mce_init_device);