xref: /freebsd-13.1/sys/x86/x86/mca.c (revision 8596d2b3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009 Hudson River Trading LLC
5  * Written by: John H. Baldwin <[email protected]>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * Support for x86 machine check architecture.
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #ifdef __amd64__
38 #define	DEV_APIC
39 #else
40 #include "opt_apic.h"
41 #endif
42 
43 #include <sys/param.h>
44 #include <sys/bus.h>
45 #include <sys/interrupt.h>
46 #include <sys/kernel.h>
47 #include <sys/lock.h>
48 #include <sys/malloc.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/sysctl.h>
54 #include <sys/systm.h>
55 #include <sys/taskqueue.h>
56 #include <machine/intr_machdep.h>
57 #include <x86/apicvar.h>
58 #include <machine/cpu.h>
59 #include <machine/cputypes.h>
60 #include <x86/mca.h>
61 #include <machine/md_var.h>
62 #include <machine/specialreg.h>
63 
64 /* Modes for mca_scan() */
65 enum scan_mode {
66 	POLLED,		/* Periodic scan from the mca taskqueue. */
67 	MCE,		/* Machine check exception handler (mca_intr()). */
68 	CMCI,		/* Corrected machine check interrupt handler (cmc_intr()). */
69 };
70 
71 #ifdef DEV_APIC
72 /*
73  * State maintained for each monitored MCx bank to control the
74  * corrected machine check interrupt threshold.
75  */
76 struct cmc_state {
77 	int	max_threshold;
78 	time_t	last_intr;
79 };
80 
81 struct amd_et_state {
82 	int	cur_threshold;
83 	time_t	last_intr;
84 };
85 #endif
86 
87 struct mca_internal {
88 	struct mca_record rec;
89 	STAILQ_ENTRY(mca_internal) link;
90 };
91 
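/*
 * Accessor functions mapping a bank number to the matching MSR.  The
 * default table below uses the legacy IA32_MCi_{CTL,STATUS,ADDR,MISC}
 * layout; on AMD processors with Scalable MCA the table is switched to
 * the MSR_SMCA_MC_* accessors in _mca_init().
 */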
92 struct mca_enumerator_ops {
93         unsigned int (*ctl)(int);
94         unsigned int (*status)(int);
95         unsigned int (*addr)(int);
96         unsigned int (*misc)(int);
97 };
98 
99 static MALLOC_DEFINE(M_MCA, "MCA", "Machine Check Architecture");
100 
101 static volatile int mca_count;	/* Number of records stored. */
102 static int mca_banks;		/* Number of per-CPU register banks. */
103 static int mca_maxcount = -1;	/* Limit on records stored. (-1 = unlimited) */
104 
105 static SYSCTL_NODE(_hw, OID_AUTO, mca, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
106     "Machine Check Architecture");
107 
108 static int mca_enabled = 1;
109 SYSCTL_INT(_hw_mca, OID_AUTO, enabled, CTLFLAG_RDTUN, &mca_enabled, 0,
110     "Administrative toggle for machine check support");
111 
112 static int amd10h_L1TP = 1;
113 SYSCTL_INT(_hw_mca, OID_AUTO, amd10h_L1TP, CTLFLAG_RDTUN, &amd10h_L1TP, 0,
114     "Administrative toggle for logging of level one TLB parity (L1TP) errors");
115 
116 static int intel6h_HSD131;
117 SYSCTL_INT(_hw_mca, OID_AUTO, intel6h_HSD131, CTLFLAG_RDTUN, &intel6h_HSD131, 0,
118     "Administrative toggle for logging of spurious corrected errors");
119 
120 int workaround_erratum383;
121 SYSCTL_INT(_hw_mca, OID_AUTO, erratum383, CTLFLAG_RDTUN,
122     &workaround_erratum383, 0,
123     "Is the workaround for Erratum 383 on AMD Family 10h processors enabled?");
124 
125 static STAILQ_HEAD(, mca_internal) mca_freelist;
126 static int mca_freecount;
127 static STAILQ_HEAD(, mca_internal) mca_records;
128 static STAILQ_HEAD(, mca_internal) mca_pending;
129 static int mca_ticks = 300;
130 static struct taskqueue *mca_tq;
131 static struct task mca_resize_task;
132 static struct timeout_task mca_scan_task;
133 static struct mtx mca_lock;
134 
135 static unsigned int
136 mca_ia32_ctl_reg(int bank)
137 {
138 	return (MSR_MC_CTL(bank));
139 }
140 
141 static unsigned int
142 mca_ia32_status_reg(int bank)
143 {
144 	return (MSR_MC_STATUS(bank));
145 }
146 
147 static unsigned int
148 mca_ia32_addr_reg(int bank)
149 {
150 	return (MSR_MC_ADDR(bank));
151 }
152 
153 static unsigned int
154 mca_ia32_misc_reg(int bank)
155 {
156 	return (MSR_MC_MISC(bank));
157 }
158 
159 static unsigned int
160 mca_smca_ctl_reg(int bank)
161 {
162         return (MSR_SMCA_MC_CTL(bank));
163 }
164 
165 static unsigned int
166 mca_smca_status_reg(int bank)
167 {
168         return (MSR_SMCA_MC_STATUS(bank));
169 }
170 
171 static unsigned int
172 mca_smca_addr_reg(int bank)
173 {
174         return (MSR_SMCA_MC_ADDR(bank));
175 }
176 
177 static unsigned int
178 mca_smca_misc_reg(int bank)
179 {
180         return (MSR_SMCA_MC_MISC(bank));
181 }
182 
183 static struct mca_enumerator_ops mca_msr_ops = {
184         .ctl    = mca_ia32_ctl_reg,
185         .status = mca_ia32_status_reg,
186         .addr   = mca_ia32_addr_reg,
187         .misc   = mca_ia32_misc_reg
188 };
189 
190 #ifdef DEV_APIC
191 static struct cmc_state **cmc_state;		/* Indexed by cpuid, bank. */
192 static struct amd_et_state **amd_et_state;	/* Indexed by cpuid, bank. */
193 static int cmc_throttle = 60;	/* Time in seconds to throttle CMCI. */
194 
195 static int amd_elvt = -1;
196 
197 static inline bool
198 amd_thresholding_supported(void)
199 {
200 	if (cpu_vendor_id != CPU_VENDOR_AMD &&
201 	    cpu_vendor_id != CPU_VENDOR_HYGON)
202 		return (false);
203 	/*
204 	 * The RASCap register is wholly reserved in families 0x10-0x15 (through model 1F).
205 	 *
206 	 * It begins to be documented in family 0x15 model 30 and family 0x16,
207 	 * but neither of these families documents the ScalableMca bit, which
208 	 * supposedly defines the presence of this feature on family 0x17.
209 	 */
210 	if (CPUID_TO_FAMILY(cpu_id) >= 0x10 && CPUID_TO_FAMILY(cpu_id) <= 0x16)
211 		return (true);
212 	if (CPUID_TO_FAMILY(cpu_id) >= 0x17)
213 		return ((amd_rascap & AMDRAS_SCALABLE_MCA) != 0);
214 	return (false);
215 }
216 #endif
217 
218 static inline bool
219 cmci_supported(uint64_t mcg_cap)
220 {
221 	/*
222 	 * MCG_CAP_CMCI_P bit is reserved in AMD documentation.  Until
223 	 * it is defined, do not use it to check for CMCI support.
224 	 */
225 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
226 		return (false);
227 	return ((mcg_cap & MCG_CAP_CMCI_P) != 0);
228 }
229 
230 static inline bool
231 tes_supported(uint64_t mcg_cap)
232 {
233 
234 	/*
235 	 * MCG_CAP_TES_P bit is reserved in AMD documentation.  Until
236 	 * it is defined, do not use it to check for TES support.
237 	 */
238 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
239 		return (false);
240 	return ((mcg_cap & MCG_CAP_TES_P) != 0);
241 }
242 
243 static inline bool
244 ser_supported(uint64_t mcg_cap)
245 {
246 
247 	return (tes_supported(mcg_cap) && (mcg_cap & MCG_CAP_SER_P) != 0);
248 }
249 
250 static int
251 sysctl_positive_int(SYSCTL_HANDLER_ARGS)
252 {
253 	int error, value;
254 
255 	value = *(int *)arg1;
256 	error = sysctl_handle_int(oidp, &value, 0, req);
257 	if (error || req->newptr == NULL)
258 		return (error);
259 	if (value <= 0)
260 		return (EINVAL);
261 	*(int *)arg1 = value;
262 	return (0);
263 }
264 
265 static int
266 sysctl_mca_records(SYSCTL_HANDLER_ARGS)
267 {
268 	int *name = (int *)arg1;
269 	u_int namelen = arg2;
270 	struct mca_record record;
271 	struct mca_internal *rec;
272 	int i;
273 
274 	if (namelen != 1)
275 		return (EINVAL);
276 
277 	if (name[0] < 0 || name[0] >= mca_count)
278 		return (EINVAL);
279 
280 	mtx_lock_spin(&mca_lock);
281 	if (name[0] >= mca_count) {
282 		mtx_unlock_spin(&mca_lock);
283 		return (EINVAL);
284 	}
285 	i = 0;
286 	STAILQ_FOREACH(rec, &mca_records, link) {
287 		if (i == name[0]) {
288 			record = rec->rec;
289 			break;
290 		}
291 		i++;
292 	}
293 	mtx_unlock_spin(&mca_lock);
294 	return (SYSCTL_OUT(req, &record, sizeof(record)));
295 }
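/*
 * Userland consumers can walk the stored records through the hw.mca.count
 * and hw.mca.records sysctls defined in this file.  A minimal sketch (not
 * part of this file, and assuming struct mca_record is exported to
 * userland via <machine/mca.h>):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <machine/mca.h>
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	int mib[CTL_MAXNAME];
 *	size_t miblen, len;
 *	struct mca_record rec;
 *	int count, i;
 *
 *	len = sizeof(count);
 *	if (sysctlbyname("hw.mca.count", &count, &len, NULL, 0) != 0)
 *		count = 0;
 *	miblen = CTL_MAXNAME - 1;
 *	if (sysctlnametomib("hw.mca.records", mib, &miblen) == 0) {
 *		for (i = 0; i < count; i++) {
 *			mib[miblen] = i;
 *			len = sizeof(rec);
 *			if (sysctl(mib, miblen + 1, &rec, &len, NULL, 0) == 0)
 *				printf("bank %d status 0x%016jx\n",
 *				    rec.mr_bank, (uintmax_t)rec.mr_status);
 *		}
 *	}
 */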
296 
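/*
 * Helpers for decoding the bit fields of the compound MCA error codes
 * found in the low 16 bits of MCi_STATUS; they are used by mca_log()
 * below to build human-readable messages.
 */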
297 static const char *
298 mca_error_ttype(uint16_t mca_error)
299 {
300 
301 	switch ((mca_error & 0x000c) >> 2) {
302 	case 0:
303 		return ("I");
304 	case 1:
305 		return ("D");
306 	case 2:
307 		return ("G");
308 	}
309 	return ("?");
310 }
311 
312 static const char *
313 mca_error_level(uint16_t mca_error)
314 {
315 
316 	switch (mca_error & 0x0003) {
317 	case 0:
318 		return ("L0");
319 	case 1:
320 		return ("L1");
321 	case 2:
322 		return ("L2");
323 	case 3:
324 		return ("LG");
325 	}
326 	return ("L?");
327 }
328 
329 static const char *
330 mca_error_request(uint16_t mca_error)
331 {
332 
333 	switch ((mca_error & 0x00f0) >> 4) {
334 	case 0x0:
335 		return ("ERR");
336 	case 0x1:
337 		return ("RD");
338 	case 0x2:
339 		return ("WR");
340 	case 0x3:
341 		return ("DRD");
342 	case 0x4:
343 		return ("DWR");
344 	case 0x5:
345 		return ("IRD");
346 	case 0x6:
347 		return ("PREFETCH");
348 	case 0x7:
349 		return ("EVICT");
350 	case 0x8:
351 		return ("SNOOP");
352 	}
353 	return ("???");
354 }
355 
356 static const char *
357 mca_error_mmtype(uint16_t mca_error)
358 {
359 
360 	switch ((mca_error & 0x70) >> 4) {
361 	case 0x0:
362 		return ("GEN");
363 	case 0x1:
364 		return ("RD");
365 	case 0x2:
366 		return ("WR");
367 	case 0x3:
368 		return ("AC");
369 	case 0x4:
370 		return ("MS");
371 	}
372 	return ("???");
373 }
374 
375 static const char *
376 mca_addres_mode(uint64_t mca_misc)
377 {
378 
379 	switch ((mca_misc & MC_MISC_ADDRESS_MODE) >> 6) {
380 	case 0x0:
381 		return ("Segment Offset");
382 	case 0x1:
383 		return ("Linear Address");
384 	case 0x2:
385 		return ("Physical Address");
386 	case 0x3:
387 		return ("Memory Address");
388 	case 0x7:
389 		return ("Generic");
390 	}
391 	return ("???");
392 }
393 
394 static int
395 mca_mute(const struct mca_record *rec)
396 {
397 
398 	/*
399 	 * Skip spurious corrected parity errors generated by Intel Haswell-
400 	 * and Broadwell-based CPUs (see HSD131, HSM142, HSW131 and BDM48
401 	 * erratum respectively), unless reporting is enabled.
402 	 * errata, respectively), unless reporting is enabled.
403 	 * of Haswell, while at least initially the CPU specification updates
404 	 * suggested only the C0-stepping to be affected.  Similarly, Celeron
405 	 * suggested that only the C0-stepping was affected.  Similarly, the
406 	 * Celeron 2955U with a CPU ID of 0x45 is apparently also affected by
407 	 * the same problem, although HSM142 only refers to 0x3c and 0x46.
408 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
409 	    CPUID_TO_FAMILY(cpu_id) == 0x6 &&
410 	    (CPUID_TO_MODEL(cpu_id) == 0x3c ||	/* HSD131, HSM142, HSW131 */
411 	    CPUID_TO_MODEL(cpu_id) == 0x3d ||	/* BDM48 */
412 	    CPUID_TO_MODEL(cpu_id) == 0x45 ||
413 	    CPUID_TO_MODEL(cpu_id) == 0x46) &&	/* HSM142 */
414 	    rec->mr_bank == 0 &&
415 	    (rec->mr_status & 0xa0000000ffffffff) == 0x80000000000f0005 &&
416 	    !intel6h_HSD131)
417 	    	return (1);
418 
419 	return (0);
420 }
421 
422 /* Dump details about a single machine check. */
423 static void
424 mca_log(const struct mca_record *rec)
425 {
426 	uint16_t mca_error;
427 
428 	if (mca_mute(rec))
429 	    	return;
430 
431 	printf("MCA: Bank %d, Status 0x%016llx\n", rec->mr_bank,
432 	    (long long)rec->mr_status);
433 	printf("MCA: Global Cap 0x%016llx, Status 0x%016llx\n",
434 	    (long long)rec->mr_mcg_cap, (long long)rec->mr_mcg_status);
435 	printf("MCA: Vendor \"%s\", ID 0x%x, APIC ID %d\n", cpu_vendor,
436 	    rec->mr_cpu_id, rec->mr_apic_id);
437 	printf("MCA: CPU %d ", rec->mr_cpu);
438 	if (rec->mr_status & MC_STATUS_UC)
439 		printf("UNCOR ");
440 	else {
441 		printf("COR ");
442 		if (cmci_supported(rec->mr_mcg_cap))
443 			printf("(%lld) ", ((long long)rec->mr_status &
444 			    MC_STATUS_COR_COUNT) >> 38);
445 		if (tes_supported(rec->mr_mcg_cap)) {
446 			switch ((rec->mr_status & MC_STATUS_TES_STATUS) >> 53) {
447 			case 0x1:
448 				printf("(Green) ");
				break;
449 			case 0x2:
450 				printf("(Yellow) ");
				break;
451 			}
452 		}
453 	}
454 	if (rec->mr_status & MC_STATUS_EN)
455 		printf("EN ");
456 	if (rec->mr_status & MC_STATUS_PCC)
457 		printf("PCC ");
458 	if (ser_supported(rec->mr_mcg_cap)) {
459 		if (rec->mr_status & MC_STATUS_S)
460 			printf("S ");
461 		if (rec->mr_status & MC_STATUS_AR)
462 			printf("AR ");
463 	}
464 	if (rec->mr_status & MC_STATUS_OVER)
465 		printf("OVER ");
466 	mca_error = rec->mr_status & MC_STATUS_MCA_ERROR;
467 	switch (mca_error) {
468 		/* Simple error codes. */
469 	case 0x0000:
470 		printf("no error");
471 		break;
472 	case 0x0001:
473 		printf("unclassified error");
474 		break;
475 	case 0x0002:
476 		printf("ucode ROM parity error");
477 		break;
478 	case 0x0003:
479 		printf("external error");
480 		break;
481 	case 0x0004:
482 		printf("FRC error");
483 		break;
484 	case 0x0005:
485 		printf("internal parity error");
486 		break;
487 	case 0x0006:
488 		printf("SMM handler code access violation");
489 		break;
490 	case 0x0400:
491 		printf("internal timer error");
492 		break;
493 	case 0x0e0b:
494 		printf("generic I/O error");
495 		if (rec->mr_cpu_vendor_id == CPU_VENDOR_INTEL &&
496 		    (rec->mr_status & MC_STATUS_MISCV)) {
497 			printf(" (pci%d:%d:%d:%d)",
498 			    (int)((rec->mr_misc & MC_MISC_PCIE_SEG) >> 32),
499 			    (int)((rec->mr_misc & MC_MISC_PCIE_BUS) >> 24),
500 			    (int)((rec->mr_misc & MC_MISC_PCIE_SLOT) >> 19),
501 			    (int)((rec->mr_misc & MC_MISC_PCIE_FUNC) >> 16));
502 		}
503 		break;
504 	default:
505 		if ((mca_error & 0xfc00) == 0x0400) {
506 			printf("internal error %x", mca_error & 0x03ff);
507 			break;
508 		}
509 
510 		/* Compound error codes. */
511 
512 		/* Memory hierarchy error. */
513 		if ((mca_error & 0xeffc) == 0x000c) {
514 			printf("%s memory error", mca_error_level(mca_error));
515 			break;
516 		}
517 
518 		/* TLB error. */
519 		if ((mca_error & 0xeff0) == 0x0010) {
520 			printf("%sTLB %s error", mca_error_ttype(mca_error),
521 			    mca_error_level(mca_error));
522 			break;
523 		}
524 
525 		/* Memory controller error. */
526 		if ((mca_error & 0xef80) == 0x0080) {
527 			printf("%s channel ", mca_error_mmtype(mca_error));
528 			if ((mca_error & 0x000f) != 0x000f)
529 				printf("%d", mca_error & 0x000f);
530 			else
531 				printf("??");
532 			printf(" memory error");
533 			break;
534 		}
535 
536 		/* Cache error. */
537 		if ((mca_error & 0xef00) == 0x0100) {
538 			printf("%sCACHE %s %s error",
539 			    mca_error_ttype(mca_error),
540 			    mca_error_level(mca_error),
541 			    mca_error_request(mca_error));
542 			break;
543 		}
544 
545 		/* Extended memory error. */
546 		if ((mca_error & 0xef80) == 0x0280) {
547 			printf("%s channel ", mca_error_mmtype(mca_error));
548 			if ((mca_error & 0x000f) != 0x000f)
549 				printf("%d", mca_error & 0x000f);
550 			else
551 				printf("??");
552 			printf(" extended memory error");
553 			break;
554 		}
555 
556 		/* Bus and/or Interconnect error. */
557 		if ((mca_error & 0xe800) == 0x0800) {
558 			printf("BUS%s ", mca_error_level(mca_error));
559 			switch ((mca_error & 0x0600) >> 9) {
560 			case 0:
561 				printf("Source");
562 				break;
563 			case 1:
564 				printf("Responder");
565 				break;
566 			case 2:
567 				printf("Observer");
568 				break;
569 			default:
570 				printf("???");
571 				break;
572 			}
573 			printf(" %s ", mca_error_request(mca_error));
574 			switch ((mca_error & 0x000c) >> 2) {
575 			case 0:
576 				printf("Memory");
577 				break;
578 			case 2:
579 				printf("I/O");
580 				break;
581 			case 3:
582 				printf("Other");
583 				break;
584 			default:
585 				printf("???");
586 				break;
587 			}
588 			if (mca_error & 0x0100)
589 				printf(" timed out");
590 			break;
591 		}
592 
593 		printf("unknown error %x", mca_error);
594 		break;
595 	}
596 	printf("\n");
597 	if (rec->mr_status & MC_STATUS_ADDRV) {
598 		printf("MCA: Address 0x%llx", (long long)rec->mr_addr);
599 		if (ser_supported(rec->mr_mcg_cap) &&
600 		    (rec->mr_status & MC_STATUS_MISCV)) {
601 			printf(" (Mode: %s, LSB: %d)",
602 			    mca_addres_mode(rec->mr_misc),
603 			    (int)(rec->mr_misc & MC_MISC_RA_LSB));
604 		}
605 		printf("\n");
606 	}
607 	if (rec->mr_status & MC_STATUS_MISCV)
608 		printf("MCA: Misc 0x%llx\n", (long long)rec->mr_misc);
609 }
610 
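/*
 * Determine whether a bank status value describes a machine check
 * exception (an enabled, uncorrected error).  When it does, *recoverablep
 * is cleared unless the error is marked software-recoverable (S set,
 * AR clear).
 */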
611 static bool
612 mca_is_mce(uint64_t mcg_cap, uint64_t status, bool *recoverablep)
613 {
614 
615 	/* Corrected error. */
616 	if ((status & MC_STATUS_UC) == 0)
617 		return (0);
618 
619 	/* Spurious MCA error. */
620 	if ((status & MC_STATUS_EN) == 0)
621 		return (0);
622 
623 	/* The processor does not support software error recovery. */
624 	if (!ser_supported(mcg_cap)) {
625 		*recoverablep = false;
626 		return (1);
627 	}
628 
629 	/* Context might have been corrupted. */
630 	if (status & MC_STATUS_PCC) {
631 		*recoverablep = false;
632 		return (1);
633 	}
634 
635 	/* Uncorrected software recoverable. */
636 	if (status & MC_STATUS_S) {
637 		/* Action required vs optional. */
638 		if (status & MC_STATUS_AR)
639 			*recoverablep = false;
640 		return (1);
641 	}
642 
643 	/* Uncorrected no action required. */
644 	return (0);
645 }
646 
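/*
 * Read the status of a single bank.  If it holds a valid error that
 * matches the scan mode, capture the details in *rec, clear the bank
 * (except for non-recoverable errors) and return 1.
 */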
647 static int
648 mca_check_status(enum scan_mode mode, uint64_t mcg_cap, int bank,
649     struct mca_record *rec, bool *recoverablep)
650 {
651 	uint64_t status;
652 	u_int p[4];
653 	bool mce, recover;
654 
655 	status = rdmsr(mca_msr_ops.status(bank));
656 	if (!(status & MC_STATUS_VAL))
657 		return (0);
658 
659 	recover = *recoverablep;
660 	mce = mca_is_mce(mcg_cap, status, &recover);
661 	if (mce != (mode == MCE))
662 		return (0);
663 	*recoverablep = recover;
664 
665 	/* Save exception information. */
666 	rec->mr_status = status;
667 	rec->mr_bank = bank;
668 	rec->mr_addr = 0;
669 	if (status & MC_STATUS_ADDRV)
670 		rec->mr_addr = rdmsr(mca_msr_ops.addr(bank));
671 	rec->mr_misc = 0;
672 	if (status & MC_STATUS_MISCV)
673 		rec->mr_misc = rdmsr(mca_msr_ops.misc(bank));
674 	rec->mr_tsc = rdtsc();
675 	rec->mr_apic_id = PCPU_GET(apic_id);
676 	rec->mr_mcg_cap = rdmsr(MSR_MCG_CAP);
677 	rec->mr_mcg_status = rdmsr(MSR_MCG_STATUS);
678 	rec->mr_cpu_id = cpu_id;
679 	rec->mr_cpu_vendor_id = cpu_vendor_id;
680 	rec->mr_cpu = PCPU_GET(cpuid);
681 
682 	/*
683 	 * Clear machine check.  Don't do this for uncorrectable
684 	 * errors so that the BIOS can see them.
685 	 */
686 	if (!mce || recover) {
687 		wrmsr(mca_msr_ops.status(bank), 0);
688 		do_cpuid(0, p);
689 	}
690 	return (1);
691 }
692 
693 static void
694 mca_resize_freelist(void)
695 {
696 	struct mca_internal *next, *rec;
697 	STAILQ_HEAD(, mca_internal) tmplist;
698 	int count, i, desired_max, desired_min;
699 
700 	/*
701 	 * Ensure we have at least one record for each bank and one
702 	 * record per CPU, but no more than twice that amount.
703 	 */
704 	desired_min = imax(mp_ncpus, mca_banks);
705 	desired_max = imax(mp_ncpus, mca_banks) * 2;
706 	STAILQ_INIT(&tmplist);
707 	mtx_lock_spin(&mca_lock);
708 	while (mca_freecount > desired_max) {
709 		rec = STAILQ_FIRST(&mca_freelist);
710 		KASSERT(rec != NULL, ("mca_freecount is %d, but list is empty",
711 		    mca_freecount));
712 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
713 		mca_freecount--;
714 		STAILQ_INSERT_TAIL(&tmplist, rec, link);
715 	}
716 	while (mca_freecount < desired_min) {
717 		count = desired_min - mca_freecount;
718 		mtx_unlock_spin(&mca_lock);
719 		for (i = 0; i < count; i++) {
720 			rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
721 			STAILQ_INSERT_TAIL(&tmplist, rec, link);
722 		}
723 		mtx_lock_spin(&mca_lock);
724 		STAILQ_CONCAT(&mca_freelist, &tmplist);
725 		mca_freecount += count;
726 	}
727 	mtx_unlock_spin(&mca_lock);
728 	STAILQ_FOREACH_SAFE(rec, &tmplist, link, next)
729 		free(rec, M_MCA);
730 }
731 
732 static void
733 mca_resize(void *context, int pending)
734 {
735 
736 	mca_resize_freelist();
737 }
738 
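/*
 * Queue a freshly captured record on mca_pending.  The periodic poller
 * may sleep, so it allocates a new entry; the exception and CMCI paths
 * must not sleep and take a preallocated entry from mca_freelist instead.
 */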
739 static void
740 mca_record_entry(enum scan_mode mode, const struct mca_record *record)
741 {
742 	struct mca_internal *rec;
743 
744 	if (mode == POLLED) {
745 		rec = malloc(sizeof(*rec), M_MCA, M_WAITOK);
746 		mtx_lock_spin(&mca_lock);
747 	} else {
748 		mtx_lock_spin(&mca_lock);
749 		rec = STAILQ_FIRST(&mca_freelist);
750 		if (rec == NULL) {
751 			printf("MCA: Unable to allocate space for an event.\n");
752 			mca_log(record);
753 			mtx_unlock_spin(&mca_lock);
754 			return;
755 		}
756 		STAILQ_REMOVE_HEAD(&mca_freelist, link);
757 		mca_freecount--;
758 	}
759 
760 	rec->rec = *record;
761 	STAILQ_INSERT_TAIL(&mca_pending, rec, link);
762 	mtx_unlock_spin(&mca_lock);
763 }
764 
765 #ifdef DEV_APIC
766 /*
767  * Update the interrupt threshold for a CMCI.  The strategy is to use
768  * a low trigger that interrupts as soon as the first event occurs.
769  * However, if a steady stream of events arrives, the threshold is
770  * increased until the interrupts are throttled to once every
771  * cmc_throttle seconds or the periodic scan.  If a periodic scan
772  * finds that the threshold is too high, it is lowered.
773  */
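/*
 * For example, with the default cmc_throttle of 60 seconds: a CMCI whose
 * corrected-error count has reached the current threshold within 60
 * seconds of the previous interrupt doubles the threshold (up to the
 * bank's maximum), while a periodic scan that finds 30 events 120 seconds
 * after the last interrupt lowers the threshold to 30 * 60 / 120 = 15.
 */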
774 static int
775 update_threshold(enum scan_mode mode, int valid, int last_intr, int count,
776     int cur_threshold, int max_threshold)
777 {
778 	u_int delta;
779 	int limit;
780 
781 	delta = (u_int)(time_uptime - last_intr);
782 	limit = cur_threshold;
783 
784 	/*
785 	 * If an interrupt was received less than cmc_throttle seconds
786 	 * since the previous interrupt and the count from the current
787 	 * event is greater than or equal to the current threshold,
788 	 * double the threshold up to the max.
789 	 */
790 	if (mode == CMCI && valid) {
791 		if (delta < cmc_throttle && count >= limit &&
792 		    limit < max_threshold) {
793 			limit = min(limit << 1, max_threshold);
794 		}
795 		return (limit);
796 	}
797 
798 	/*
799 	 * When the banks are polled, check to see if the threshold
800 	 * should be lowered.
801 	 */
802 	if (mode != POLLED)
803 		return (limit);
804 
805 	/* If a CMCI occurred recently, do nothing for now. */
806 	if (delta < cmc_throttle)
807 		return (limit);
808 
809 	/*
810 	 * Compute a new limit based on the average rate of events per
811 	 * cmc_throttle seconds since the last interrupt.
812 	 */
813 	if (valid) {
814 		limit = count * cmc_throttle / delta;
815 		if (limit <= 0)
816 			limit = 1;
817 		else if (limit > max_threshold)
818 			limit = max_threshold;
819 	} else {
820 		limit = 1;
821 	}
822 	return (limit);
823 }
824 
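/*
 * Recompute and reprogram the CMCI threshold in MC_CTL2 for a bank this
 * CPU owns, based on how recently the last interrupt fired and how many
 * corrected errors the current record reports.
 */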
825 static void
826 cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
827 {
828 	struct cmc_state *cc;
829 	uint64_t ctl;
830 	int cur_threshold, new_threshold;
831 	int count;
832 
833 	/* Fetch the current limit for this bank. */
834 	cc = &cmc_state[PCPU_GET(cpuid)][bank];
835 	ctl = rdmsr(MSR_MC_CTL2(bank));
836 	count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
837 	cur_threshold = ctl & MC_CTL2_THRESHOLD;
838 
839 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
840 	    cur_threshold, cc->max_threshold);
841 
842 	if (mode == CMCI && valid)
843 		cc->last_intr = time_uptime;
844 	if (new_threshold != cur_threshold) {
845 		ctl &= ~MC_CTL2_THRESHOLD;
846 		ctl |= new_threshold;
847 		wrmsr(MSR_MC_CTL2(bank), ctl);
848 	}
849 }
850 
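/*
 * AMD error-thresholding counterpart of cmci_update(): recompute the
 * threshold, rearm the counter in MCi_MISC and clear its overflow bit.
 */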
851 static void
852 amd_thresholding_update(enum scan_mode mode, int bank, int valid)
853 {
854 	struct amd_et_state *cc;
855 	uint64_t misc;
856 	int new_threshold;
857 	int count;
858 
859 	cc = &amd_et_state[PCPU_GET(cpuid)][bank];
860 	misc = rdmsr(mca_msr_ops.misc(bank));
861 	count = (misc & MC_MISC_AMD_CNT_MASK) >> MC_MISC_AMD_CNT_SHIFT;
862 	count = count - (MC_MISC_AMD_CNT_MAX - cc->cur_threshold);
863 
864 	new_threshold = update_threshold(mode, valid, cc->last_intr, count,
865 	    cc->cur_threshold, MC_MISC_AMD_CNT_MAX);
866 
867 	cc->cur_threshold = new_threshold;
868 	misc &= ~MC_MISC_AMD_CNT_MASK;
869 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
870 	    << MC_MISC_AMD_CNT_SHIFT;
871 	misc &= ~MC_MISC_AMD_OVERFLOW;
872 	wrmsr(mca_msr_ops.misc(bank), misc);
873 	if (mode == CMCI && valid)
874 		cc->last_intr = time_uptime;
875 }
876 #endif
877 
878 /*
879  * This scans all the machine check banks of the current CPU to see if
880  * there are any machine checks.  Any non-recoverable errors are
881  * reported immediately via mca_log().  The current thread must be
882  * pinned when this is called.  The 'mode' parameter indicates if we
883  * are being called from the MC exception handler, the CMCI handler,
884  * or the periodic poller.
885  */
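/*
 * Returns the number of banks with valid records.  Recoverable events are
 * queued on mca_pending for later logging; non-recoverable ones are logged
 * to the console immediately.
 */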
886 static int
887 mca_scan(enum scan_mode mode, bool *recoverablep)
888 {
889 	struct mca_record rec;
890 	uint64_t mcg_cap;
891 	int count = 0, i, valid;
892 
893 	mcg_cap = rdmsr(MSR_MCG_CAP);
894 	for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
895 #ifdef DEV_APIC
896 		/*
897 		 * For a CMCI, only check banks this CPU is
898 		 * responsible for.
899 		 */
900 		if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
901 			continue;
902 #endif
903 
904 		valid = mca_check_status(mode, mcg_cap, i, &rec, recoverablep);
905 		if (valid) {
906 			count++;
907 			if (*recoverablep)
908 				mca_record_entry(mode, &rec);
909 			else
910 				mca_log(&rec);
911 		}
912 
913 #ifdef DEV_APIC
914 		/*
915 		 * If this is a bank this CPU monitors via CMCI,
916 		 * update the threshold.
917 		 */
918 		if (PCPU_GET(cmci_mask) & 1 << i) {
919 			if (cmc_state != NULL)
920 				cmci_update(mode, i, valid, &rec);
921 			else
922 				amd_thresholding_update(mode, i, valid);
923 		}
924 #endif
925 	}
926 	return (count);
927 }
928 
929 /*
930  * Store a new record on the mca_records list while enforcing
931  * mca_maxcount.
932  */
933 static void
934 mca_store_record(struct mca_internal *mca)
935 {
936 
937 	/*
938 	 * If we are storing no records (mca_maxcount == 0),
939 	 * we just free this record.
940 	 *
941 	 * If we are storing records (mca_maxcount != 0) and
942 	 * we have free space on the list, store the record
943 	 * and increment mca_count.
944 	 *
945 	 * If we are storing records and we do not have free
946 	 * space on the list, store the new record at the
947 	 * tail and free the oldest one from the head.
948 	 */
949 	if (mca_maxcount != 0)
950 		STAILQ_INSERT_TAIL(&mca_records, mca, link);
951 	if (mca_maxcount < 0 || mca_count < mca_maxcount)
952 		mca_count++;
953 	else {
954 		if (mca_maxcount != 0) {
955 			mca = STAILQ_FIRST(&mca_records);
956 			STAILQ_REMOVE_HEAD(&mca_records, link);
957 		}
958 		STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
959 		mca_freecount++;
960 	}
961 }
962 
963 /*
964  * Do the work to process machine check records which have just been
965  * gathered. Print any pending logs to the console. Queue them for storage.
966  * Trigger a resizing of the free list.
967  */
968 static void
969 mca_process_records(enum scan_mode mode)
970 {
971 	struct mca_internal *mca;
972 
973 	mtx_lock_spin(&mca_lock);
974 	while ((mca = STAILQ_FIRST(&mca_pending)) != NULL) {
975 		STAILQ_REMOVE_HEAD(&mca_pending, link);
976 		mca_log(&mca->rec);
977 		mca_store_record(mca);
978 	}
979 	mtx_unlock_spin(&mca_lock);
980 	if (mode == POLLED)
981 		mca_resize_freelist();
982 	else if (!cold)
983 		taskqueue_enqueue(mca_tq, &mca_resize_task);
984 }
985 
986 /*
987  * Scan the machine check banks on all CPUs by binding to each CPU in
988  * turn.  If any of the CPUs contained new machine check records, log
989  * them to the console.
990  */
991 static void
992 mca_scan_cpus(void *context, int pending)
993 {
994 	struct thread *td;
995 	int cpu;
996 	bool recoverable = true;
997 
998 	mca_resize_freelist();
999 	td = curthread;
1000 	thread_lock(td);
1001 	CPU_FOREACH(cpu) {
1002 		sched_bind(td, cpu);
1003 		thread_unlock(td);
1004 		mca_scan(POLLED, &recoverable);
1005 		thread_lock(td);
1006 		sched_unbind(td);
1007 	}
1008 	thread_unlock(td);
1009 	if (!STAILQ_EMPTY(&mca_pending))
1010 		mca_process_records(POLLED);
1011 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1012 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1013 }
1014 
1015 static int
1016 sysctl_mca_scan(SYSCTL_HANDLER_ARGS)
1017 {
1018 	int error, i;
1019 
1020 	i = 0;
1021 	error = sysctl_handle_int(oidp, &i, 0, req);
1022 	if (error)
1023 		return (error);
1024 	if (i)
1025 		taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1026 		    0, 0, 0);
1027 	return (0);
1028 }
1029 
1030 static int
1031 sysctl_mca_maxcount(SYSCTL_HANDLER_ARGS)
1032 {
1033 	struct mca_internal *mca;
1034 	int error, i;
1035 	bool doresize;
1036 
1037 	i = mca_maxcount;
1038 	error = sysctl_handle_int(oidp, &i, 0, req);
1039 	if (error || req->newptr == NULL)
1040 		return (error);
1041 	mtx_lock_spin(&mca_lock);
1042 	mca_maxcount = i;
1043 	doresize = false;
1044 	if (mca_maxcount >= 0)
1045 		while (mca_count > mca_maxcount) {
1046 			mca = STAILQ_FIRST(&mca_records);
1047 			STAILQ_REMOVE_HEAD(&mca_records, link);
1048 			mca_count--;
1049 			STAILQ_INSERT_TAIL(&mca_freelist, mca, link);
1050 			mca_freecount++;
1051 			doresize = true;
1052 		}
1053 	mtx_unlock_spin(&mca_lock);
1054 	if (doresize && !cold)
1055 		taskqueue_enqueue(mca_tq, &mca_resize_task);
1056 	return (error);
1057 }
1058 
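/*
 * Once the scheduler is running, start the taskqueue thread and schedule
 * the first periodic scan.
 */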
1059 static void
1060 mca_startup(void *dummy)
1061 {
1062 
1063 	if (mca_banks <= 0)
1064 		return;
1065 
1066 	/* CMCIs during boot may have claimed items from the freelist. */
1067 	mca_resize_freelist();
1068 
1069 	taskqueue_start_threads(&mca_tq, 1, PI_SWI(SWI_TQ), "mca taskq");
1070 	taskqueue_enqueue_timeout_sbt(mca_tq, &mca_scan_task,
1071 	    mca_ticks * SBT_1S, 0, C_PREL(1));
1072 }
1073 #ifdef EARLY_AP_STARTUP
1074 SYSINIT(mca_startup, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, mca_startup, NULL);
1075 #else
1076 SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
1077 #endif
1078 
1079 #ifdef DEV_APIC
1080 static void
1081 cmci_setup(void)
1082 {
1083 	int i;
1084 
1085 	cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *), M_MCA,
1086 	    M_WAITOK);
1087 	for (i = 0; i <= mp_maxid; i++)
1088 		cmc_state[i] = malloc(sizeof(struct cmc_state) * mca_banks,
1089 		    M_MCA, M_WAITOK | M_ZERO);
1090 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1091 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1092 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1093 	    "Interval in seconds to throttle corrected MC interrupts");
1094 }
1095 
1096 static void
1097 amd_thresholding_setup(void)
1098 {
1099 	u_int i;
1100 
1101 	amd_et_state = malloc((mp_maxid + 1) * sizeof(struct amd_et_state *),
1102 	    M_MCA, M_WAITOK);
1103 	for (i = 0; i <= mp_maxid; i++)
1104 		amd_et_state[i] = malloc(sizeof(struct amd_et_state) *
1105 		    mca_banks, M_MCA, M_WAITOK | M_ZERO);
1106 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1107 	    "cmc_throttle", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1108 	    &cmc_throttle, 0, sysctl_positive_int, "I",
1109 	    "Interval in seconds to throttle corrected MC interrupts");
1110 }
1111 #endif
1112 
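/*
 * One-time initialization performed on the BSP: size the record freelist,
 * create the scan taskqueue, register the hw.mca sysctl handlers and set
 * up CMCI or AMD thresholding state when available.
 */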
1113 static void
1114 mca_setup(uint64_t mcg_cap)
1115 {
1116 
1117 	/*
1118 	 * On AMD Family 10h processors, unless logging of level one TLB
1119 	 * parity (L1TP) errors is disabled, enable the recommended workaround
1120 	 * for Erratum 383.
1121 	 */
1122 	if (cpu_vendor_id == CPU_VENDOR_AMD &&
1123 	    CPUID_TO_FAMILY(cpu_id) == 0x10 && amd10h_L1TP)
1124 		workaround_erratum383 = 1;
1125 
1126 	mca_banks = mcg_cap & MCG_CAP_COUNT;
1127 	mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
1128 	STAILQ_INIT(&mca_records);
1129 	STAILQ_INIT(&mca_pending);
1130 	mca_tq = taskqueue_create_fast("mca", M_WAITOK,
1131 	    taskqueue_thread_enqueue, &mca_tq);
1132 	TIMEOUT_TASK_INIT(mca_tq, &mca_scan_task, 0, mca_scan_cpus, NULL);
1133 	STAILQ_INIT(&mca_freelist);
1134 	TASK_INIT(&mca_resize_task, 0, mca_resize, NULL);
1135 	mca_resize_freelist();
1136 	SYSCTL_ADD_INT(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1137 	    "count", CTLFLAG_RD, (int *)(uintptr_t)&mca_count, 0,
1138 	    "Record count");
1139 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1140 	    "maxcount", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1141 	    &mca_maxcount, 0, sysctl_mca_maxcount, "I",
1142 	    "Maximum record count (-1 is unlimited)");
1143 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1144 	    "interval", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
1145 	    &mca_ticks, 0, sysctl_positive_int, "I",
1146 	    "Periodic interval in seconds to scan for machine checks");
1147 	SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1148 	    "records", CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mca_records,
1149 	    "Machine check records");
1150 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
1151 	    "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1152 	    sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
1153 #ifdef DEV_APIC
1154 	if (cmci_supported(mcg_cap))
1155 		cmci_setup();
1156 	else if (amd_thresholding_supported())
1157 		amd_thresholding_setup();
1158 #endif
1159 }
1160 
1161 #ifdef DEV_APIC
1162 /*
1163  * See if we should monitor CMCI for this bank.  If CMCI_EN is already
1164  * set in MC_CTL2, then another CPU is responsible for this bank, so
1165  * ignore it.  If CMCI_EN returns zero after being set, then this bank
1166  * does not support CMCI_EN.  If this CPU sets CMCI_EN, then it should
1167  * now monitor this bank.
1168  */
1169 static void
1170 cmci_monitor(int i)
1171 {
1172 	struct cmc_state *cc;
1173 	uint64_t ctl;
1174 
1175 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1176 
1177 	/*
1178 	 * It is possible for some APs to report CMCI support even if the BSP
1179 	 * does not, apparently due to a BIOS bug.
1180 	 */
1181 	if (cmc_state == NULL) {
1182 		if (bootverbose) {
1183 			printf(
1184 		    "AP %d (%d,%d) reports CMCI support but the BSP does not\n",
1185 			    PCPU_GET(cpuid), PCPU_GET(apic_id),
1186 			    PCPU_GET(acpi_id));
1187 		}
1188 		return;
1189 	}
1190 
1191 	ctl = rdmsr(MSR_MC_CTL2(i));
1192 	if (ctl & MC_CTL2_CMCI_EN)
1193 		/* Already monitored by another CPU. */
1194 		return;
1195 
1196 	/* Set the threshold to one event for now. */
1197 	ctl &= ~MC_CTL2_THRESHOLD;
1198 	ctl |= MC_CTL2_CMCI_EN | 1;
1199 	wrmsr(MSR_MC_CTL2(i), ctl);
1200 	ctl = rdmsr(MSR_MC_CTL2(i));
1201 	if (!(ctl & MC_CTL2_CMCI_EN))
1202 		/* This bank does not support CMCI. */
1203 		return;
1204 
1205 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1206 
1207 	/* Determine maximum threshold. */
1208 	ctl &= ~MC_CTL2_THRESHOLD;
1209 	ctl |= 0x7fff;
1210 	wrmsr(MSR_MC_CTL2(i), ctl);
1211 	ctl = rdmsr(MSR_MC_CTL2(i));
1212 	cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
1213 
1214 	/* Start off with a threshold of 1. */
1215 	ctl &= ~MC_CTL2_THRESHOLD;
1216 	ctl |= 1;
1217 	wrmsr(MSR_MC_CTL2(i), ctl);
1218 
1219 	/* Mark this bank as monitored. */
1220 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1221 }
1222 
1223 /*
1224  * For resume, reset the threshold for any banks we monitor back to
1225  * one and throw away the timestamp of the last interrupt.
1226  */
1227 static void
1228 cmci_resume(int i)
1229 {
1230 	struct cmc_state *cc;
1231 	uint64_t ctl;
1232 
1233 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1234 
1235 	/* See cmci_monitor(). */
1236 	if (cmc_state == NULL)
1237 		return;
1238 
1239 	/* Ignore banks not monitored by this CPU. */
1240 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1241 		return;
1242 
1243 	cc = &cmc_state[PCPU_GET(cpuid)][i];
1244 	cc->last_intr = 0;
1245 	ctl = rdmsr(MSR_MC_CTL2(i));
1246 	ctl &= ~MC_CTL2_THRESHOLD;
1247 	ctl |= MC_CTL2_CMCI_EN | 1;
1248 	wrmsr(MSR_MC_CTL2(i), ctl);
1249 }
1250 
1251 /*
1252  * Apply an AMD ET configuration to the corresponding MSR.
1253  */
1254 static void
1255 amd_thresholding_start(struct amd_et_state *cc, int bank)
1256 {
1257 	uint64_t misc;
1258 
1259 	KASSERT(amd_elvt >= 0, ("ELVT offset is not set"));
1260 
1261 	misc = rdmsr(mca_msr_ops.misc(bank));
1262 
1263 	misc &= ~MC_MISC_AMD_INT_MASK;
1264 	misc |= MC_MISC_AMD_INT_LVT;
1265 
1266 	misc &= ~MC_MISC_AMD_LVT_MASK;
1267 	misc |= (uint64_t)amd_elvt << MC_MISC_AMD_LVT_SHIFT;
1268 
1269 	misc &= ~MC_MISC_AMD_CNT_MASK;
1270 	misc |= (uint64_t)(MC_MISC_AMD_CNT_MAX - cc->cur_threshold)
1271 	    << MC_MISC_AMD_CNT_SHIFT;
1272 
1273 	misc &= ~MC_MISC_AMD_OVERFLOW;
1274 	misc |= MC_MISC_AMD_CNTEN;
1275 
1276 	wrmsr(mca_msr_ops.misc(bank), misc);
1277 }
1278 
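/*
 * AMD counterpart of cmci_monitor(): claim the error-thresholding counter
 * of a bank for this CPU if it is present, unlocked and not already
 * enabled, and route its overflow interrupt through an extended LVT entry.
 */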
1279 static void
1280 amd_thresholding_monitor(int i)
1281 {
1282 	struct amd_et_state *cc;
1283 	uint64_t misc;
1284 
1285 	/*
1286 	 * Kludge: On 10h, banks after 4 are not thresholding but also may have
1287 	 * bogus Valid bits.  Skip them.  This is definitely fixed in 15h, but
1288 	 * I have not investigated whether it is fixed in earlier models.
1289 	 */
1290 	if (CPUID_TO_FAMILY(cpu_id) < 0x15 && i >= 5)
1291 		return;
1292 
1293 	/* The counter must be valid and present. */
1294 	misc = rdmsr(mca_msr_ops.misc(i));
1295 	if ((misc & (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP)) !=
1296 	    (MC_MISC_AMD_VAL | MC_MISC_AMD_CNTP))
1297 		return;
1298 
1299 	/* The register should not be locked. */
1300 	if ((misc & MC_MISC_AMD_LOCK) != 0) {
1301 		if (bootverbose)
1302 			printf("%s: 0x%jx: Bank %d: locked\n", __func__,
1303 			    (uintmax_t)misc, i);
1304 		return;
1305 	}
1306 
1307 	/*
1308 	 * If counter is enabled then either the firmware or another CPU
1309 	 * has already claimed it.
1310 	 */
1311 	if ((misc & MC_MISC_AMD_CNTEN) != 0) {
1312 		if (bootverbose)
1313 			printf("%s: 0x%jx: Bank %d: already enabled\n",
1314 			    __func__, (uintmax_t)misc, i);
1315 		return;
1316 	}
1317 
1318 	/*
1319 	 * Configure an Extended Interrupt LVT register for reporting
1320 	 * counter overflows if that feature is supported and the first
1321 	 * extended register is available.
1322 	 */
1323 	amd_elvt = lapic_enable_mca_elvt();
1324 	if (amd_elvt < 0) {
1325 		printf("%s: Bank %d: lapic enable mca elvt failed: %d\n",
1326 		    __func__, i, amd_elvt);
1327 		return;
1328 	}
1329 
1330 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1331 	cc->cur_threshold = 1;
1332 	amd_thresholding_start(cc, i);
1333 
1334 	/* Mark this bank as monitored. */
1335 	PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
1336 }
1337 
1338 static void
1339 amd_thresholding_resume(int i)
1340 {
1341 	struct amd_et_state *cc;
1342 
1343 	KASSERT(i < mca_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
1344 
1345 	/* Ignore banks not monitored by this CPU. */
1346 	if (!(PCPU_GET(cmci_mask) & 1 << i))
1347 		return;
1348 
1349 	cc = &amd_et_state[PCPU_GET(cpuid)][i];
1350 	cc->last_intr = 0;
1351 	cc->cur_threshold = 1;
1352 	amd_thresholding_start(cc, i);
1353 }
1354 #endif
1355 
1356 /*
1357  * Initializes per-CPU machine check registers and enables corrected
1358  * machine check interrupts.
1359  */
1360 static void
1361 _mca_init(int boot)
1362 {
1363 	uint64_t mcg_cap;
1364 	uint64_t ctl, mask;
1365 	int i, skip, family;
1366 
1367 	family = CPUID_TO_FAMILY(cpu_id);
1368 
1369 	/* MCE is required. */
1370 	if (!mca_enabled || !(cpu_feature & CPUID_MCE))
1371 		return;
1372 
1373 	if (cpu_feature & CPUID_MCA) {
1374 		if (boot)
1375 			PCPU_SET(cmci_mask, 0);
1376 
1377 		mcg_cap = rdmsr(MSR_MCG_CAP);
1378 		if (mcg_cap & MCG_CAP_CTL_P)
1379 			/* Enable MCA features. */
1380 			wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
1381 		if (IS_BSP() && boot)
1382 			mca_setup(mcg_cap);
1383 
1384 		/*
1385 		 * Disable logging of level one TLB parity (L1TP) errors by
1386 		 * the data cache as an alternative workaround for AMD Family
1387 		 * 10h Erratum 383.  Unlike the recommended workaround, there
1388 		 * is no performance penalty to this workaround.  However,
1389 		 * L1TP errors will go unreported.
1390 		 */
1391 		if (cpu_vendor_id == CPU_VENDOR_AMD && family == 0x10 &&
1392 		    !amd10h_L1TP) {
1393 			mask = rdmsr(MSR_MC0_CTL_MASK);
1394 			if ((mask & (1UL << 5)) == 0)
1395 				wrmsr(MSR_MC0_CTL_MASK, mask | (1UL << 5));
1396 		}
1397 		if (amd_rascap & AMDRAS_SCALABLE_MCA) {
1398 			mca_msr_ops.ctl = mca_smca_ctl_reg;
1399 			mca_msr_ops.status = mca_smca_status_reg;
1400 			mca_msr_ops.addr = mca_smca_addr_reg;
1401 			mca_msr_ops.misc = mca_smca_misc_reg;
1402 		}
1403 
1404 		/* Enable local MCE if supported. */
1405 		if (cpu_vendor_id == CPU_VENDOR_INTEL &&
1406 		    (mcg_cap & MCG_CAP_LMCE_P) &&
1407 		    (rdmsr(MSR_IA32_FEATURE_CONTROL) &
1408 		     IA32_FEATURE_CONTROL_LMCE_EN))
1409 			wrmsr(MSR_MCG_EXT_CTL, rdmsr(MSR_MCG_EXT_CTL) | 1);
1410 
1411 		/*
1412 		 * The cmci_monitor() must not be executed
1413 		 * simultaneously by several CPUs.
1414 		 */
1415 		if (boot)
1416 			mtx_lock_spin(&mca_lock);
1417 
1418 		for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
1419 			/* By default enable logging of all errors. */
1420 			ctl = 0xffffffffffffffffUL;
1421 			skip = 0;
1422 
1423 			if (cpu_vendor_id == CPU_VENDOR_INTEL) {
1424 				/*
1425 				 * For P6 models before Nehalem MC0_CTL is
1426 				 * always enabled and reserved.
1427 				 */
1428 				if (i == 0 && family == 0x6
1429 				    && CPUID_TO_MODEL(cpu_id) < 0x1a)
1430 					skip = 1;
1431 			} else if (cpu_vendor_id == CPU_VENDOR_AMD) {
1432 				/* BKDG for Family 10h: unset GartTblWkEn. */
1433 				if (i == MC_AMDNB_BANK && family >= 0xf &&
1434 				    family < 0x17)
1435 					ctl &= ~(1UL << 10);
1436 			}
1437 
1438 			if (!skip)
1439 				wrmsr(mca_msr_ops.ctl(i), ctl);
1440 
1441 #ifdef DEV_APIC
1442 			if (cmci_supported(mcg_cap)) {
1443 				if (boot)
1444 					cmci_monitor(i);
1445 				else
1446 					cmci_resume(i);
1447 			} else if (amd_thresholding_supported()) {
1448 				if (boot)
1449 					amd_thresholding_monitor(i);
1450 				else
1451 					amd_thresholding_resume(i);
1452 			}
1453 #endif
1454 
1455 			/* Clear all errors. */
1456 			wrmsr(mca_msr_ops.status(i), 0);
1457 		}
1458 		if (boot)
1459 			mtx_unlock_spin(&mca_lock);
1460 
1461 #ifdef DEV_APIC
1462 		if (cmci_supported(mcg_cap) &&
1463 		    PCPU_GET(cmci_mask) != 0 && boot)
1464 			lapic_enable_cmc();
1465 #endif
1466 	}
1467 
1468 	load_cr4(rcr4() | CR4_MCE);
1469 }
1470 
1471 /* Must be executed on each CPU during boot. */
1472 void
1473 mca_init(void)
1474 {
1475 
1476 	_mca_init(1);
1477 }
1478 
1479 /* Must be executed on each CPU during resume. */
1480 void
1481 mca_resume(void)
1482 {
1483 
1484 	_mca_init(0);
1485 }
1486 
1487 /*
1488  * The machine check registers for the BSP cannot be initialized until
1489  * the local APIC is initialized.  This happens at SI_SUB_CPU,
1490  * SI_ORDER_SECOND.
1491  */
1492 static void
1493 mca_init_bsp(void *arg __unused)
1494 {
1495 
1496 	mca_init();
1497 }
1498 SYSINIT(mca_init_bsp, SI_SUB_CPU, SI_ORDER_ANY, mca_init_bsp, NULL);
1499 
1500 /* Called when a machine check exception fires. */
1501 void
1502 mca_intr(void)
1503 {
1504 	uint64_t mcg_status;
1505 	int count;
1506 	bool lmcs, recoverable;
1507 
1508 	if (!(cpu_feature & CPUID_MCA)) {
1509 		/*
1510 		 * Just print the values of the old Pentium registers
1511 		 * and panic.
1512 		 */
1513 		printf("MC Type: 0x%jx  Address: 0x%jx\n",
1514 		    (uintmax_t)rdmsr(MSR_P5_MC_TYPE),
1515 		    (uintmax_t)rdmsr(MSR_P5_MC_ADDR));
1516 		panic("Machine check exception");
1517 	}
1518 
1519 	/* Scan the banks and check for any non-recoverable errors. */
1520 	mcg_status = rdmsr(MSR_MCG_STATUS);
1521 	recoverable = (mcg_status & MCG_STATUS_RIPV) != 0;
1522 	lmcs = (cpu_vendor_id != CPU_VENDOR_INTEL ||
1523 	    (mcg_status & MCG_STATUS_LMCS));
1524 	count = mca_scan(MCE, &recoverable);
1525 
1526 	if (!recoverable) {
1527 		/*
1528 		 * Only panic if the error was detected local to this CPU.
1529 		 * Some errors will assert a machine check on all CPUs, but
1530 		 * only certain CPUs will find a valid bank to log.
1531 		 */
1532 		while (!lmcs && count == 0)
1533 			cpu_spinwait();
1534 
1535 		panic("Unrecoverable machine check exception");
1536 	}
1537 
1538 	/* Clear MCIP. */
1539 	wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
1540 }
1541 
1542 #ifdef DEV_APIC
1543 /* Called for a CMCI (correctable machine check interrupt). */
1544 void
1545 cmc_intr(void)
1546 {
1547 	bool recoverable = true;
1548 
1549 	/*
1550 	 * Serialize MCA bank scanning to prevent collisions from
1551 	 * sibling threads.
1552 	 *
1553 	 * If we found anything, log them to the console.
1554 	 */
1555 	if (mca_scan(CMCI, &recoverable) != 0)
1556 		mca_process_records(CMCI);
1557 }
1558 #endif
1559