Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
ROG CROSSHAIR VIII HERO (WI-FI)
@cyring
Copy link
Author

cyring commented Mar 20, 2021

BIOS 3302

Hardware Error

mce: [Hardware Error]: Machine check events logged
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0>
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN
...
mce: [Hardware Error]: Machine check events logged
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0>
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN
...
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0>
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN

Kernel v5.11.7

Error breakdown

CPU# Family Model Stepping Bank #
0 0x17 0x71 0x0 25
OVER UC or DEFERRED* MISCV ADDRV PCC
FALSE FALSE TRUE FALSE FALSE
----- function           EAX          EBX          ECX          EDX ------
  80000007:00000000    00000000     0000001b     00000000     00006799
rdmsr -p 0 -x 0xc0002194
0x2300000079
MCAX and TCC[55] SYNDV[53]
FALSE FALSE
ECC DEFERRED* POISON SCRUB
C FALSE FALSE FALSE

IPID

XEC = (0x98004000003e0000 >> 16) & 0x3f
XEC = 0x3e (62)

Decode status error code of 0x98004000003e0000 & 0xffff = 0

INTERNAL Cache:RESV Cache:L1 Cache:L2 Cache:L3/GEN
FALSE TRUE FALSE FALSE FALSE
BUS INSN DATA RESV
FALSE TRUE FALSE FALSE
MEM GEN RD WR DRD DWR IRD PRF EV SNP
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Source

drivers/edac/mce_amd.c

static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment