Skip to content

Instantly share code, notes, and snippets.

@cyring
Created October 24, 2020 16:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cyring/fcb87d9280366e5eadddd0d34740bdc5 to your computer and use it in GitHub Desktop.
Save cyring/fcb87d9280366e5eadddd0d34740bdc5 to your computer and use it in GitHub Desktop.
ROG CROSSHAIR VIII HERO (WI-FI)
@cyring
Copy link
Author

cyring commented Mar 20, 2021

BIOS 3302

Hardware Error

mce: [Hardware Error]: Machine check events logged
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0x98004000003e0000
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN
...
mce: [Hardware Error]: Machine check events logged
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0x98004000003e0000
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN
...
[Hardware Error]: Corrected error, no action required.
[Hardware Error]: CPU:0 (17:71:0) MC25_STATUS[-|CE|MiscV|-|-|-|-|CECC|-|-|-]: 0x98004000003e0000
[Hardware Error]: IPID: 0x000100ff03830400
[Hardware Error]: Platform Security Processor Ext. Error Code: 62
[Hardware Error]: cache level: RESV, tx: INSN

Kernel v5.11.7

Error breakdown

CPU# Family Model Stepping Bank #
0 0x17 0x71 0x0 25
OVER UC or DEFERRED* MISCV ADDRV PCC
FALSE FALSE TRUE FALSE FALSE
----- function           EAX          EBX          ECX          EDX ------
  80000007:00000000    00000000     0000001b     00000000     00006799
rdmsr -p 0 -x 0xc0002194
0x2300000079
MCAX and TCC[55] SYNDV[53]
FALSE FALSE
ECC DEFERRED* POISON SCRUB
C FALSE FALSE FALSE

IPID

XEC = (0x98004000003e0000 >> 16) & 0x3f
XEC = 0x3e (62)

Decoding the status error code: 0x98004000003e0000 & 0xffff = 0

INTERNAL Cache:RESV Cache:L1 Cache:L2 Cache:L3/GEN
FALSE TRUE FALSE FALSE FALSE
BUS INSN DATA RESV
FALSE TRUE FALSE FALSE
MEM GEN RD WR DRD DWR IRD PRF EV SNP
FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Source

drivers/edac/mce_amd.c

static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/* do the two bits[14:13] together */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment