Message ID | 20240404151359.47970-17-yazen.ghannam@amd.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | MCA Updates | expand |
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c > index aa27729f7899..a4d09dda5fae 100644 > --- a/arch/x86/kernel/cpu/mce/core.c > +++ b/arch/x86/kernel/cpu/mce/core.c > @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err) > pr_cont("SYND2 %llx ", err->vi.amd.synd2); > if (m->ipid) > pr_cont("IPID %llx ", m->ipid); > + if (err->vi.amd.config) This is in common code. If other vendors start adding their own stuff to the "vi" union you might incorrectly print this. Add a vendor check before looking at values inside "m->vi". > + pr_cont("CONFIG %llx ", err->vi.amd.config); > } > > pr_cont("\n");
On 4/5/24 12:06, Luck, Tony wrote: >> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c >> index aa27729f7899..a4d09dda5fae 100644 >> --- a/arch/x86/kernel/cpu/mce/core.c >> +++ b/arch/x86/kernel/cpu/mce/core.c >> @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err) >> pr_cont("SYND2 %llx ", err->vi.amd.synd2); >> if (m->ipid) >> pr_cont("IPID %llx ", m->ipid); >> + if (err->vi.amd.config) > > This is in common code. If other vendors start adding their own stuff to the > "vi" union you might incorrectly print this. Add a vendor check before looking > at values inside "m->vi". > Yes, agreed. Will do. Thanks, Yazen
On 4/5/2024 11:06, Luck, Tony wrote: >> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c >> index aa27729f7899..a4d09dda5fae 100644 >> --- a/arch/x86/kernel/cpu/mce/core.c >> +++ b/arch/x86/kernel/cpu/mce/core.c >> @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err) >> pr_cont("SYND2 %llx ", err->vi.amd.synd2); >> if (m->ipid) >> pr_cont("IPID %llx ", m->ipid); >> + if (err->vi.amd.config) > > This is in common code. If other vendors start adding their own stuff to the > "vi" union you might incorrectly print this. Add a vendor check before looking > at values inside "m->vi". > Do we really need an explicit vendor check in this particular instance? Below is a snippet from __print_mce() after applying this series: if (mce_flags.smca) { if (m->synd) pr_cont("SYND %llx ", m->synd); if (err->vi.amd.synd1) pr_cont("SYND1 %llx ", err->vi.amd.synd1); if (err->vi.amd.synd2) pr_cont("SYND2 %llx ", err->vi.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); if (err->vi.amd.config) pr_cont("CONFIG %llx ", err->vi.amd.config); } pr_cont("\n"); All of the above registers including the newly added config MSR will only be logged if the smca flag is set in mce_flags. Doesn't that already serve as a vendor check of sorts? Something that I am missing here?
> All of the above registers including the newly added config MSR will only > be logged if the smca flag is set in mce_flags. > Doesn't that already serve as a vendor check of sorts? > Something that I am missing here? Avadhut, Yes. That's a sufficient vendor check. I was looking at the bits in the patch, not at the broader context. Sorry for the noise. -Tony
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a701290f80a1..2a8997d7ba4d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -59,6 +59,7 @@ * - TCC bit is present in MCx_STATUS. */ #define MCI_CONFIG_MCAX 0x1 +#define MCI_CONFIG_FRUTEXT BIT_ULL(9) /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -195,6 +196,7 @@ struct mce_hw_err { struct { u64 synd1; u64 synd2; + u64 config; } amd; } vi; }; diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 43622241c379..a9c28614530b 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -154,6 +154,8 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id) fallthrough; /* MCA_CONFIG */ case 4: + err.vi.amd.config = *(i_mce + 3); + fallthrough; /* MCA_MISC0 */ case 3: m->misc = *(i_mce + 2); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index aa27729f7899..a4d09dda5fae 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err) pr_cont("SYND2 %llx ", err->vi.amd.synd2); if (m->ipid) pr_cont("IPID %llx ", m->ipid); + if (err->vi.amd.config) + pr_cont("CONFIG %llx ", err->vi.amd.config); } pr_cont("\n"); @@ -679,6 +681,7 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) if (mce_flags.smca) { m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + err->vi.amd.config = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(i)); if (m->status & MCI_STATUS_SYNDV) { m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 32bf4cc564a3..f68b3d1b558e 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -795,6 +795,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce_hw_err *err = (struct mce_hw_err *)data; struct mce *m = &err->m; unsigned int fam = x86_family(m->cpuid); + u64 mca_config = err->vi.amd.config; int ecc; if (m->kflags & MCE_HANDLED_CEC) @@ -814,11 +815,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); if (boot_cpu_has(X86_FEATURE_SMCA)) { - u32 low, high; - u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - - if (!rdmsr_safe(addr, &low, &high) && - (low & MCI_CONFIG_MCAX)) + if (mca_config & MCI_CONFIG_MCAX) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); @@ -853,8 +850,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_SYNDV) { pr_cont(", Syndrome: 0x%016llx\n", m->synd); - pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", - err->vi.amd.synd1, err->vi.amd.synd2); + if (mca_config & MCI_CONFIG_FRUTEXT) { + char frutext[17]; + + memset(frutext, 0, sizeof(frutext)); + memcpy(&frutext[0], &err->vi.amd.synd1, 8); + memcpy(&frutext[8], &err->vi.amd.synd2, 8); + + pr_emerg(HW_ERR "FRU Text: %s", frutext); + } else { + pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx", + err->vi.amd.synd1, err->vi.amd.synd2); + } } pr_cont("\n");