diff mbox series

[v2,16/16] EDAC/mce_amd: Add support for FRU Text in MCA

Message ID 20240404151359.47970-17-yazen.ghannam@amd.com (mailing list archive)
State New
Headers show
Series MCA Updates | expand

Commit Message

Yazen Ghannam April 4, 2024, 3:13 p.m. UTC
A new "FRU Text in MCA" feature is defined where the Field Replaceable
Unit (FRU) Text for a device is represented by a string in the new
MCA_SYND1 and MCA_SYND2 registers. This feature is supported per MCA
bank, and it is advertised by the McaFruTextInMca bit (MCA_CONFIG[9]).

The FRU Text is populated dynamically for each individual error state
(MCA_STATUS, MCA_ADDR, et al.). This handles the case where an MCA bank
covers multiple devices, for example, a Unified Memory Controller (UMC)
bank that manages two DIMMs.

Print the FRU Text string, if available, when decoding an MCA error.

Also, add a field for the MCA_CONFIG MSR to struct mce_hw_err as vendor-specific
error information and save the value of the MSR. The saved value can then be
exported through a tracepoint so that userspace tools like rasdaemon can print
the FRU Text, if available.

Co-developed-by: Avadhut Naik <Avadhut.Naik@amd.com>
Signed-off-by: Avadhut Naik <Avadhut.Naik@amd.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---

Notes:
    Link:
    https://lkml.kernel.org/r/20231118193248.1296798-21-yazen.ghannam@amd.com
    
    v1->v2:
    * No change.

 arch/x86/include/asm/mce.h     |  2 ++
 arch/x86/kernel/cpu/mce/apei.c |  2 ++
 arch/x86/kernel/cpu/mce/core.c |  3 +++
 drivers/edac/mce_amd.c         | 21 ++++++++++++++-------
 4 files changed, 21 insertions(+), 7 deletions(-)

Comments

Luck, Tony April 5, 2024, 4:06 p.m. UTC | #1
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index aa27729f7899..a4d09dda5fae 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err)
>                       pr_cont("SYND2 %llx ", err->vi.amd.synd2);
>               if (m->ipid)
>                       pr_cont("IPID %llx ", m->ipid);
> +             if (err->vi.amd.config)

This is in common code. If other vendors start adding their own stuff to the
"vi" union you might incorrectly print this.  Add a vendor check before looking
at values inside "m->vi".

> +                     pr_cont("CONFIG %llx ", err->vi.amd.config);
>       }
>
>       pr_cont("\n");
Yazen Ghannam April 7, 2024, 1:19 p.m. UTC | #2
On 4/5/24 12:06, Luck, Tony wrote:
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index aa27729f7899..a4d09dda5fae 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err)
>>                        pr_cont("SYND2 %llx ", err->vi.amd.synd2);
>>                if (m->ipid)
>>                        pr_cont("IPID %llx ", m->ipid);
>> +             if (err->vi.amd.config)
> 
> This is in common code. If other vendors start adding their own stuff to the
> "vi" union you might incorrectly print this.  Add a vendor check before looking
> at values inside "m->vi".
>

Yes, agreed. Will do.

Thanks,
Yazen
Naik, Avadhut April 8, 2024, 7:47 p.m. UTC | #3
On 4/5/2024 11:06, Luck, Tony wrote:
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index aa27729f7899..a4d09dda5fae 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -207,6 +207,8 @@ static void __print_mce(struct mce_hw_err *err)
>>                       pr_cont("SYND2 %llx ", err->vi.amd.synd2);
>>               if (m->ipid)
>>                       pr_cont("IPID %llx ", m->ipid);
>> +             if (err->vi.amd.config)
> 
> This is in common code. If other vendors start adding their own stuff to the
> "vi" union you might incorrectly print this.  Add a vendor check before looking
> at values inside "m->vi".
>

Do we really need an explicit vendor check in this particular instance?

Below is a snippet from __print_mce() after applying this series:

    if (mce_flags.smca) {
        if (m->synd)
            pr_cont("SYND %llx ", m->synd);
        if (err->vi.amd.synd1)
            pr_cont("SYND1 %llx ", err->vi.amd.synd1);
        if (err->vi.amd.synd2)
            pr_cont("SYND2 %llx ", err->vi.amd.synd2);
        if (m->ipid)
            pr_cont("IPID %llx ", m->ipid);
        if (err->vi.amd.config)
            pr_cont("CONFIG %llx ", err->vi.amd.config);
    }

    pr_cont("\n");

All of the above registers including the newly added config MSR will only
be logged if the smca flag is set in mce_flags.
Doesn't that already serve as a vendor check of sorts?
Something that I am missing here?
Luck, Tony April 8, 2024, 7:57 p.m. UTC | #4
> All of the above registers including the newly added config MSR will only
> be logged if the smca flag is set in mce_flags.
> Doesn't that already serve as a vendor check of sorts?
> Something that I am missing here?


Avadhut,

Yes. That's a sufficient vendor check. I was looking at the bits in the patch,
not at the broader context. Sorry for the noise.

-Tony
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index a701290f80a1..2a8997d7ba4d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -59,6 +59,7 @@ 
  *  - TCC bit is present in MCx_STATUS.
  */
 #define MCI_CONFIG_MCAX		0x1
+#define MCI_CONFIG_FRUTEXT	BIT_ULL(9)
 
 /*
  * Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -195,6 +196,7 @@  struct mce_hw_err {
 		struct {
 			u64 synd1;
 			u64 synd2;
+			u64 config;
 		} amd;
 	} vi;
 };
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 43622241c379..a9c28614530b 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -154,6 +154,8 @@  int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
 		fallthrough;
 	/* MCA_CONFIG */
 	case 4:
+		err.vi.amd.config = *(i_mce + 3);
+		fallthrough;
 	/* MCA_MISC0 */
 	case 3:
 		m->misc = *(i_mce + 2);
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index aa27729f7899..a4d09dda5fae 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -207,6 +207,8 @@  static void __print_mce(struct mce_hw_err *err)
 			pr_cont("SYND2 %llx ", err->vi.amd.synd2);
 		if (m->ipid)
 			pr_cont("IPID %llx ", m->ipid);
+		if (err->vi.amd.config)
+			pr_cont("CONFIG %llx ", err->vi.amd.config);
 	}
 
 	pr_cont("\n");
@@ -679,6 +681,7 @@  static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
 
 	if (mce_flags.smca) {
 		m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i));
+		err->vi.amd.config = mce_rdmsrl(MSR_AMD64_SMCA_MCx_CONFIG(i));
 
 		if (m->status & MCI_STATUS_SYNDV) {
 			m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i));
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 32bf4cc564a3..f68b3d1b558e 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -795,6 +795,7 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 	struct mce_hw_err *err = (struct mce_hw_err *)data;
 	struct mce *m = &err->m;
 	unsigned int fam = x86_family(m->cpuid);
+	u64 mca_config = err->vi.amd.config;
 	int ecc;
 
 	if (m->kflags & MCE_HANDLED_CEC)
@@ -814,11 +815,7 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));
 
 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
-		u32 low, high;
-		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
-
-		if (!rdmsr_safe(addr, &low, &high) &&
-		    (low & MCI_CONFIG_MCAX))
+		if (mca_config & MCI_CONFIG_MCAX)
 			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
 
 		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
@@ -853,8 +850,18 @@  amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
 
 		if (m->status & MCI_STATUS_SYNDV) {
 			pr_cont(", Syndrome: 0x%016llx\n", m->synd);
-			pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
-				 err->vi.amd.synd1, err->vi.amd.synd2);
+			if (mca_config & MCI_CONFIG_FRUTEXT) {
+				char frutext[17];
+
+				memset(frutext, 0, sizeof(frutext));
+				memcpy(&frutext[0], &err->vi.amd.synd1, 8);
+				memcpy(&frutext[8], &err->vi.amd.synd2, 8);
+
+				pr_emerg(HW_ERR "FRU Text: %s", frutext);
+			} else {
+				pr_emerg(HW_ERR "Syndrome1: 0x%016llx, Syndrome2: 0x%016llx",
+					 err->vi.amd.synd1, err->vi.amd.synd2);
+			}
 		}
 
 		pr_cont("\n");