diff mbox series

[1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.

Message ID 20210511152538.148084-1-nchatrad@amd.com (mailing list archive)
State New, archived
Headers show
Series [1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types. | expand

Commit Message

Naveen Krishna Chatradhi May 11, 2021, 3:25 p.m. UTC
From: Muralidhara M K <muralimk@amd.com>

Add the (HWID, MCATYPE) tuples and names for new
SMCA bank types.

Also, add their respective error descriptions to the MCE
decoding module edac_mce_amd.

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/include/asm/mce.h    |  5 +++
 arch/x86/kernel/cpu/mce/amd.c | 16 ++++++++
 drivers/edac/mce_amd.c        | 70 +++++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)

Comments

Borislav Petkov May 11, 2021, 5:27 p.m. UTC | #1
On Tue, May 11, 2021 at 08:55:36PM +0530, Naveen Krishna Chatradhi wrote:
> diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
> index e486f96b3cb3..055f3a0acf5e 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -90,6 +90,7 @@ static struct smca_bank_name smca_names[] = {
>  	[SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
>  	[SMCA_PIE]	= { "pie",		"Power, Interrupts, etc." },
>  	[SMCA_UMC]	= { "umc",		"Unified Memory Controller" },
> +	[SMCA_UMC_V2]	= { "umc_v2",		"Unified Memory Controller" },

So this is called "umc_v2" but the other V2 FUs's strings are the same.
Why?

Also, if you're going to repeat strings, you can just as well group all
those which are the same this way:

	[ SMCA_UMC ... SMCA_UMC_V2 ]    = { "umc",              "Unified Memory Controller" },

and do that for all which have V1 and V2.

I mean, gcc is smart enough to do that behind the scenes for identical
strings but you should do that in C too.

> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
> index 5dd905a3f30c..5515fd9336b1 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
>  	"AES SRAM ECC error",
>  };
>  
> +static const char * const smca_umc2_mce_desc[] = {

Ok, gcc reuses the identical string pointers from smca_umc_mce_desc[] so
we should be ok wrt duplication.

> +	"DRAM ECC error",
> +	"Data poison error",
> +	"SDP parity error",
> +	"Reserved",
> +	"Address/Command parity error",
> +	"Write data parity error",
> +	"DCQ SRAM ECC error",
> +	"Reserved",
> +	"Read data parity error",
> +	"Rdb SRAM ECC error",
> +	"RdRsp SRAM ECC error",
> +	"LM32 MP errors",
> +};

...


> +static const char * const smca_xgmipcs_mce_desc[] = {
> +	"DataLossErr",
> +	"TrainingErr",
> +	"FlowCtrlAckErr",
> +	"RxFifoUnderflowErr",
> +	"RxFifoOverflowErr",
> +	"CRCErr",
> +	"BERExceededErr",
> +	"TxVcidDataErr",
> +	"ReplayBufParityErr",
> +	"DataParityErr",
> +	"ReplayFifoOverflowErr",
> +	"ReplayFIfoUnderflowErr",
> +	"ElasticFifoOverflowErr",
> +	"DeskewErr",
> +	"FlowCtrlCRCErr",
> +	"DataStartupLimitErr",
> +	"FCInitTimeoutErr",
> +	"RecoveryTimeoutErr",
> +	"ReadySerialTimeoutErr",
> +	"ReadySerialAttemptErr",
> +	"RecoveryAttemptErr",
> +	"RecoveryRelockAttemptErr",
> +	"ReplayAttemptErr",
> +	"SyncHdrErr",
> +	"TxReplayTimeoutErr",
> +	"RxReplayTimeoutErr",
> +	"LinkSubTxTimeoutErr",
> +	"LinkSubRxTimeoutErr",
> +	"RxCMDPktErr",

What happened to those and why aren't they proper words like the other
error descriptions?

Thx.
Chatradhi, Naveen Krishna May 24, 2021, 4:41 p.m. UTC | #2
[AMD Official Use Only]

Hi Boris

My apologies for delayed response. Thanks for your review comments, will submit a v2 shortly.

Regards,
Naveenk

-----Original Message-----
From: Borislav Petkov <bp@alien8.de> 
Sent: Tuesday, May 11, 2021 10:57 PM
To: Chatradhi, Naveen Krishna <NaveenKrishna.Chatradhi@amd.com>
Cc: linux-edac@vger.kernel.org; x86@kernel.org; linux-kernel@vger.kernel.org; mingo@redhat.com; mchehab@kernel.org; M K, Muralidhara <Muralidhara.MK@amd.com>
Subject: Re: [PATCH 1/3] x86/MCE/AMD, EDAC/mce_amd: Add new SMCA bank types.

[CAUTION: External Email]

On Tue, May 11, 2021 at 08:55:36PM +0530, Naveen Krishna Chatradhi wrote:
> diff --git a/arch/x86/kernel/cpu/mce/amd.c 
> b/arch/x86/kernel/cpu/mce/amd.c index e486f96b3cb3..055f3a0acf5e 
> 100644
> --- a/arch/x86/kernel/cpu/mce/amd.c
> +++ b/arch/x86/kernel/cpu/mce/amd.c
> @@ -90,6 +90,7 @@ static struct smca_bank_name smca_names[] = {
>       [SMCA_CS_V2]    = { "coherent_slave",   "Coherent Slave" },
>       [SMCA_PIE]      = { "pie",              "Power, Interrupts, etc." },
>       [SMCA_UMC]      = { "umc",              "Unified Memory Controller" },
> +     [SMCA_UMC_V2]   = { "umc_v2",           "Unified Memory Controller" },

So this is called "umc_v2" but the other V2 FUs's strings are the same.
Why?
[naveenk:] There is a possibility for a heterogenous system with both the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.
I will update the long name to describe accordingly.

Also, if you're going to repeat strings, you can just as well group all those which are the same this way:

        [ SMCA_UMC ... SMCA_UMC_V2 ]    = { "umc",              "Unified Memory Controller" },

and do that for all which have V1 and V2.

I mean, gcc is smart enough to do that behind the scenes for identical strings but you should do that in C too.
[naveenk:] thanks for the suggestion, I can do this for the other units.

> diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index 
> 5dd905a3f30c..5515fd9336b1 100644
> --- a/drivers/edac/mce_amd.c
> +++ b/drivers/edac/mce_amd.c
> @@ -323,6 +323,21 @@ static const char * const smca_umc_mce_desc[] = {
>       "AES SRAM ECC error",
>  };
>
> +static const char * const smca_umc2_mce_desc[] = {

Ok, gcc reuses the identical string pointers from smca_umc_mce_desc[] so we should be ok wrt duplication.

> +     "DRAM ECC error",
> +     "Data poison error",
> +     "SDP parity error",
> +     "Reserved",
> +     "Address/Command parity error",
> +     "Write data parity error",
> +     "DCQ SRAM ECC error",
> +     "Reserved",
> +     "Read data parity error",
> +     "Rdb SRAM ECC error",
> +     "RdRsp SRAM ECC error",
> +     "LM32 MP errors",
> +};

...


> +static const char * const smca_xgmipcs_mce_desc[] = {
> +     "DataLossErr",
> +     "TrainingErr",
> +     "FlowCtrlAckErr",
> +     "RxFifoUnderflowErr",
> +     "RxFifoOverflowErr",
> +     "CRCErr",
> +     "BERExceededErr",
> +     "TxVcidDataErr",
> +     "ReplayBufParityErr",
> +     "DataParityErr",
> +     "ReplayFifoOverflowErr",
> +     "ReplayFIfoUnderflowErr",
> +     "ElasticFifoOverflowErr",
> +     "DeskewErr",
> +     "FlowCtrlCRCErr",
> +     "DataStartupLimitErr",
> +     "FCInitTimeoutErr",
> +     "RecoveryTimeoutErr",
> +     "ReadySerialTimeoutErr",
> +     "ReadySerialAttemptErr",
> +     "RecoveryAttemptErr",
> +     "RecoveryRelockAttemptErr",
> +     "ReplayAttemptErr",
> +     "SyncHdrErr",
> +     "TxReplayTimeoutErr",
> +     "RxReplayTimeoutErr",
> +     "LinkSubTxTimeoutErr",
> +     "LinkSubRxTimeoutErr",
> +     "RxCMDPktErr",

What happened to those and why aren't they proper words like the other error descriptions?
[naveenk:] Will change these into proper words.

Thx.

--
Regards/Gruss,
    Boris.

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpeople.kernel.org%2Ftglx%2Fnotes-about-netiquette&amp;data=04%7C01%7CNaveenKrishna.Chatradhi%40amd.com%7C9159e5c1aebd47969c2508d914a20789%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637563508427766424%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=%2FiHUZDkg99NnGdDrOCK%2FQWsui2yA1dADCfG%2F4xFr%2B7I%3D&amp;reserved=0
Borislav Petkov May 25, 2021, 6:02 p.m. UTC | #3
On Mon, May 24, 2021 at 04:41:25PM +0000, Chatradhi, Naveen Krishna wrote:
> [naveenk:] There is a possibility for a heterogenous system with both
> the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.

Wait, what? You can have systems with *both* UMCs in the same coherent
fabric and thus the OS can see UMCs of both types on the same system?

Thx.
Yazen Ghannam May 25, 2021, 8:03 p.m. UTC | #4
On Tue, May 25, 2021 at 08:02:44PM +0200, Borislav Petkov wrote:
> On Mon, May 24, 2021 at 04:41:25PM +0000, Chatradhi, Naveen Krishna wrote:
> > [naveenk:] There is a possibility for a heterogenous system with both
> > the SMCA_UMC and SMCA_UMC_V2 variant of controllers to exist.
> 
> Wait, what? You can have systems with *both* UMCs in the same coherent
> fabric and thus the OS can see UMCs of both types on the same system?
>

Yep, that's right. The UMCs are the only case of this so far and in the
foreseeable future. Though we may be moving towards more cases like
this.

Thanks,
Yazen
Borislav Petkov May 25, 2021, 8:12 p.m. UTC | #5
On Tue, May 25, 2021 at 04:03:27PM -0400, Yazen Ghannam wrote:
> Yep, that's right. The UMCs are the only case of this so far and in the
> foreseeable future. Though we may be moving towards more cases like
> this.

Ok, then you guys really need to make sure those strings are unique
because they're in sysfs:

/sys/devices/system/machinecheck/machinecheck...

AFAIR, so we can't have duplicates there.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index ddfb3cad8dff..cf7f35fdf2c8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -317,6 +317,7 @@  enum smca_bank_types {
 	SMCA_CS_V2,	/* Coherent Slave */
 	SMCA_PIE,	/* Power, Interrupts, etc. */
 	SMCA_UMC,	/* Unified Memory Controller */
+	SMCA_UMC_V2,	/* Unified Memory Controller */
 	SMCA_PB,	/* Parameter Block */
 	SMCA_PSP,	/* Platform Security Processor */
 	SMCA_PSP_V2,	/* Platform Security Processor */
@@ -325,6 +326,10 @@  enum smca_bank_types {
 	SMCA_MP5,	/* Microprocessor 5 Unit */
 	SMCA_NBIO,	/* Northbridge IO Unit */
 	SMCA_PCIE,	/* PCI Express Unit */
+	SMCA_PCIE_V2,	/* PCI Express Unit */
+	SMCA_XGMI_PCS,	/* xGMI PCS Unit */
+	SMCA_XGMI_PHY,	/* xGMI PHY Unit */
+	SMCA_WAFL_PHY,	/* WAFL PHY Unit */
 	N_SMCA_BANK_TYPES
 };
 
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e486f96b3cb3..055f3a0acf5e 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -90,6 +90,7 @@  static struct smca_bank_name smca_names[] = {
 	[SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
 	[SMCA_PIE]	= { "pie",		"Power, Interrupts, etc." },
 	[SMCA_UMC]	= { "umc",		"Unified Memory Controller" },
+	[SMCA_UMC_V2]	= { "umc_v2",		"Unified Memory Controller" },
 	[SMCA_PB]	= { "param_block",	"Parameter Block" },
 	[SMCA_PSP]	= { "psp",		"Platform Security Processor" },
 	[SMCA_PSP_V2]	= { "psp",		"Platform Security Processor" },
@@ -98,6 +99,10 @@  static struct smca_bank_name smca_names[] = {
 	[SMCA_MP5]	= { "mp5",		"Microprocessor 5 Unit" },
 	[SMCA_NBIO]	= { "nbio",		"Northbridge IO Unit" },
 	[SMCA_PCIE]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_PCIE_V2]	= { "pcie",		"PCI Express Unit" },
+	[SMCA_XGMI_PCS]	= { "xgmi_pcs",		"Ext Global Memory Interconnect PCS Unit" },
+	[SMCA_XGMI_PHY]	= { "xgmi_phy",		"Ext Global Memory Interconnect PHY Unit" },
+	[SMCA_WAFL_PHY]	= { "wafl_phy",		"WAFL PHY Unit" },
 };
 
 static const char *smca_get_name(enum smca_bank_types t)
@@ -155,6 +160,7 @@  static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* Unified Memory Controller MCA type */
 	{ SMCA_UMC,	 HWID_MCATYPE(0x96, 0x0)	},
+	{ SMCA_UMC_V2,	 HWID_MCATYPE(0x96, 0x1)	},
 
 	/* Parameter Block MCA type */
 	{ SMCA_PB,	 HWID_MCATYPE(0x05, 0x0)	},
@@ -175,6 +181,16 @@  static struct smca_hwid smca_hwid_mcatypes[] = {
 
 	/* PCI Express Unit MCA type */
 	{ SMCA_PCIE,	 HWID_MCATYPE(0x46, 0x0)	},
+	{ SMCA_PCIE_V2,	 HWID_MCATYPE(0x46, 0x1)	},
+
+	/* xGMI PCS MCA type */
+	{ SMCA_XGMI_PCS, HWID_MCATYPE(0x50, 0x0)	},
+
+	/* xGMI PHY MCA type */
+	{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0)	},
+
+	/* WAFL PHY MCA type */
+	{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0)	},
 };
 
 struct smca_bank smca_banks[MAX_NR_BANKS];
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 5dd905a3f30c..5515fd9336b1 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -323,6 +323,21 @@  static const char * const smca_umc_mce_desc[] = {
 	"AES SRAM ECC error",
 };
 
+static const char * const smca_umc2_mce_desc[] = {
+	"DRAM ECC error",
+	"Data poison error",
+	"SDP parity error",
+	"Reserved",
+	"Address/Command parity error",
+	"Write data parity error",
+	"DCQ SRAM ECC error",
+	"Reserved",
+	"Read data parity error",
+	"Rdb SRAM ECC error",
+	"RdRsp SRAM ECC error",
+	"LM32 MP errors",
+};
+
 static const char * const smca_pb_mce_desc[] = {
 	"An ECC error in the Parameter Block RAM array",
 };
@@ -400,6 +415,56 @@  static const char * const smca_pcie_mce_desc[] = {
 	"CCIX Non-okay write response with data error",
 };
 
+static const char * const smca_pcie2_mce_desc[] = {
+	"SDP Parity Error logging",
+};
+
+static const char * const smca_xgmipcs_mce_desc[] = {
+	"DataLossErr",
+	"TrainingErr",
+	"FlowCtrlAckErr",
+	"RxFifoUnderflowErr",
+	"RxFifoOverflowErr",
+	"CRCErr",
+	"BERExceededErr",
+	"TxVcidDataErr",
+	"ReplayBufParityErr",
+	"DataParityErr",
+	"ReplayFifoOverflowErr",
+	"ReplayFIfoUnderflowErr",
+	"ElasticFifoOverflowErr",
+	"DeskewErr",
+	"FlowCtrlCRCErr",
+	"DataStartupLimitErr",
+	"FCInitTimeoutErr",
+	"RecoveryTimeoutErr",
+	"ReadySerialTimeoutErr",
+	"ReadySerialAttemptErr",
+	"RecoveryAttemptErr",
+	"RecoveryRelockAttemptErr",
+	"ReplayAttemptErr",
+	"SyncHdrErr",
+	"TxReplayTimeoutErr",
+	"RxReplayTimeoutErr",
+	"LinkSubTxTimeoutErr",
+	"LinkSubRxTimeoutErr",
+	"RxCMDPktErr",
+};
+
+static const char * const smca_xgmiphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
+static const char * const smca_waflphy_mce_desc[] = {
+	"RAM ECC Error",
+	"ARC instruction buffer parity error",
+	"ARC data buffer parity error",
+	"PHY APB error",
+};
+
 struct smca_mce_desc {
 	const char * const *descs;
 	unsigned int num_descs;
@@ -418,6 +483,7 @@  static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
+	[SMCA_UMC_V2]	= { smca_umc2_mce_desc,	ARRAY_SIZE(smca_umc2_mce_desc)	},
 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
@@ -426,6 +492,10 @@  static struct smca_mce_desc smca_mce_descs[] = {
 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
+	[SMCA_PCIE_V2]	= { smca_pcie2_mce_desc,   ARRAY_SIZE(smca_pcie2_mce_desc)	},
+	[SMCA_XGMI_PCS]	= { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc)	},
+	[SMCA_XGMI_PHY]	= { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc)	},
+	[SMCA_WAFL_PHY]	= { smca_waflphy_mce_desc, ARRAY_SIZE(smca_waflphy_mce_desc)	},
 };
 
 static bool f12h_mc0_mce(u16 ec, u8 xec)