diff mbox series

[3/4] rasdaemon: Enumerate memory on noncpu nodes

Message ID 20210810172214.134099-4-nchatrad@amd.com (mailing list archive)
State New, archived
Headers show
Series rasdaemon: Add support for AMD CPUs family 19h | expand

Commit Message

Naveen Krishna Chatradhi Aug. 10, 2021, 5:22 p.m. UTC
From: Muralidhara M K <muralimk@amd.com>

On newer heterogeneous systems from AMD with GPU nodes
(with HBM2 memory) connected via xGMI links to the CPUs.

The node id information is available in the
MCA_IPID[47:44](InstanceIdHI) register.

The UMC Phys on Aldeberan nodes are enumerated as csrow
The UMC channels connected to HBMs are enumerated as ranks.

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
 mce-amd-smca.c | 47 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)
diff mbox series

Patch

diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 3c346f4..9381aa1 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -78,6 +78,16 @@  enum smca_bank_types {
 /* Maximum number of MCA banks per CPU. */
 #define MAX_NR_BANKS	64
 
+/*
+ * On newer heterogeneous systems the data gabrics of the CPUs and GPUs
+ * are connected directly via a custom links, like is done with
+ * 2 socket CPU systems and also within a socket for Multi-chip Module
+ * (MCM) CPUs like Naples.
+ * The first GPU node(non cpu) is assumed to have an "AMD Node ID" value
+ * of 8 (the second GPU node has 9, etc.).
+ */
+#define NONCPU_NODE_INDEX	8
+
 /* SMCA Extended error strings */
 /* Load Store */
 static const char * const smca_ls_mce_desc[] = {
@@ -531,6 +541,26 @@  static int find_umc_channel(struct mce_event *e)
 {
 	return EXTRACT(e->ipid, 0, 31) >> 20;
 }
+
+/*
+ * The HBM memory managed by the UMCCH of the noncpu node
+ * can be calculated based on the [15:12]bits of IPID
+ */
+static int find_hbm_channel(struct mce_event *e)
+{
+	int umc, tmp;
+
+	umc = EXTRACT(e->ipid, 0, 31) >> 20;
+
+	/*
+	 * The HBM channel managed by the UMC of the noncpu node
+	 * can be calculated based on the [15:12]bits of IPID as follows
+	 */
+	tmp = ((e->ipid >> 12) & 0xf);
+
+	return (umc % 2) ? tmp + 4 : tmp;
+}
+
 /* Decode extended errors according to Scalable MCA specification */
 static void decode_smca_error(struct mce_event *e)
 {
@@ -539,6 +569,7 @@  static void decode_smca_error(struct mce_event *e)
 	unsigned short xec = (e->status >> 16) & 0x3f;
 	const struct smca_hwid *s_hwid;
 	uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
+	uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
 	unsigned int csrow = -1, channel = -1;
 	unsigned int i;
 
@@ -548,14 +579,16 @@  static void decode_smca_error(struct mce_event *e)
 			bank_type = s_hwid->bank_type;
 			break;
 		}
+		if (mcatype_instancehi >= NONCPU_NODE_INDEX)
+			bank_type = SMCA_UMC_V2;
 	}
 
-	if (i >= ARRAY_SIZE(smca_hwid_mcatypes)) {
+	if (i >= MAX_NR_BANKS) {
 		strcpy(e->mcastatus_msg, "Couldn't find bank type with IPID");
 		return;
 	}
 
-	if (bank_type >= N_SMCA_BANK_TYPES) {
+	if (bank_type >= MAX_NR_BANKS) {
 		strcpy(e->mcastatus_msg, "Don't know how to decode this bank");
 		return;
 	}
@@ -580,6 +613,16 @@  static void decode_smca_error(struct mce_event *e)
 		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
 			     channel, csrow);
 	}
+
+	if (bank_type == SMCA_UMC_V2 && xec == 0) {
+		/* The UMCPHY is reported as csrow in case of noncpu nodes */
+		csrow = find_umc_channel(e) / 2;
+		/* UMCCH is managing the HBM memory */
+		channel = find_hbm_channel(e);
+		mce_snprintf(e->mc_location, "memory_channel=%d,csrow=%d",
+			     channel, csrow);
+	}
+
 }
 
 int parse_amd_smca_event(struct ras_events *ras, struct mce_event *e)