@@ -3653,6 +3653,30 @@ static inline void decode_bus_error(int node_id, struct mce *m)
__log_ecc_error(mci, &err, ecc_type);
}
+/*
+ * On CPU nodes, the Data Fabric ID of an instance is equal to the UMC number,
+ * and the UMC number is equal to the channel number. The channel recorded in
+ * @err is therefore returned as the Data Fabric ID; @mci and @pvt are unused.
+ */
+static int f17_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err)
+{
+ return err->channel;
+}
+
+/*
+ * A GPU node has 'X' PHYs and 'Y' channels per PHY, which results in 'X * Y'
+ * instances in the Data Fabric. The Data Fabric ID of an instance is
+ * therefore: PHY index * number of channels per PHY + channel index,
+ * computed here as (csrow * channel_count / nr_csrows) + channel.
+ * NOTE(review): presumably csrow holds the PHY index on GPU nodes - confirm.
+ */
+static int gpu_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err)
+{
+ return (err->csrow * pvt->channel_count / mci->nr_csrows) + err->channel;
+}
+
/*
* To find the UMC channel represented by this bank we need to match on its
* instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3670,12 +3694,38 @@ static void update_umc_err_info(struct mce *m, struct err_info *err)
err->csrow = m->synd & 0x7;
}
+/*
+ * On CPUs each UMC has a single channel, so the UMC number doubles as the
+ * channel number. GPU UMCs have 8 channels each, so the UMC number can no
+ * longer stand in for the channel number there.
+ *
+ * The channel number within a GPU UMC is given in MCA_IPID[15:12]. The IDs
+ * are split such that two UMC values go to one UMC and the channels form two
+ * groups of four; MCA_IPID bit 20 being set selects the upper group (+4).
+ * Refer to the comment on get_umc_base_gpu() in amd64_edac.h.
+ *
+ * For example,
+ * UMC0 CH[3:0] = 0x0005[3:0]000
+ * UMC0 CH[7:4] = 0x0015[3:0]000
+ * UMC1 CH[3:0] = 0x0025[3:0]000
+ * UMC1 CH[7:4] = 0x0035[3:0]000
+ */
+static void gpu_update_umc_err_info(struct mce *m, struct err_info *err)
+{
+	u8 chan = (m->ipid >> 12) & 0xf;
+	u8 umc_id = (m->ipid & GENMASK(31, 0)) >> 20;
+
+	err->csrow = chan;
+	err->channel = (umc_id & 1) ? chan + 4 : chan;
+}
+
static void decode_umc_error(int node_id, struct mce *m)
{
u8 ecc_type = (m->status >> 45) & 0x3;
struct mem_ctl_info *mci;
struct amd64_pvt *pvt;
struct err_info err;
+ u8 df_inst_id;
u64 sys_addr;
mci = edac_mc_find(node_id);
@@ -3705,7 +3755,9 @@ static void decode_umc_error(int node_id, struct mce *m)
pvt->ops->get_umc_err_info(m, &err);
- if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+ df_inst_id = pvt->ops->find_umc_inst_id(mci, pvt, &err);
+
+ if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
err.err_code = ERR_NORM_ADDR;
goto log_error;
}
@@ -4625,6 +4677,7 @@ static void per_family_init(struct amd64_pvt *pvt)
pvt->ops->get_mc_regs = __read_mc_regs_df;
pvt->ops->populate_csrows = init_csrows_df;
pvt->ops->get_umc_err_info = update_umc_err_info;
+ pvt->ops->find_umc_inst_id = f17_df_inst_id;
if (pvt->fam == 0x18) {
pvt->ctl_name = "F18h";
@@ -4678,6 +4731,8 @@ static void per_family_init(struct amd64_pvt *pvt)
pvt->ops->dump_misc_regs = gpu_dump_misc_regs;
pvt->ops->get_mc_regs = gpu_read_mc_regs;
pvt->ops->populate_csrows = gpu_init_csrows;
+ pvt->ops->get_umc_err_info = gpu_update_umc_err_info;
+ pvt->ops->find_umc_inst_id = gpu_df_inst_id;
goto end_fam;
} else {
pvt->ctl_name = "F19h_M30h";
@@ -4707,6 +4762,7 @@ static void per_family_init(struct amd64_pvt *pvt)
pvt->ops->get_mc_regs = __read_mc_regs_df;
pvt->ops->populate_csrows = init_csrows_df;
pvt->ops->get_umc_err_info = update_umc_err_info;
+ pvt->ops->find_umc_inst_id = f17_df_inst_id;
end_fam:
break;
@@ -4728,7 +4784,7 @@ static void per_family_init(struct amd64_pvt *pvt)
}
/* ops required for families 17h and later */
- if (pvt->fam >= 0x17 && !pvt->ops->get_umc_err_info) {
+ if (pvt->fam >= 0x17 && (!pvt->ops->get_umc_err_info || !pvt->ops->find_umc_inst_id)) {
edac_dbg(1, "Platform specific helper routines not defined.\n");
return;
}
@@ -496,6 +496,8 @@ struct low_ops {
void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
int (*populate_csrows)(struct mem_ctl_info *mci);
void (*get_umc_err_info)(struct mce *m, struct err_info *err);
+ int (*find_umc_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+ struct err_info *err);
};
int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,