@@ -2831,6 +2831,10 @@ static void decode_umc_error(int node_id, struct mce *m)
error_address_to_page_and_offset(sys_addr, &err);
+	/* MI300-class parts (family 19h, models 90h-9Fh) use HBM v3, where a
+	 * faulty cell affects a whole row: retire every column SPA in it.
+	 */
+	if (pvt->fam == 0x19 && pvt->model >= 0x90 && pvt->model <= 0x9f)
+		amd_umc_retire_column_spa_from_row(m);
+
log_error:
__log_ecc_error(mci, &err, ecc_type);
}
@@ -255,3 +255,87 @@ int umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return 0;
}
EXPORT_SYMBOL_GPL(umc_mca_addr_to_sys_addr);
+
+/*
+ * High Bandwidth Memory (HBM v3) has a fixed number of columns in a row.
+ * Specifically, HBM v3 has 8 columns in one row.
+ * Extract the column bits in a row to build all combinations of masks and
+ * to retire all the system physical addresses in that particular row.
+ */
+#define MAX_COLUMNS_IN_HBM_ROW	8
+
+/* Sentinel for a column whose normalized-to-system translation failed. */
+#define INVALID_SPA		(~0ULL)
+
+/* Column 2, 3 and 4th bits in Normalized Address */
+#define UMC_NA_C2_BIT		BIT(8)
+#define UMC_NA_C3_BIT		BIT(9)
+#define UMC_NA_C4_BIT		BIT(14)
+
+/* Possible combinations of column address masks in a HBM v3 row */
+#define C_1_1_1_MASK		(UMC_NA_C4_BIT | UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_1_1_0_MASK		(UMC_NA_C4_BIT | UMC_NA_C3_BIT)
+#define C_1_0_1_MASK		(UMC_NA_C4_BIT | UMC_NA_C2_BIT)
+#define C_1_0_0_MASK		(UMC_NA_C4_BIT)
+#define C_0_1_1_MASK		(UMC_NA_C3_BIT | UMC_NA_C2_BIT)
+#define C_0_1_0_MASK		(UMC_NA_C3_BIT)
+#define C_0_0_1_MASK		(UMC_NA_C2_BIT)
+#define C_0_0_0_MASK		(~C_1_1_1_MASK)
+
+/*
+ * Identify the system physical addresses of all columns in a HBM v3 row.
+ * Entries whose normalized-to-system translation fails are set to
+ * INVALID_SPA so the caller can skip them.
+ */
+static void identify_column_spa_from_row(struct mce *m, u64 *col)
+{
+	u8 cs_inst_id = get_cs_inst_id(m);
+	u8 socket_id = get_socket_id(m);
+	u64 norm_addr = get_norm_addr(m);
+	u8 die_id = get_die_id(m);
+	u16 df_acc_id = get_df_acc_id(m);
+
+	u64 retire_addr, column;
+	u64 column_masks[] = { 0, C_0_0_1_MASK, C_0_1_0_MASK, C_0_1_1_MASK,
+			       C_1_0_0_MASK, C_1_0_1_MASK, C_1_1_0_MASK, C_1_1_1_MASK };
+
+	/* Clear the column bits, then loop over all [c4 c3 c2] combinations. */
+	norm_addr &= C_0_0_0_MASK;
+
+	for (column = 0; column < ARRAY_SIZE(column_masks); column++) {
+		retire_addr = norm_addr | column_masks[column];
+
+		if (norm_to_sys_addr(df_acc_id, socket_id, die_id, cs_inst_id, &retire_addr)) {
+			pr_warn("Failed norm_to_sys_addr for column[%llu]\n", column);
+			col[column] = INVALID_SPA;
+		} else {
+			col[column] = retire_addr;
+		}
+	}
+}
+
+/* Retire every distinct column SPA of the HBM v3 row containing m->addr. */
+void amd_umc_retire_column_spa_from_row(struct mce *m)
+{
+	u64 col[MAX_COLUMNS_IN_HBM_ROW];
+	u64 tmp[MAX_COLUMNS_IN_HBM_ROW];
+	int i, j, count = 0;
+	unsigned long pfn;
+
+	pr_info("Identify SPA of all columns from row for MCE Addr:0x%llx\n", m->addr);
+	identify_column_spa_from_row(m, col);
+
+	/* Retire each column SPA once, skipping duplicates and failed lookups. */
+	for (i = 0; i < MAX_COLUMNS_IN_HBM_ROW; i++) {
+		if (col[i] == INVALID_SPA)
+			continue;
+		for (j = 0; j < count; j++) {
+			if (col[i] == tmp[j])
+				break;
+		}
+		if (j == count) {
+			tmp[count] = col[i];
+			pr_debug("Retire column spa:0x%llx\n", tmp[count]);
+			pfn = PHYS_PFN(tmp[count]);
+			memory_failure(pfn, 0);
+			count++;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(amd_umc_retire_column_spa_from_row);
@@ -25,4 +25,8 @@ static inline int amd_umc_mca_addr_to_sys_addr(struct mce *m, u64 *sys_addr)
return 0;
}
+
+/* Retire all column SPAs of the HBM v3 row containing the MCE address. */
+void amd_umc_retire_column_spa_from_row(struct mce *m);
+
#endif /* _AMD_ATL_H */