@@ -144,6 +144,8 @@
#define XILINX_DRAM_SIZE_16G 4
#define XILINX_DRAM_SIZE_32G 5
#define NUM_UE_BITPOS 2
+#define NUM_ECC_BITS 72
+#define DATA_BITS 64
/**
* struct ecc_error_info - ECC error log information.
@@ -245,6 +247,79 @@ struct edac_priv {
#endif
};
+static int rowa[] = {
+ 0x7, 0xB, 0x13, 0x23, 0x43, 0x83, 0xD, 0x15,
+ 0x25, 0x45, 0x85, 0x19, 0x29, 0x49, 0x89, 0x31,
+ 0x51, 0x91, 0x61, 0xa1, 0xc1, 0x0e, 0x16, 0x26,
+ 0x46, 0x86, 0x1a, 0x2a, 0x4a, 0x8a, 0x32, 0x52,
+ 0x92, 0x62, 0xa2, 0xc2, 0x1c, 0x2c, 0x4c, 0x8c,
+ 0x34, 0x54, 0x94, 0x64, 0xa4, 0xc4, 0x38, 0x58,
+ 0x98, 0x68, 0xa8, 0xc8, 0x70, 0xB0, 0xD0, 0xE0,
+ 0x1f, 0x2f, 0x4f, 0x8f, 0x37, 0x57, 0x97, 0x67,
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+
+/**
+ * calculate_ce_bit_value - Report the bit that was corrupted in the CE errors
+ *
+ * @eccr0_corr_err_data_hi: ECC error data hi register
+ * @regval: ECC error data lo register.
+ * @parity: ECC error parity
+ *
+ * Return: none.
+ * Report the bit that was corrupted in the CE errors
+ *
+ * The function takes the data and the parity as inputs.
+ * It derive the corrupted bit from a fixed H Matrix and the data
+ * and parity.
+ */
+static void calculate_ce_bit_value(u32 eccr0_corr_err_data_hi, u32 regval, u32 parity)
+{
+ u8 bit, xor[BITS_PER_BYTE], match = 0;
+ u8 rows[NUM_ECC_BITS] = {0};
+ bool field, prev;
+ u64 data;
+ int i, j;
+
+ data = (u64)eccr0_corr_err_data_hi << 32;
+ data |= regval;
+
+ for (i = 0 ; i < DATA_BITS; i++) {
+ if (data & BIT(i))
+ rows[i] = rowa[i];
+ }
+
+ for (i = DATA_BITS ; i < NUM_ECC_BITS; i++) {
+ j = i - DATA_BITS;
+ if (parity & BIT(j))
+ rows[i] = rowa[i];
+ }
+
+ for (j = 0 ; j < BITS_PER_BYTE; j++) {
+ bit = rows[0] & BIT(j);
+ prev = !!bit;
+
+ for (i = 1 ; i < NUM_ECC_BITS; i++) {
+ bit = rows[i] & BIT(j);
+ field = !!bit;
+ if (field != prev)
+ prev = true;
+ else
+ prev = false;
+ }
+ xor[j] = prev;
+ }
+
+ for (j = 0 ; j < BITS_PER_BYTE; j++)
+ match |= xor[j] << j;
+
+ for (i = 0 ; i < NUM_ECC_BITS; i++) {
+ if (match == rowa[i]) {
+ edac_dbg(2, "bit found is 0x%08X\n", i);
+ return;
+ }
+ }
+}
+
static void get_ce_error_info(struct edac_priv *priv)
{
void __iomem *ddrmc_base;
@@ -271,6 +346,15 @@ static void get_ce_error_info(struct edac_priv *priv)
p->ceinfo[1].i = regval | reghi << 32;
regval = readl(ddrmc_base + ECCR1_CE_ADDR_HI_OFFSET);
+ if (IS_ENABLED(CONFIG_EDAC_DEBUG)) {
+ u32 eccr0_corr_err_data_hi, par;
+
+ par = readl(ddrmc_base + ECCR0_CE_DATA_PAR_OFFSET);
+ regval = readl(ddrmc_base + ECCR0_CE_DATA_LO_OFFSET);
+ eccr0_corr_err_data_hi = readl(ddrmc_base + ECCR0_CE_DATA_HI_OFFSET);
+ calculate_ce_bit_value(eccr0_corr_err_data_hi, regval, par);
+ }
+
edac_dbg(2, "ERR DATA: 0x%08X%08X ERR DATA PARITY: 0x%08X\n",
readl(ddrmc_base + ECCR1_CE_DATA_LO_OFFSET),
readl(ddrmc_base + ECCR1_CE_DATA_HI_OFFSET),
In the case of CE, the bit position can be derived from the data and the parity registers. The values are read and then from the fixed H Matrix the corrected bit position is derived. The hardware already corrects the CE errors however it is derived from the registers and reported for debug purposes. When a correctable error is detected, the driver reads from the registers: eccr0/eccr1_corr_err_data_lo eccr0/eccr1_corr_err_data_hi eccr0/eccr1_corr_err_data_par and along with shifting/XORing fixed H_Matrix ECC values, calculates which bit location is found, mimicking the calculation done in the DDRMC ECC hw. The H_Matrix was generated from an old 1950's IBM paper. The maximum bits in the data read is 64 so the data match is run for 64 bits. Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@amd.com> --- drivers/edac/versal_edac.c | 84 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+)