diff mbox series

EDAC/amd64: Include MCA error codes in EDAC message

Message ID 20220622160800.1293328-1-yazen.ghannam@amd.com (mailing list archive)
State New, archived
Headers show
Series EDAC/amd64: Include MCA error codes in EDAC message | expand

Commit Message

Yazen Ghannam June 22, 2022, 4:08 p.m. UTC
The AMD64 EDAC module does not include MCA information in its output.
Users and tooling that gather memory error information only from EDAC
will lose the MCA information.

Print the ErrorCode and ErrorCodeExt fields from MCA_STATUS as part of
the EDAC message, so that relevant memory error information is available
from a single source.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 drivers/edac/amd64_edac.c | 11 ++++++++++-
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

Comments

kernel test robot June 22, 2022, 10:42 p.m. UTC | #1
Hi Yazen,

I love your patch! Perhaps something to improve:

[auto build test WARNING on ras/edac-for-next]
[also build test WARNING on linus/master]
[If your patch was applied to the wrong git tree, kindly drop us a note.
When submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/Yazen-Ghannam/EDAC-amd64-Include-MCA-error-codes-in-EDAC-message/20220623-001158
base:   https://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20220623/202206230649.5mB3KFtF-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-3) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/f791cdde2f3ca52076ed5d1185138b80d4d783bf
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Yazen-Ghannam/EDAC-amd64-Include-MCA-error-codes-in-EDAC-message/20220623-001158
        git checkout f791cdde2f3ca52076ed5d1185138b80d4d783bf
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/edac/

If you fix the issue, kindly add the following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   drivers/edac/amd64_edac.c: In function '__log_ecc_error':
>> drivers/edac/amd64_edac.c:3179:13: warning: variable 'len' set but not used [-Wunused-but-set-variable]
    3179 |         int len;
         |             ^~~


vim +/len +3179 drivers/edac/amd64_edac.c

  3173	
  3174	static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
  3175				    u8 ecc_type)
  3176	{
  3177		enum hw_event_mc_err_type err_type;
  3178		const char *string;
> 3179		int len;
  3180	
  3181		if (ecc_type == 2)
  3182			err_type = HW_EVENT_ERR_CORRECTED;
  3183		else if (ecc_type == 1)
  3184			err_type = HW_EVENT_ERR_UNCORRECTED;
  3185		else if (ecc_type == 3)
  3186			err_type = HW_EVENT_ERR_DEFERRED;
  3187		else {
  3188			WARN(1, "Something is rotten in the state of Denmark.\n");
  3189			return;
  3190		}
  3191	
  3192		switch (err->err_code) {
  3193		case DECODE_OK:
  3194			string = "";
  3195			break;
  3196		case ERR_NODE:
  3197			string = "Failed to map error addr to a node";
  3198			break;
  3199		case ERR_CSROW:
  3200			string = "Failed to map error addr to a csrow";
  3201			break;
  3202		case ERR_CHANNEL:
  3203			string = "Unknown syndrome - possible error reporting race";
  3204			break;
  3205		case ERR_SYND:
  3206			string = "MCA_SYND not valid - unknown syndrome and csrow";
  3207			break;
  3208		case ERR_NORM_ADDR:
  3209			string = "Cannot decode normalized address";
  3210			break;
  3211		default:
  3212			string = "WTF error";
  3213			break;
  3214		}
  3215	
  3216		len = snprintf(msg, MSG_SIZE, "err_code:0x%04x:0x%04x", err->xec, err->ec);
  3217	
  3218		edac_mc_handle_error(err_type, mci, 1,
  3219				     err->page, err->offset, err->syndrome,
  3220				     err->csrow, err->channel, -1,
  3221				     string, msg);
  3222	}
  3223
diff mbox series

Patch

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 2f854feeeb23..7905cfd34cd0 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3168,11 +3168,15 @@  static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
 	return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
 }
 
+#define MSG_SIZE		1024
+static char msg[MSG_SIZE];
+
 static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
 			    u8 ecc_type)
 {
 	enum hw_event_mc_err_type err_type;
 	const char *string;
+	int len;
 
 	if (ecc_type == 2)
 		err_type = HW_EVENT_ERR_CORRECTED;
@@ -3209,10 +3213,12 @@  static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
 		break;
 	}
 
+	len = snprintf(msg, MSG_SIZE, "err_code:0x%04x:0x%04x", err->xec, err->ec);
+
 	edac_mc_handle_error(err_type, mci, 1,
 			     err->page, err->offset, err->syndrome,
 			     err->csrow, err->channel, -1,
-			     string, "");
+			     string, msg);
 }
 
 static inline void decode_bus_error(int node_id, struct mce *m)
@@ -3281,6 +3287,9 @@  static void decode_umc_error(int node_id, struct mce *m)
 
 	memset(&err, 0, sizeof(err));
 
+	err.ec  = EC(m->status);
+	err.xec = XEC(m->status, 0x3f);
+
 	if (m->status & MCI_STATUS_DEFERRED)
 		ecc_type = 3;
 
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 38e5ad95d010..a49d797b7322 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -422,6 +422,8 @@  struct err_info {
 	struct mem_ctl_info *src_mci;
 	int csrow;
 	int channel;
+	u16 ec;
+	u16 xec;
 	u16 syndrome;
 	u32 page;
 	u32 offset;