diff mbox series

[11/11] EDAC/ie31200: Switch Raptor Lake-S to interrupt mode

Message ID 20250226015202.36576-12-qiuxu.zhuo@intel.com (mailing list archive)
State New
Headers show
Series EDAC/ie31200: Add EDAC support for Intel Raptor Lake-S SoCs | expand

Commit Message

Zhuo, Qiuxu Feb. 26, 2025, 1:52 a.m. UTC
Raptor Lake-S SoCs notify memory errors via Machine Check. Add interrupt
mode support and switch Raptor Lake-S EDAC from polling to interrupt mode.

Tested-by: Gary Wang <gary.c.wang@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
---
 drivers/edac/Kconfig        |  2 +-
 drivers/edac/ie31200_edac.c | 83 ++++++++++++++++++++++++++++++++++---
 2 files changed, 78 insertions(+), 7 deletions(-)

Comments

Tony Luck Feb. 27, 2025, 6:01 p.m. UTC | #1
> Raptor Lake-S SoCs notify memory errors via Machine Check. Add interrupt
> mode support and switch Raptor Lake-S EDAC from polling to interrupt mode.

Is this notification #MC (a.k.a. INT#18)? Or CMCI? Or #MC for uncorrected errors
and CMCI for corrected errors?

Corrected errors -> CMCI I can understand. This code should work well for that.
Same for uncorrected patrol scrub errors -> CMCI.

Other uncorrected errors -> #MC is trickier. Does Raptor Lake support recovery
from other uncorrected errors? If it doesn't, then this driver handler will not be
called (Linux panicked and never called the functions registered on the mce
decode chain).


Which is perhaps a long way of asking whether you really mean:

   Raptor Lake-S SoCs notify memory errors via CMCI. Add interrupt
   mode support and switch Raptor Lake-S EDAC from polling to interrupt mode.

-Tony
Zhuo, Qiuxu March 3, 2025, 2:53 a.m. UTC | #2
Hi Tony,

> From: Luck, Tony <tony.luck@intel.com>
> [...]
> Subject: RE: [PATCH 11/11] EDAC/ie31200: Switch Raptor Lake-S to interrupt
> mode
> 
> > Raptor Lake-S SoCs notify memory errors via Machine Check. Add
> > interrupt mode support and switch Raptor Lake-S EDAC from polling to
> interrupt mode.
> 
> Is this notification #MC (a.k.a. INT#18)? Or CMCI? Or #MC for uncorrected
> errors and CMCI for corrected errors?

We performed correctable error injection, and the correctable error events were
notified by CMCI. 

> Corrected errors -> CMCI I can understand. This code should work well for
> that.
> Same for uncorrected patrol scrub errors -> CMCI.
> 
> Other uncorrected errors -> #MC is trickier. Does Raptor Lake support
> recovery from other uncorrected errors? If it doesn't, then this driver handler
> will not be called (Linux panicked and never called the functions registered on
> the mce decode chain).

It seems like Raptor Lake doesn't support recovery from uncorrected errors. 
When performing the uncorrectable error injection testing, the system hung 
(The validation team mentioned this was expected behavior). 
 
> 
> Which is perhaps a long way of asking whether you really mean:
> 
>    Raptor Lake-S SoCs notify memory errors via CMCI. Add interrupt

Yes, the correctable errors are via CMCI, not machine check. For uncorrectable errors,
the system is hung, and no callback to EDAC. I'll update this commit description in 
the next version. 

>    mode support and switch Raptor Lake-S EDAC from polling to interrupt
> mode.
> [...]
diff mbox series

Patch

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 2051a7c944a5..b908a118fb32 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -168,7 +168,7 @@  config EDAC_I3200
 
 config EDAC_IE31200
 	tristate "Intel e312xx"
-	depends on PCI && X86
+	depends on PCI && X86 && X86_MCE_INTEL
 	help
 	  Support for error detection and correction on the Intel
 	  E3-1200 based DRAM controllers.
diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c
index 8c0a2beec537..4abcf7b322c2 100644
--- a/drivers/edac/ie31200_edac.c
+++ b/drivers/edac/ie31200_edac.c
@@ -51,6 +51,7 @@ 
 #include <linux/edac.h>
 
 #include <linux/io-64-nonatomic-lo-hi.h>
+#include <asm/mce.h>
 #include "edac_module.h"
 
 #define EDAC_MOD_STR "ie31200_edac"
@@ -123,6 +124,7 @@  static int ie31200_registered = 1;
 
 struct res_config {
 	enum mem_type mtype;
+	bool machine_check;
 	int imc_num;
 	/* Host MMIO configuration register */
 	u64 reg_mchbar_mask;
@@ -172,6 +174,7 @@  struct ie31200_error_info {
 	u16 errsts;
 	u16 errsts2;
 	u64 eccerrlog[IE31200_CHANNELS];
+	u64 erraddr;
 };
 
 static const struct ie31200_dev_info ie31200_devs[] = {
@@ -327,13 +330,13 @@  static void ie31200_process_error_info(struct mem_ctl_info *mci,
 		log = info->eccerrlog[channel];
 		if (log & cfg->reg_eccerrlog_ue_mask) {
 			edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, 1,
-					     0, 0, 0,
+					     info->erraddr >> PAGE_SHIFT, 0, 0,
 					     field_get(cfg->reg_eccerrlog_rank_mask, log),
 					     channel, -1,
 					     "ie31200 UE", "");
 		} else if (log & cfg->reg_eccerrlog_ce_mask) {
 			edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, 1,
-					     0, 0,
+					     info->erraddr >> PAGE_SHIFT, 0,
 					     field_get(cfg->reg_eccerrlog_syndrome_mask, log),
 					     field_get(cfg->reg_eccerrlog_rank_mask, log),
 					     channel, -1,
@@ -342,14 +345,20 @@  static void ie31200_process_error_info(struct mem_ctl_info *mci,
 	}
 }
 
-static void ie31200_check(struct mem_ctl_info *mci)
+static void __ie31200_check(struct mem_ctl_info *mci, struct mce *mce)
 {
 	struct ie31200_error_info info;
 
+	info.erraddr = mce ? mce->addr : 0;
 	ie31200_get_and_clear_error_info(mci, &info);
 	ie31200_process_error_info(mci, &info);
 }
 
+static void ie31200_check(struct mem_ctl_info *mci)
+{
+	__ie31200_check(mci, NULL);
+}
+
 static void __iomem *ie31200_map_mchbar(struct pci_dev *pdev, struct res_config *cfg, int mc)
 {
 	union {
@@ -459,7 +468,7 @@  static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
 	mci->mod_name = EDAC_MOD_STR;
 	mci->ctl_name = ie31200_devs[mc].ctl_name;
 	mci->dev_name = pci_name(pdev);
-	mci->edac_check = ie31200_check;
+	mci->edac_check = cfg->machine_check ? NULL : ie31200_check;
 	mci->ctl_page_to_phys = NULL;
 	priv = mci->pvt_info;
 	priv->window = window;
@@ -499,6 +508,58 @@  static int ie31200_register_mci(struct pci_dev *pdev, struct res_config *cfg, in
 	return ret;
 }
 
+static void mce_check(struct mce *mce)
+{
+	struct ie31200_priv *priv;
+	int i;
+
+	for (i = 0; i < IE31200_IMC_NUM; i++) {
+		priv = ie31200_pvt.priv[i];
+		if (!priv)
+			continue;
+
+		__ie31200_check(priv->mci, mce);
+	}
+}
+
+static int mce_handler(struct notifier_block *nb, unsigned long val, void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	char *type;
+
+	if (mce->kflags & MCE_HANDLED_CEC)
+		return NOTIFY_DONE;
+
+	/*
+	 * Ignore unless this is a memory related error.
+	 * Don't check MCI_STATUS_ADDRV since it's not set on some CPUs.
+	 */
+	if ((mce->status & 0xefff) >> 7 != 1)
+		return NOTIFY_DONE;
+
+	type = mce->mcgstatus & MCG_STATUS_MCIP ?  "Exception" : "Event";
+
+	edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
+		 mce->extcpu, type, mce->mcgstatus,
+		 mce->bank, mce->status);
+	edac_dbg(0, "TSC 0x%llx\n", mce->tsc);
+	edac_dbg(0, "ADDR 0x%llx\n", mce->addr);
+	edac_dbg(0, "MISC 0x%llx\n", mce->misc);
+	edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
+		 mce->cpuvendor, mce->cpuid, mce->time,
+		 mce->socketid, mce->apicid);
+
+	mce_check(mce);
+	mce->kflags |= MCE_HANDLED_EDAC;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ie31200_mce_dec = {
+	.notifier_call	= mce_handler,
+	.priority	= MCE_PRIO_EDAC,
+};
+
 static void ie31200_unregister_mcis(void)
 {
 	struct ie31200_priv *priv;
@@ -534,6 +595,13 @@  static int ie31200_probe1(struct pci_dev *pdev, struct res_config *cfg)
 			goto fail_register;
 	}
 
+	if (cfg->machine_check) {
+		mce_register_decode_chain(&ie31200_mce_dec);
+		edac_op_state = EDAC_OPSTATE_INT;
+	} else {
+		edac_op_state = EDAC_OPSTATE_POLL;
+	}
+
 	/* get this far and it's successful. */
 	edac_dbg(3, "MC: success\n");
 	return 0;
@@ -560,9 +628,13 @@  static int ie31200_init_one(struct pci_dev *pdev,
 
 static void ie31200_remove_one(struct pci_dev *pdev)
 {
+	struct ie31200_priv *priv = ie31200_pvt.priv[0];
+
 	edac_dbg(0, "\n");
 	pci_dev_put(mci_pdev);
 	mci_pdev = NULL;
+	if (priv->cfg->machine_check)
+		mce_unregister_decode_chain(&ie31200_mce_dec);
 	ie31200_unregister_mcis();
 }
 
@@ -612,6 +684,7 @@  static struct res_config skl_cfg = {
 
 struct res_config rpl_s_cfg = {
 	.mtype				= MEM_DDR5,
+	.machine_check			= true,
 	.imc_num			= 2,
 	.reg_mchbar_mask		= GENMASK_ULL(41, 17),
 	.reg_mchbar_window_size		= BIT_ULL(16),
@@ -677,8 +750,6 @@  static int __init ie31200_init(void)
 	int pci_rc, i;
 
 	edac_dbg(3, "MC:\n");
-	/* Ensure that the OPSTATE is set correctly for POLL or NMI */
-	opstate_init();
 
 	pci_rc = pci_register_driver(&ie31200_driver);
 	if (pci_rc < 0)