diff mbox series

[v8,13/16] cxl/pci: Assign CXL Endpoint protocol error handlers

Message ID 20250327014717.2988633-14-terry.bowman@amd.com
State New
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry March 27, 2025, 1:47 a.m. UTC
CXL Endpoint protocol errors are currently handled using PCI error
handlers. The CXL Endpoint requires CXL-specific handling of
uncorrectable errors, which the PCI handlers do not provide.

Add CXL-specific handlers for CXL Endpoints. Assign the CXL handlers
during Endpoint Port initialization.

Keep the PCI Endpoint handlers. PCI handlers can be called if the CXL
device is not trained for alternate protocol (CXL). Update the CXL
Endpoint PCI handlers to call the CXL handler. If the CXL
uncorrectable handler returns PCI_ERS_RESULT_PANIC then the PCI
handler invokes panic().

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c | 65 ++++++++++++++++++++++++------------------
 drivers/cxl/cxl.h      |  5 ++++
 drivers/cxl/cxlpci.h   |  4 +--
 drivers/cxl/pci.c      |  8 +++---
 drivers/cxl/port.c     |  7 +++++
 5 files changed, 56 insertions(+), 33 deletions(-)

Comments

kernel test robot March 27, 2025, 7:46 p.m. UTC | #1
Hi Terry,

kernel test robot noticed the following build warnings:

[auto build test WARNING on aae0594a7053c60b82621136257c8b648c67b512]

url:    https://github.com/intel-lab-lkp/linux/commits/Terry-Bowman/PCI-CXL-Introduce-PCIe-helper-function-pcie_is_cxl/20250327-095738
base:   aae0594a7053c60b82621136257c8b648c67b512
patch link:    https://lore.kernel.org/r/20250327014717.2988633-14-terry.bowman%40amd.com
patch subject: [PATCH v8 13/16] cxl/pci: Assign CXL Endpoint protocol error handlers
config: csky-randconfig-r122-20250327 (https://download.01.org/0day-ci/archive/20250328/202503280346.euKvcovE-lkp@intel.com/config)
compiler: csky-linux-gcc (GCC) 12.4.0
reproduce: (https://download.01.org/0day-ci/archive/20250328/202503280346.euKvcovE-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503280346.euKvcovE-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> drivers/cxl/port.c:68:33: sparse: sparse: symbol 'cxl_ep_error_handlers' was not declared. Should it be static?

vim +/cxl_ep_error_handlers +68 drivers/cxl/port.c

    67	
  > 68	const struct cxl_error_handlers cxl_ep_error_handlers = {
    69		.error_detected = cxl_error_detected,
    70		.cor_error_detected = cxl_cor_error_detected,
    71	};
    72
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 9ed6f700e132..f2139b382839 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -852,10 +852,10 @@  static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds)
 static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { }
 #endif
 
-void cxl_cor_error_detected(struct pci_dev *pdev)
+void cxl_cor_error_detected(struct device *dev, struct cxl_prot_error_info *err_info)
 {
+	struct pci_dev *pdev = err_info->pdev;
 	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct device *dev = &cxlds->cxlmd->dev;
 
 	scoped_guard(device, dev) {
 		if (!dev->driver) {
@@ -873,20 +873,30 @@  void cxl_cor_error_detected(struct pci_dev *pdev)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, "CXL");
 
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
-				    pci_channel_state_t state)
+void pci_cor_error_detected(struct pci_dev *pdev)
+{
+	struct cxl_prot_error_info err_info;
+
+	if (cxl_create_prot_err_info(pdev, AER_CORRECTABLE, &err_info))
+		return;
+
+	cxl_cor_error_detected(err_info.dev, &err_info);
+}
+EXPORT_SYMBOL_NS_GPL(pci_cor_error_detected, "CXL");
+
+pci_ers_result_t cxl_error_detected(struct device *dev,
+				    struct cxl_prot_error_info *err_info)
 {
-	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
-	struct cxl_memdev *cxlmd = cxlds->cxlmd;
-	struct device *dev = &cxlmd->dev;
 	bool ue;
+	struct pci_dev *pdev = err_info->pdev;
+	struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
 
 	scoped_guard(device, dev) {
 		if (!dev->driver) {
 			dev_warn(&pdev->dev,
 				 "%s: memdev disabled, abort error handling\n",
 				 dev_name(dev));
-			return PCI_ERS_RESULT_DISCONNECT;
+			return PCI_ERS_RESULT_PANIC;
 		}
 
 		if (cxlds->rcd)
@@ -900,29 +910,30 @@  pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
 		ue = cxl_handle_endpoint_ras(cxlds);
 	}
 
+	if (ue)
+		return PCI_ERS_RESULT_PANIC;
 
-	switch (state) {
-	case pci_channel_io_normal:
-		if (ue) {
-			device_release_driver(dev);
-			return PCI_ERS_RESULT_NEED_RESET;
-		}
-		return PCI_ERS_RESULT_CAN_RECOVER;
-	case pci_channel_io_frozen:
-		dev_warn(&pdev->dev,
-			 "%s: frozen state error detected, disable CXL.mem\n",
-			 dev_name(dev));
-		device_release_driver(dev);
-		return PCI_ERS_RESULT_NEED_RESET;
-	case pci_channel_io_perm_failure:
-		dev_warn(&pdev->dev,
-			 "failure state error detected, request disconnect\n");
-		return PCI_ERS_RESULT_DISCONNECT;
-	}
-	return PCI_ERS_RESULT_NEED_RESET;
+	return PCI_ERS_RESULT_CAN_RECOVER;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_error_detected, "CXL");
 
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
+				    pci_channel_state_t error)
+{
+	struct cxl_prot_error_info err_info;
+	pci_ers_result_t rc;
+
+	if (cxl_create_prot_err_info(pdev, AER_FATAL, &err_info))
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	rc = cxl_error_detected(err_info.dev, &err_info);
+	if (rc == PCI_ERS_RESULT_PANIC)
+		panic("CXL cachemem error.");
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(pci_error_detected, "CXL");
+
 static int cxl_flit_size(struct pci_dev *pdev)
 {
 	if (cxl_pci_flit_256(pdev))
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 512cc38892ed..c1adf8a3cb9e 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -815,6 +815,11 @@  void cxl_port_cor_error_detected(struct device *dev,
 pci_ers_result_t cxl_port_error_detected(struct device *dev,
 					 struct cxl_prot_error_info *err_info);
 
+void cxl_cor_error_detected(struct device *dev,
+			    struct cxl_prot_error_info *err_info);
+pci_ers_result_t cxl_error_detected(struct device *dev,
+				    struct cxl_prot_error_info *err_info);
+
 /**
  * struct cxl_endpoint_dvsec_info - Cached DVSEC info
  * @mem_enabled: cached value of mem_enabled in the DVSEC at init time
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 92d72c0423ab..d277cf048eba 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -133,8 +133,8 @@  struct cxl_dev_state;
 int cxl_hdm_decode_init(struct cxl_dev_state *cxlds, struct cxl_hdm *cxlhdm,
 			struct cxl_endpoint_dvsec_info *info);
 void read_cdat_data(struct cxl_port *port);
-void cxl_cor_error_detected(struct pci_dev *pdev);
-pci_ers_result_t cxl_error_detected(struct pci_dev *pdev,
+void pci_cor_error_detected(struct pci_dev *pdev);
+pci_ers_result_t pci_error_detected(struct pci_dev *pdev,
 				    pci_channel_state_t state);
 int cxl_create_prot_err_info(struct pci_dev *_pdev, int severity,
 			     struct cxl_prot_error_info *err_info);
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 4288f4814cc5..c5be4422748e 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -1108,11 +1108,11 @@  static void cxl_reset_done(struct pci_dev *pdev)
 	}
 }
 
-static const struct pci_error_handlers cxl_error_handlers = {
-	.error_detected	= cxl_error_detected,
+static const struct pci_error_handlers pci_error_handlers = {
+	.error_detected = pci_error_detected,
 	.slot_reset	= cxl_slot_reset,
 	.resume		= cxl_error_resume,
-	.cor_error_detected	= cxl_cor_error_detected,
+	.cor_error_detected	= pci_cor_error_detected,
 	.reset_done	= cxl_reset_done,
 };
 
@@ -1120,7 +1120,7 @@  static struct pci_driver cxl_pci_driver = {
 	.name			= KBUILD_MODNAME,
 	.id_table		= cxl_mem_pci_tbl,
 	.probe			= cxl_pci_probe,
-	.err_handler		= &cxl_error_handlers,
+	.err_handler		= &pci_error_handlers,
 	.dev_groups		= cxl_rcd_groups,
 	.driver	= {
 		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index 30a4bdb88c31..8e2b70e73582 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -65,6 +65,11 @@  static const struct cxl_error_handlers cxl_port_error_handlers = {
 	.cor_error_detected = cxl_port_cor_error_detected,
 };
 
+static const struct cxl_error_handlers cxl_ep_error_handlers = {
+	.error_detected = cxl_error_detected,
+	.cor_error_detected = cxl_cor_error_detected,
+};
+
 static void cxl_assign_error_handlers(struct device *_dev,
 				      const struct cxl_error_handlers *handlers)
 {
@@ -203,6 +208,8 @@  static void cxl_endpoint_port_init_ras(struct cxl_port *port)
 	}
 
 	cxl_dport_init_ras_reporting(dport, cxlmd_dev);
+
+	cxl_assign_error_handlers(cxlmd_dev, &cxl_ep_error_handlers);
 }
 
 #else