diff mbox series

[v7,15/17] cxl/pci: Add support to assign and clear pci_driver::cxl_err_handlers

Message ID 20250211192444.2292833-16-terry.bowman@amd.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Feb. 11, 2025, 7:24 p.m. UTC
pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
The handlers can't be set in the pci_driver static definition because the
CXL PCIe Port devices are bound to the portdrv driver which is not CXL
driver aware.

Add cxl_assign_port_error_handlers() in the cxl_core module. This
function will assign the default handlers for a CXL PCIe Port device.

When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
pci_driver::cxl_err_handlers must be set to NULL indicating they should no
longer be used.

Create cxl_clear_port_error_handlers() and register it to be called
when the CXL Port device (cxl_port or cxl_dport) is destroyed.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Gregory Price <gourry@gourry.net>
---
 drivers/cxl/core/pci.c | 59 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

Comments

Dave Jiang Feb. 12, 2025, 12:38 a.m. UTC | #1
On 2/11/25 12:24 PM, Terry Bowman wrote:
> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
> The handlers can't be set in the pci_driver static definition because the
> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
> driver aware.
> 
> Add cxl_assign_port_error_handlers() in the cxl_core module. This
> function will assign the default handlers for a CXL PCIe Port device.
> 
> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
> longer be used.
> 
> Create cxl_clear_port_error_handlers() and register it to be called
> when the CXL Port device (cxl_port or cxl_dport) is destroyed.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>

Reviewed-by: Dave Jiang <dave.jiang@intel.com>
> ---
>  drivers/cxl/core/pci.c | 59 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 57 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index f154dcf6dfda..03ae21a944e0 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -860,8 +860,39 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
>  	return __cxl_handle_ras(dev, &pdev->dev, ras_base);
>  }
>  
> +static const struct cxl_error_handlers cxl_port_error_handlers = {
> +	.error_detected	= cxl_port_error_detected,
> +	.cor_error_detected = cxl_port_cor_error_detected,
> +};
> +
> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
> +{
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver || !get_device(&pdev->dev))
> +		return;
> +
> +	pdrv = pdev->driver;
> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;
> +	put_device(&pdev->dev);
> +}
> +
> +static void cxl_clear_port_error_handlers(void *data)
> +{
> +	struct pci_dev *pdev = data;
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver || !get_device(&pdev->dev))
> +		return;
> +
> +	pdrv = pdev->driver;
> +	pdrv->cxl_err_handler = NULL;
> +	put_device(&pdev->dev);
> +}
> +
>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  {
> +	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
>  
>  	/* uport may have more than 1 downstream EP. Check if already mapped. */
>  	mutex_lock(&ras_init_mutex);
> @@ -872,9 +903,15 @@ void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  
>  	port->reg_map.host = &port->dev;
>  	if (cxl_map_component_regs(&port->reg_map, &port->uport_regs,
> -				   BIT(CXL_CM_CAP_CAP_ID_RAS)))
> +				   BIT(CXL_CM_CAP_CAP_ID_RAS))) {
>  		dev_err(&port->dev, "Failed to map RAS capability\n");
> +		mutex_unlock(&ras_init_mutex);
> +		return;
> +	}
>  	mutex_unlock(&ras_init_mutex);
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
>  
> @@ -886,6 +923,8 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  {
>  	struct device *dport_dev = dport->dport_dev;
>  	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
> +	struct pci_dev *pdev = to_pci_dev(dport_dev);
> +	struct cxl_port *port;
>  
>  	dport->reg_map.host = dport_dev;
>  	if (dport->rch && host_bridge->native_aer) {
> @@ -901,9 +940,25 @@ void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
>  	}
>  
>  	if (cxl_map_component_regs(&dport->reg_map, &dport->regs.component,
> -				   BIT(CXL_CM_CAP_CAP_ID_RAS)))
> +				   BIT(CXL_CM_CAP_CAP_ID_RAS))) {
>  		dev_err(dport_dev, "Failed to map RAS capability\n");
> +		mutex_unlock(&ras_init_mutex);
> +		return;
> +	}
>  	mutex_unlock(&ras_init_mutex);
> +
> +	if (dport->rch)
> +		return;
> +
> +	port = find_cxl_port(dport_dev, NULL);
> +	if (!port) {
> +		dev_err(dport_dev, "Failed to find upstream port\n");
> +		return;
> +	}
> +
> +	cxl_assign_port_error_handlers(pdev);
> +	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
> +	put_device(&port->dev);
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");
>
Dan Williams Feb. 14, 2025, 2:29 a.m. UTC | #2
Terry Bowman wrote:
> pci_driver::cxl_err_handlers are not currently assigned handler callbacks.
> The handlers can't be set in the pci_driver static definition because the
> CXL PCIe Port devices are bound to the portdrv driver which is not CXL
> driver aware.
> 
> Add cxl_assign_port_error_handlers() in the cxl_core module. This
> function will assign the default handlers for a CXL PCIe Port device.
> 
> When the CXL Port (cxl_port or cxl_dport) is destroyed the device's
> pci_driver::cxl_err_handlers must be set to NULL indicating they should no
> longer be used.
> 
> Create cxl_clear_port_error_handlers() and register it to be called
> when the CXL Port device (cxl_port or cxl_dport) is destroyed.

This is another complication that naturally goes away when
cxl_error_handlers are instances that get attached to 'struct
cxl_driver' instances rather than 'struct pci_driver' instances.

> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Reviewed-by: Ira Weiny <ira.weiny@intel.com>
> Reviewed-by: Gregory Price <gourry@gourry.net>
> ---
>  drivers/cxl/core/pci.c | 59 ++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 57 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index f154dcf6dfda..03ae21a944e0 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -860,8 +860,39 @@ static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
>  	return __cxl_handle_ras(dev, &pdev->dev, ras_base);
>  }
>  
> +static const struct cxl_error_handlers cxl_port_error_handlers = {
> +	.error_detected	= cxl_port_error_detected,
> +	.cor_error_detected = cxl_port_cor_error_detected,
> +};
> +
> +static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
> +{
> +	struct pci_driver *pdrv;
> +
> +	if (!pdev || !pdev->driver || !get_device(&pdev->dev))
> +		return;
> +
> +	pdrv = pdev->driver;
> +	pdrv->cxl_err_handler = &cxl_port_error_handlers;

Nothing is holding the @pdev device_lock(), so @pdev->driver may go NULL
immediately after reading it.

Also, it is possible for a 'struct cxl_port' to exist even though its
uport_dev (pci_dev) is not attached to a driver. This would seem to
result in unpredictable behavior from one kernel to the next as the PCIe
portdrv situation evolves.

Lastly, I do not like the precedent of not being able to read a 'struct
pci_driver' template and be assured that it captures all possible error
handlers or, even worse, of unceremoniously overriding a PCI driver
that thinks it knows what the CXL error handlers should be.
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index f154dcf6dfda..03ae21a944e0 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -860,8 +860,39 @@  static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
 	return __cxl_handle_ras(dev, &pdev->dev, ras_base);
 }
 
+static const struct cxl_error_handlers cxl_port_error_handlers = {
+	.error_detected	= cxl_port_error_detected,
+	.cor_error_detected = cxl_port_cor_error_detected,
+};
+
+static void cxl_assign_port_error_handlers(struct pci_dev *pdev)
+{
+	struct pci_driver *pdrv;
+
+	if (!pdev || !pdev->driver || !get_device(&pdev->dev))
+		return;
+
+	pdrv = pdev->driver;
+	pdrv->cxl_err_handler = &cxl_port_error_handlers;
+	put_device(&pdev->dev);
+}
+
+static void cxl_clear_port_error_handlers(void *data)
+{
+	struct pci_dev *pdev = data;
+	struct pci_driver *pdrv;
+
+	if (!pdev || !pdev->driver || !get_device(&pdev->dev))
+		return;
+
+	pdrv = pdev->driver;
+	pdrv->cxl_err_handler = NULL;
+	put_device(&pdev->dev);
+}
+
 void cxl_uport_init_ras_reporting(struct cxl_port *port)
 {
+	struct pci_dev *pdev = to_pci_dev(port->uport_dev);
 
 	/* uport may have more than 1 downstream EP. Check if already mapped. */
 	mutex_lock(&ras_init_mutex);
@@ -872,9 +903,15 @@  void cxl_uport_init_ras_reporting(struct cxl_port *port)
 
 	port->reg_map.host = &port->dev;
 	if (cxl_map_component_regs(&port->reg_map, &port->uport_regs,
-				   BIT(CXL_CM_CAP_CAP_ID_RAS)))
+				   BIT(CXL_CM_CAP_CAP_ID_RAS))) {
 		dev_err(&port->dev, "Failed to map RAS capability\n");
+		mutex_unlock(&ras_init_mutex);
+		return;
+	}
 	mutex_unlock(&ras_init_mutex);
+
+	cxl_assign_port_error_handlers(pdev);
+	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_uport_init_ras_reporting, "CXL");
 
@@ -886,6 +923,8 @@  void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
 {
 	struct device *dport_dev = dport->dport_dev;
 	struct pci_host_bridge *host_bridge = to_pci_host_bridge(dport_dev);
+	struct pci_dev *pdev = to_pci_dev(dport_dev);
+	struct cxl_port *port;
 
 	dport->reg_map.host = dport_dev;
 	if (dport->rch && host_bridge->native_aer) {
@@ -901,9 +940,25 @@  void cxl_dport_init_ras_reporting(struct cxl_dport *dport)
 	}
 
 	if (cxl_map_component_regs(&dport->reg_map, &dport->regs.component,
-				   BIT(CXL_CM_CAP_CAP_ID_RAS)))
+				   BIT(CXL_CM_CAP_CAP_ID_RAS))) {
 		dev_err(dport_dev, "Failed to map RAS capability\n");
+		mutex_unlock(&ras_init_mutex);
+		return;
+	}
 	mutex_unlock(&ras_init_mutex);
+
+	if (dport->rch)
+		return;
+
+	port = find_cxl_port(dport_dev, NULL);
+	if (!port) {
+		dev_err(dport_dev, "Failed to find upstream port\n");
+		return;
+	}
+
+	cxl_assign_port_error_handlers(pdev);
+	devm_add_action_or_reset(&port->dev, cxl_clear_port_error_handlers, pdev);
+	put_device(&port->dev);
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dport_init_ras_reporting, "CXL");