diff mbox series

[07/15] cxl/aer/pci: Add CXL PCIe port uncorrectable error recovery in AER service driver

Message ID 20241008221657.1130181-8-terry.bowman@amd.com
State Superseded
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Oct. 8, 2024, 10:16 p.m. UTC
The current pcie_do_recovery() handles device recovery as a result of
uncorrectable errors (UCE). However, CXL port devices require unique
recovery handling.

Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
specific handling to the new recovery function.

The CXL port UCE recovery must invoke the AER service driver's CXL port
UCE callback. This is different than the standard pcie_do_recovery()
recovery that calls the pci_driver::err_handler UCE handler instead.

Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
"recover" the error. A panic is called instead of attempting recovery
to avoid potential system corruption.

The uncorrectable support added here will be used to complete CXL PCIe
port error handling in the future.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/pci/pci.h      |   5 ++
 drivers/pci/pcie/aer.c |   5 +-
 drivers/pci/pcie/err.c | 150 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 159 insertions(+), 1 deletion(-)

Comments

Jonathan Cameron Oct. 16, 2024, 4:54 p.m. UTC | #1
On Tue, 8 Oct 2024 17:16:49 -0500
Terry Bowman <terry.bowman@amd.com> wrote:

> The current pcie_do_recovery() handles device recovery as result of
> uncorrectable errors (UCE). But, CXL port devices require unique
> recovery handling.
> 
> Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
> specific handling to the new recovery function.
> 
> The CXL port UCE recovery must invoke the AER service driver's CXL port
> UCE callback. This is different than the standard pcie_do_recovery()
> recovery that calls the pci_driver::err_handler UCE handler instead.
> 
> Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
> "recover" the error. A panic is called instead of attempting recovery
> to avoid potential system corruption.
> 
> The uncorrectable support added here will be used to complete CXL PCIe
> port error handling in the future.
> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>

Hi Terry,

I'm a little bothered by the subtle difference in the bus walks
in here vs the existing cases. If we need them, comments are needed
to explain why.

If we are going to have separate handling, see if you can share
a lot more of the code by factoring out common functions for
the pci and cxl handling with callbacks to handle the differences.

I've managed to get my head around this code a few times in the past
(I think!) and really don't fancy having two subtle variants to
consider next time we get a bug :( The RC_EC additions hurt my head.

Jonathan

>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 31090770fffc..de12f2eb19ef 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -86,6 +86,63 @@ static int report_error_detected(struct pci_dev *dev,
>  	return 0;
>  }
>  
> +static int cxl_report_error_detected(struct pci_dev *dev,
> +				     pci_channel_state_t state,
> +				     enum pci_ers_result *result)
> +{
> +	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
> +	struct pci_driver *pdrv;
> +	pci_ers_result_t vote;
> +
> +	device_lock(&dev->dev);
> +	cxl_port_hndlrs = find_cxl_port_hndlrs();

Can we refactor to have a common function under this and report_error_detected()?

> +	pdrv = dev->driver;
> +	if (pci_dev_is_disconnected(dev)) {
> +		vote = PCI_ERS_RESULT_DISCONNECT;
> +	} else if (!pci_dev_set_io_state(dev, state)) {
> +		pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
> +			dev->error_state, state);
> +		vote = PCI_ERS_RESULT_NONE;
> +	} else if (!cxl_port_hndlrs || !cxl_port_hndlrs->error_detected) {
> +		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
> +			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
> +			pci_info(dev, "can't recover (no error_detected callback)\n");
> +		} else {
> +			vote = PCI_ERS_RESULT_NONE;
> +		}
> +	} else {
> +		vote = cxl_port_hndlrs->error_detected(dev, state);
> +	}
> +	pci_uevent_ers(dev, vote);
> +	*result = merge_result(*result, vote);
> +	device_unlock(&dev->dev);
> +	return 0;
> +}

>  static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data)
>  {
>  	pm_runtime_get_sync(&pdev->dev);
> @@ -188,6 +245,28 @@ static void pci_walk_bridge(struct pci_dev *bridge,
>  		cb(bridge, userdata);
>  }
>  
> +/**
> + * cxl_walk_bridge - walk bridges potentially AER affected
> + * @bridge:	bridge which may be a Port, an RCEC, or an RCiEP
> + * @cb:		callback to be called for each device found
> + * @userdata:	arbitrary pointer to be passed to callback
> + *
> + * If the device provided is a bridge, walk the subordinate bus, including
> + * the device itself and any bridged devices on buses under this bus.  Call
> + * the provided callback on each device found.
> + *
> + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
> + * call the callback on the device itself.
only call the callback on the device itself.

(as you call it as stated above either way).

> + */
> +static void cxl_walk_bridge(struct pci_dev *bridge,
> +			    int (*cb)(struct pci_dev *, void *),
> +			    void *userdata)
> +{
> +	cb(bridge, userdata);
> +	if (bridge->subordinate)
> +		pci_walk_bus(bridge->subordinate, cb, userdata);
The difference between this and pci_walk_bridge() is subtle and
I'd like to avoid having both if we can.

> +}
> +
>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>  		pci_channel_state_t state,
>  		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> @@ -276,3 +355,74 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>  
>  	return status;
>  }
> +
> +pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
> +				 pci_channel_state_t state,
> +				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> +{
> +	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
> +	int type = pci_pcie_type(bridge);
> +
> +	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
> +	    (type != PCI_EXP_TYPE_RC_EC) &&
> +	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
> +	    (type != PCI_EXP_TYPE_UPSTREAM)) {
> +		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
> +		return status;
> +	}
> +

Would a trick similar to the one in pcie_do_recovery() work here: for the
upstream and downstream ports use pci_upstream_bridge(), and for the others
pass the dev into pci_walk_bridge()?

> +	cxl_walk_bridge(bridge, pci_pm_runtime_get_sync, NULL);
> +
> +	pci_dbg(bridge, "broadcast error_detected message\n");
> +	if (state == pci_channel_io_frozen) {
> +		cxl_walk_bridge(bridge, cxl_report_frozen_detected, &status);
> +		if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) {
> +			pci_warn(bridge, "subordinate device reset failed\n");
> +			goto failed;
> +		}
> +	} else {
> +		cxl_walk_bridge(bridge, cxl_report_normal_detected, &status);
> +	}
> +
> +	if (status == PCI_ERS_RESULT_PANIC)
> +		panic("CXL cachemem error. Invoking panic");
> +
> +	if (status == PCI_ERS_RESULT_CAN_RECOVER) {
> +		status = PCI_ERS_RESULT_RECOVERED;
> +		pci_dbg(bridge, "broadcast mmio_enabled message\n");
> +		cxl_walk_bridge(bridge, report_mmio_enabled, &status);
> +	}
> +
> +	if (status == PCI_ERS_RESULT_NEED_RESET) {
> +		status = PCI_ERS_RESULT_RECOVERED;
> +		pci_dbg(bridge, "broadcast slot_reset message\n");
> +		report_slot_reset(bridge, &status);
> +		pci_walk_bridge(bridge, report_slot_reset, &status);
> +	}
> +
> +	if (status != PCI_ERS_RESULT_RECOVERED)
> +		goto failed;
> +
> +	pci_dbg(bridge, "broadcast resume message\n");
> +	cxl_walk_bridge(bridge, report_resume, &status);
> +
> +	if (host->native_aer || pcie_ports_native) {
> +		pcie_clear_device_status(bridge);
> +		pci_aer_clear_nonfatal_status(bridge);
> +	}
> +
> +	cxl_walk_bridge(bridge, pci_pm_runtime_put, NULL);
> +
> +	pci_info(bridge, "device recovery successful\n");
> +	return status;
> +
> +failed:
> +	cxl_walk_bridge(bridge, pci_pm_runtime_put, NULL);
> +
> +	pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT);
> +
> +	pci_info(bridge, "device recovery failed\n");
> +
> +	return status;
> +}
Bowman, Terry Oct. 16, 2024, 6:07 p.m. UTC | #2
Hi Jonathan,

On 10/16/24 11:54, Jonathan Cameron wrote:
> On Tue, 8 Oct 2024 17:16:49 -0500
> Terry Bowman <terry.bowman@amd.com> wrote:
> 
>> The current pcie_do_recovery() handles device recovery as result of
>> uncorrectable errors (UCE). But, CXL port devices require unique
>> recovery handling.
>>
>> Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
>> specific handling to the new recovery function.
>>
>> The CXL port UCE recovery must invoke the AER service driver's CXL port
>> UCE callback. This is different than the standard pcie_do_recovery()
>> recovery that calls the pci_driver::err_handler UCE handler instead.
>>
>> Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
>> "recover" the error. A panic is called instead of attempting recovery
>> to avoid potential system corruption.
>>
>> The uncorrectable support added here will be used to complete CXL PCIe
>> port error handling in the future.
>>
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> 
> Hi Terry,
> 
> I'm a little bothered by the subtle difference in the bus walks
> in here vs the existing cases. If we need them, comments needed
> to explain why.
> 

Yes, I will add more details in the commit message about "why".
I added explanation following your below comment.

> If we are going to have separate handling, see if you can share
> a lot more of the code by factoring out common functions for
> the pci and cxl handling with callbacks to handle the differences.
> 

Dan requested separate paths for the PCIe and CXL recovery. The intent,
as I understand, is to isolate the handling of PCIe and CXL protocol 
errors. This is to create 2 different classes of protocol errors.

> I've managed to get my head around this code a few times in the past
> (I think!) and really don't fancy having two subtle variants to
> consider next time we get a bug :( The RC_EC additions hurt my head.
> 
> Jonathan

Right, the UCE recovery logic is not straightforward. The code can be
refactored to take advantage of reuse. I'm interested in your thoughts
after I have provided some responses here.

> 
>>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>> index 31090770fffc..de12f2eb19ef 100644
>> --- a/drivers/pci/pcie/err.c
>> +++ b/drivers/pci/pcie/err.c
>> @@ -86,6 +86,63 @@ static int report_error_detected(struct pci_dev *dev,
>>  	return 0;
>>  }
>>  
>> +static int cxl_report_error_detected(struct pci_dev *dev,
>> +				     pci_channel_state_t state,
>> +				     enum pci_ers_result *result)
>> +{
>> +	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
>> +	struct pci_driver *pdrv;
>> +	pci_ers_result_t vote;
>> +
>> +	device_lock(&dev->dev);
>> +	cxl_port_hndlrs = find_cxl_port_hndlrs();
> 
> Can we refactor to have a common function under this and report_error_detected()?
> 

Sure, this can be refactored. 

The difference between cxl_report_error_detected() and report_error_detected() is the 
handlers that are called.

cxl_report_error_detected() calls the CXL driver's registered port error handler. 

report_error_detected() calls the pci_driver::err_handler callbacks.

Let me know if I should refactor for common code here.


>> +	pdrv = dev->driver;
>> +	if (pci_dev_is_disconnected(dev)) {
>> +		vote = PCI_ERS_RESULT_DISCONNECT;
>> +	} else if (!pci_dev_set_io_state(dev, state)) {
>> +		pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
>> +			dev->error_state, state);
>> +		vote = PCI_ERS_RESULT_NONE;
>> +	} else if (!cxl_port_hndlrs || !cxl_port_hndlrs->error_detected) {
>> +		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
>> +			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
>> +			pci_info(dev, "can't recover (no error_detected callback)\n");
>> +		} else {
>> +			vote = PCI_ERS_RESULT_NONE;
>> +		}
>> +	} else {
>> +		vote = cxl_port_hndlrs->error_detected(dev, state);
>> +	}
>> +	pci_uevent_ers(dev, vote);
>> +	*result = merge_result(*result, vote);
>> +	device_unlock(&dev->dev);
>> +	return 0;
>> +}
> 
>>  static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data)
>>  {
>>  	pm_runtime_get_sync(&pdev->dev);
>> @@ -188,6 +245,28 @@ static void pci_walk_bridge(struct pci_dev *bridge,
>>  		cb(bridge, userdata);
>>  }
>>  
>> +/**
>> + * cxl_walk_bridge - walk bridges potentially AER affected
>> + * @bridge:	bridge which may be a Port, an RCEC, or an RCiEP
>> + * @cb:		callback to be called for each device found
>> + * @userdata:	arbitrary pointer to be passed to callback
>> + *
>> + * If the device provided is a bridge, walk the subordinate bus, including
>> + * the device itself and any bridged devices on buses under this bus.  Call
>> + * the provided callback on each device found.
>> + *
>> + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
>> + * call the callback on the device itself.
> only call the callback on the device itself.
> 
> (as you call it as stated above either way).
> 

Thanks. I will update the function header to include "only".

>> + */
>> +static void cxl_walk_bridge(struct pci_dev *bridge,
>> +			    int (*cb)(struct pci_dev *, void *),
>> +			    void *userdata)
>> +{
>> +	cb(bridge, userdata);
>> +	if (bridge->subordinate)
>> +		pci_walk_bus(bridge->subordinate, cb, userdata);
> The difference between this and pci_walk_bridge() is subtle and
> I'd like to avoid having both if we can.
> 

The cxl_walk_bridge() was added because pci_walk_bridge() does not report
CXL errors as needed. If the erroring device is a bridge then pci_walk_bridge() 
does not call report_error_detected() for the root port itself. If the bridge 
is a CXL root port then the CXL port error handler is not called. This has 2 
problems: 1. Error logging is not provided, 2. A result vote is not provided 
by the root port's CXL port handler.

>> +}
>> +
>>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>  		pci_channel_state_t state,
>>  		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
>> @@ -276,3 +355,74 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>  
>>  	return status;
>>  }
>> +
>> +pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
>> +				 pci_channel_state_t state,
>> +				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
>> +{
>> +	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
>> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
>> +	int type = pci_pcie_type(bridge);
>> +
>> +	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
>> +	    (type != PCI_EXP_TYPE_RC_EC) &&
>> +	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
>> +	    (type != PCI_EXP_TYPE_UPSTREAM)) {
>> +		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
>> +		return status;
>> +	}
>> +
> 
> Would similar trick to in pcie_do_recovery work here for the upstream
> and downstream ports use pci_upstream_bridge() and for the others pass the dev into
> pci_walk_bridge()?
> 

Yes, that would be a good starting point to begin reuse refactoring.
I'm interested in getting your and others' feedback on the separation of the
PCI and CXL protocol errors and how much separation is or is not needed.


Regards,
Terry
Jonathan Cameron Oct. 17, 2024, 1:43 p.m. UTC | #3
On Wed, 16 Oct 2024 13:07:37 -0500
Terry Bowman <Terry.Bowman@amd.com> wrote:

> Hi Jonathan,
> 
> On 10/16/24 11:54, Jonathan Cameron wrote:
> > On Tue, 8 Oct 2024 17:16:49 -0500
> > Terry Bowman <terry.bowman@amd.com> wrote:
> >   
> >> The current pcie_do_recovery() handles device recovery as result of
> >> uncorrectable errors (UCE). But, CXL port devices require unique
> >> recovery handling.
> >>
> >> Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
> >> specific handling to the new recovery function.
> >>
> >> The CXL port UCE recovery must invoke the AER service driver's CXL port
> >> UCE callback. This is different than the standard pcie_do_recovery()
> >> recovery that calls the pci_driver::err_handler UCE handler instead.
> >>
> >> Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
> >> "recover" the error. A panic is called instead of attempting recovery
> >> to avoid potential system corruption.
> >>
> >> The uncorrectable support added here will be used to complete CXL PCIe
> >> port error handling in the future.
> >>
> >> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
> > 
> > Hi Terry,
> > 
> > I'm a little bothered by the subtle difference in the bus walks
> > in here vs the existing cases. If we need them, comments needed
> > to explain why.
> >   
> 
> Yes, I will add more details in the commit message about "why".
> I added explanation following your below comment.
> 
> > If we are going to have separate handling, see if you can share
> > a lot more of the code by factoring out common functions for
> > the pci and cxl handling with callbacks to handle the differences.
> >   
> 
> Dan requested separate paths for the PCIe and CXL recovery. The intent,
> as I understand, is to isolate the handling of PCIe and CXL protocol 
> errors. This is to create 2 different classes of protocol errors.
Function call chain wise I'm reasonably convinced that might be a good
idea.  But not code wise if it means we end up with more hard to review
code.

> 
> > I've managed to get my head around this code a few times in the past
> > (I think!) and really don't fancy having two subtle variants to
> > consider next time we get a bug :( The RC_EC additions hurt my head.
> > 
> > Jonathan  
> 
> Right, the UCE recovery logic is not straightforward. The code can  be 
> refactored to take advantage of reuse. I'm interested in your thoughts 
> after I have provided some responses here.
> 
> >   
> >>  static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> >> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> >> index 31090770fffc..de12f2eb19ef 100644
> >> --- a/drivers/pci/pcie/err.c
> >> +++ b/drivers/pci/pcie/err.c
> >> @@ -86,6 +86,63 @@ static int report_error_detected(struct pci_dev *dev,
> >>  	return 0;
> >>  }
> >>  
> >> +static int cxl_report_error_detected(struct pci_dev *dev,
> >> +				     pci_channel_state_t state,
> >> +				     enum pci_ers_result *result)
> >> +{
> >> +	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
> >> +	struct pci_driver *pdrv;
> >> +	pci_ers_result_t vote;
> >> +
> >> +	device_lock(&dev->dev);
> >> +	cxl_port_hndlrs = find_cxl_port_hndlrs();  
> > 
> > Can we refactor to have a common function under this and report_error_detected()?
> >   
> 
> Sure, this can be refactored. 
> 
> The difference between cxl_report_error_detected() and report_error_detected() is the 
> handlers that are called.
> 
> cxl_report_error_detected() calls the CXL driver's registered port error handler. 
> 
> report_error_recovery() calls the pcie_dev::err_handlers.
> 
> Let me know if I should refactor for common code here?

It certainly makes sense to do that somewhere in here.  Just have light
wrappers that provide callbacks so the bulk of the code is shared.

> 
> 
> >> +	pdrv = dev->driver;
> >> +	if (pci_dev_is_disconnected(dev)) {
> >> +		vote = PCI_ERS_RESULT_DISCONNECT;
> >> +	} else if (!pci_dev_set_io_state(dev, state)) {
> >> +		pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
> >> +			dev->error_state, state);
> >> +		vote = PCI_ERS_RESULT_NONE;
> >> +	} else if (!cxl_port_hndlrs || !cxl_port_hndlrs->error_detected) {
> >> +		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
> >> +			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
> >> +			pci_info(dev, "can't recover (no error_detected callback)\n");
> >> +		} else {
> >> +			vote = PCI_ERS_RESULT_NONE;
> >> +		}
> >> +	} else {
> >> +		vote = cxl_port_hndlrs->error_detected(dev, state);
> >> +	}
> >> +	pci_uevent_ers(dev, vote);
> >> +	*result = merge_result(*result, vote);
> >> +	device_unlock(&dev->dev);
> >> +	return 0;
> >> +}  
> >   
> >>  static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data)
> >>  {
> >>  	pm_runtime_get_sync(&pdev->dev);
> >> @@ -188,6 +245,28 @@ static void pci_walk_bridge(struct pci_dev *bridge,
> >>  		cb(bridge, userdata);
> >>  }
> >>  
> >> +/**
> >> + * cxl_walk_bridge - walk bridges potentially AER affected
> >> + * @bridge:	bridge which may be a Port, an RCEC, or an RCiEP
> >> + * @cb:		callback to be called for each device found
> >> + * @userdata:	arbitrary pointer to be passed to callback
> >> + *
> >> + * If the device provided is a bridge, walk the subordinate bus, including
> >> + * the device itself and any bridged devices on buses under this bus.  Call
> >> + * the provided callback on each device found.
> >> + *
> >> + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
> >> + * call the callback on the device itself.  
> > only call the callback on the device itself.
> > 
> > (as you call it as stated above either way).
> >   
> 
> Thanks. I will update the function header to include "only".
> 
> >> + */
> >> +static void cxl_walk_bridge(struct pci_dev *bridge,
> >> +			    int (*cb)(struct pci_dev *, void *),
> >> +			    void *userdata)
> >> +{
> >> +	cb(bridge, userdata);
> >> +	if (bridge->subordinate)
> >> +		pci_walk_bus(bridge->subordinate, cb, userdata);  
> > The difference between this and pci_walk_bridge() is subtle and
> > I'd like to avoid having both if we can.
> >   
> 
> The cxl_walk_bridge() was added because pci_walk_bridge() does not report
> CXL errors as needed. If the erroring device is a bridge then pci_walk_bridge() 
> does not call report_error_detected() for the root port itself. If the bridge 
> is a CXL root port then the CXL port error handler is not called. This has 2 
> problems: 1. Error logging is not provided, 2. A result vote is not provided 
> by the root port's CXL port handler.

So what happens for PCIe errors on the root port?  How are they reported?
What I'm failing to understand is why these should be different.
Maybe there is something missing on the PCIe side though!
That code plays a game with what 'bridge' is, and I thought that was there to handle
this case.

> 
> >> +}
> >> +
> >>  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>  		pci_channel_state_t state,
> >>  		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> >> @@ -276,3 +355,74 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>  
> >>  	return status;
> >>  }
> >> +
> >> +pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
> >> +				 pci_channel_state_t state,
> >> +				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> >> +{
> >> +	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
> >> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
> >> +	int type = pci_pcie_type(bridge);
> >> +
> >> +	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
> >> +	    (type != PCI_EXP_TYPE_RC_EC) &&
> >> +	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
> >> +	    (type != PCI_EXP_TYPE_UPSTREAM)) {
> >> +		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
> >> +		return status;
> >> +	}
> >> +  
> > 
> > Would similar trick to in pcie_do_recovery work here for the upstream
> > and downstream ports use pci_upstream_bridge() and for the others pass the dev into
> > pci_walk_bridge()?
> >   
> 
> Yes, that would be a good starting point to begin reuse refactoring.
> I'm interested in getting yours and others feedback on the separation of the 
> PCI and CXL protocol errors and how much separation is or not needed.

Separation may make sense (I'm still thinking about it) for separate passes
through the topology and separate callbacks / handling when an error is seen.
What I don't want to see is two horribly complex separate walking codes if
we can possibly avoid it.  Long term to me that just means two sets of bugs
and problem corners instead of one.

Jonathan

> 
> 
> Regards,
> Terry
>
Bowman, Terry Oct. 17, 2024, 4:21 p.m. UTC | #4
Hi Jonathan,

On 10/17/2024 8:43 AM, Jonathan Cameron wrote:
> On Wed, 16 Oct 2024 13:07:37 -0500
> Terry Bowman <Terry.Bowman@amd.com> wrote:
> 
>> Hi Jonathan,
>>
>> On 10/16/24 11:54, Jonathan Cameron wrote:
>>> On Tue, 8 Oct 2024 17:16:49 -0500
>>> Terry Bowman <terry.bowman@amd.com> wrote:
>>>    
>>>> The current pcie_do_recovery() handles device recovery as result of
>>>> uncorrectable errors (UCE). But, CXL port devices require unique
>>>> recovery handling.
>>>>
>>>> Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
>>>> specific handling to the new recovery function.
>>>>
>>>> The CXL port UCE recovery must invoke the AER service driver's CXL port
>>>> UCE callback. This is different than the standard pcie_do_recovery()
>>>> recovery that calls the pci_driver::err_handler UCE handler instead.
>>>>
>>>> Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
>>>> "recover" the error. A panic is called instead of attempting recovery
>>>> to avoid potential system corruption.
>>>>
>>>> The uncorrectable support added here will be used to complete CXL PCIe
>>>> port error handling in the future.
>>>>
>>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>>>
>>> Hi Terry,
>>>
>>> I'm a little bothered by the subtle difference in the bus walks
>>> in here vs the existing cases. If we need them, comments needed
>>> to explain why.
>>>    
>>
>> Yes, I will add more details in the commit message about "why".
>> I added explanation following your below comment.
>>
>>> If we are going to have separate handling, see if you can share
>>> a lot more of the code by factoring out common functions for
>>> the pci and cxl handling with callbacks to handle the differences.
>>>    
>>
>> Dan requested separate paths for the PCIe and CXL recovery. The intent,
>> as I understand, is to isolate the handling of PCIe and CXL protocol
>> errors. This is to create 2 different classes of protocol errors.
> Function call chain wise I'm reasonably convinced that might be a good
> idea.  But not code wise if it means we end up with more hard to review
> code.
> 
>>
>>> I've managed to get my head around this code a few times in the past
>>> (I think!) and really don't fancy having two subtle variants to
>>> consider next time we get a bug :( The RC_EC additions hurt my head.
>>>
>>> Jonathan
>>
>> Right, the UCE recovery logic is not straightforward. The code can  be
>> refactored to take advantage of reuse. I'm interested in your thoughts
>> after I have provided some responses here.
>>
>>>    
>>>>   static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
>>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
>>>> index 31090770fffc..de12f2eb19ef 100644
>>>> --- a/drivers/pci/pcie/err.c
>>>> +++ b/drivers/pci/pcie/err.c
>>>> @@ -86,6 +86,63 @@ static int report_error_detected(struct pci_dev *dev,
>>>>   	return 0;
>>>>   }
>>>>   
>>>> +static int cxl_report_error_detected(struct pci_dev *dev,
>>>> +				     pci_channel_state_t state,
>>>> +				     enum pci_ers_result *result)
>>>> +{
>>>> +	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
>>>> +	struct pci_driver *pdrv;
>>>> +	pci_ers_result_t vote;
>>>> +
>>>> +	device_lock(&dev->dev);
>>>> +	cxl_port_hndlrs = find_cxl_port_hndlrs();
>>>
>>> Can we refactor to have a common function under this and report_error_detected()?
>>>    
>>
>> Sure, this can be refactored.
>>
>> The difference between cxl_report_error_detected() and report_error_detected() is the
>> handlers that are called.
>>
>> cxl_report_error_detected() calls the CXL driver's registered port error handler.
>>
>> report_error_recovery() calls the pcie_dev::err_handlers.
>>
>> Let me know if I should refactor for common code here?
> 
> It certainly makes sense to do that somewhere in here.  Just have light
> wrappers that provide callbacks so the bulk of the code is shared.
> 

OK, I'll start on that. I have a v2 ready to go without the reuse changes.
Do you want me to wait on sending v2 until it has the reuse refactoring?

>>
>>
>>>> +	pdrv = dev->driver;
>>>> +	if (pci_dev_is_disconnected(dev)) {
>>>> +		vote = PCI_ERS_RESULT_DISCONNECT;
>>>> +	} else if (!pci_dev_set_io_state(dev, state)) {
>>>> +		pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
>>>> +			dev->error_state, state);
>>>> +		vote = PCI_ERS_RESULT_NONE;
>>>> +	} else if (!cxl_port_hndlrs || !cxl_port_hndlrs->error_detected) {
>>>> +		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
>>>> +			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
>>>> +			pci_info(dev, "can't recover (no error_detected callback)\n");
>>>> +		} else {
>>>> +			vote = PCI_ERS_RESULT_NONE;
>>>> +		}
>>>> +	} else {
>>>> +		vote = cxl_port_hndlrs->error_detected(dev, state);
>>>> +	}
>>>> +	pci_uevent_ers(dev, vote);
>>>> +	*result = merge_result(*result, vote);
>>>> +	device_unlock(&dev->dev);
>>>> +	return 0;
>>>> +}
>>>    
>>>>   static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data)
>>>>   {
>>>>   	pm_runtime_get_sync(&pdev->dev);
>>>> @@ -188,6 +245,28 @@ static void pci_walk_bridge(struct pci_dev *bridge,
>>>>   		cb(bridge, userdata);
>>>>   }
>>>>   
>>>> +/**
>>>> + * cxl_walk_bridge - walk bridges potentially AER affected
>>>> + * @bridge:	bridge which may be a Port, an RCEC, or an RCiEP
>>>> + * @cb:		callback to be called for each device found
>>>> + * @userdata:	arbitrary pointer to be passed to callback
>>>> + *
>>>> + * If the device provided is a bridge, walk the subordinate bus, including
>>>> + * the device itself and any bridged devices on buses under this bus.  Call
>>>> + * the provided callback on each device found.
>>>> + *
>>>> + * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
>>>> + * call the callback on the device itself.
>>> only call the callback on the device itself.
>>>
>>> (as you call it as stated above either way).
>>>    
>>
>> Thanks. I will update the function header to include "only".
>>
>>>> + */
>>>> +static void cxl_walk_bridge(struct pci_dev *bridge,
>>>> +			    int (*cb)(struct pci_dev *, void *),
>>>> +			    void *userdata)
>>>> +{
>>>> +	cb(bridge, userdata);
>>>> +	if (bridge->subordinate)
>>>> +		pci_walk_bus(bridge->subordinate, cb, userdata);
>>> The difference between this and pci_walk_bridge() is subtle and
>>> I'd like to avoid having both if we can.
>>>    
>>
>> The cxl_walk_bridge() was added because pci_walk_bridge() does not report
>> CXL errors as needed. If the erroring device is a bridge then pci_walk_bridge()
>> does not call report_error_detected() for the root port itself. If the bridge
>> is a CXL root port then the CXL port error handler is not called. This has 2
>> problems: 1. Error logging is not provided, 2. A result vote is not provided
>> by the root port's CXL port handler.
> 
> So what happens for PCIe errors on the root port?  How are they reported?
> What I'm failing to understand is why these should be different.
> Maybe there is something missing on the PCIe side though!
> That code plays a game with what bridge and I thought that was there to handle
> this case.
> 

PCIe errors (not CXL errors) on a root port will be processed as they are today.

An AER error is treated as a CXL error if *all* of the following are met:
- The AER error is an internal error
    - Check is in AER's is_internal_error(info) function.
- The device is a CXL device
    - Check is in AER's handles_cxl_errors() function.

Root port device PCIe error processing will not call the pci_dev::err_handlers::error_detected()
because of the walk_bridge() implementation. The result vote to direct handling
is determined by downstream devices. This has probably been OK until now because ports have been
fairly vanilla and standard until CXL.


>>
>>>> +}
>>>> +
>>>>   pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>>   		pci_channel_state_t state,
>>>>   		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
>>>> @@ -276,3 +355,74 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
>>>>   
>>>>   	return status;
>>>>   }
>>>> +
>>>> +pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
>>>> +				 pci_channel_state_t state,
>>>> +				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
>>>> +{
>>>> +	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
>>>> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
>>>> +	int type = pci_pcie_type(bridge);
>>>> +
>>>> +	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
>>>> +	    (type != PCI_EXP_TYPE_RC_EC) &&
>>>> +	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
>>>> +	    (type != PCI_EXP_TYPE_UPSTREAM)) {
>>>> +		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
>>>> +		return status;
>>>> +	}
>>>> +
>>>
>>> Would similar trick to in pcie_do_recovery work here for the upstream
>>> and downstream ports use pci_upstream_bridge() and for the others pass the dev into
>>> pci_walk_bridge()?
>>>    
>>
>> Yes, that would be a good starting point to begin reuse refactoring.
>> I'm interested in getting your and others' feedback on the separation of the
>> PCI and CXL protocol errors and how much separation is or is not needed.
> 
> Separation may make sense (I'm still thinking about it) for separate passes
> through the topology and separate callbacks / handling when an error is seen.
> What I don't want to see is two horribly complex separate walking codes if
> we can possibly avoid it.  Long term to me that just means two sets of bugs
> and problem corners instead of one.
> 
> Jonathan
> 

I understand. I will look to make changes here for reuse.

Regards,
Terry
Jonathan Cameron Oct. 17, 2024, 5:08 p.m. UTC | #5
On Thu, 17 Oct 2024 11:21:36 -0500
"Bowman, Terry" <kibowman@amd.com> wrote:

> Hi Jonathan,
> 
> On 10/17/2024 8:43 AM, Jonathan Cameron wrote:
> > On Wed, 16 Oct 2024 13:07:37 -0500
> > Terry Bowman <Terry.Bowman@amd.com> wrote:
> >   
> >> Hi Jonathan,
> >>
> >> On 10/16/24 11:54, Jonathan Cameron wrote:  
> >>> On Tue, 8 Oct 2024 17:16:49 -0500
> >>> Terry Bowman <terry.bowman@amd.com> wrote:
> >>>      
> >>>> The current pcie_do_recovery() handles device recovery as result of
> >>>> uncorrectable errors (UCE). But, CXL port devices require unique
> >>>> recovery handling.
> >>>>
> >>>> Create a cxl_do_recovery() function parallel to pcie_do_recovery(). Add CXL
> >>>> specific handling to the new recovery function.
> >>>>
> >>>> The CXL port UCE recovery must invoke the AER service driver's CXL port
> >>>> UCE callback. This is different than the standard pcie_do_recovery()
> >>>> recovery that calls the pci_driver::err_handler UCE handler instead.
> >>>>
> >>>> Treat all CXL PCIe port UCE errors as fatal and call kernel panic to
> >>>> "recover" the error. A panic is called instead of attempting recovery
> >>>> to avoid potential system corruption.
> >>>>
> >>>> The uncorrectable support added here will be used to complete CXL PCIe
> >>>> port error handling in the future.
> >>>>
> >>>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>  
> >>>
> >>> Hi Terry,
> >>>
> >>> I'm a little bothered by the subtle difference in the bus walks
> >>> in here vs the existing cases. If we need them, comments needed
> >>> to explain why.
> >>>      
> >>
> >> Yes, I will add more details in the commit message about "why".
> >> I added explanation following your below comment.
> >>  
> >>> If we are going to have separate handling, see if you can share
> >>> a lot more of the code by factoring out common functions for
> >>> the pci and cxl handling with callbacks to handle the differences.
> >>>      
> >>
> >> Dan requested separate paths for the PCIe and CXL recovery. The intent,
> >> as I understand, is to isolate the handling of PCIe and CXL protocol
> >> errors. This is to create 2 different classes of protocol errors.  
> > Function call chain wise I'm reasonably convinced that might be a good
> > idea.  But not code wise if it means we end up with more hard to review
> > code.
> >   
> >>  
> >>> I've managed to get my head around this code a few times in the past
> >>> (I think!) and really don't fancy having two subtle variants to
> >>> consider next time we get a bug :( The RC_EC additions hurt my head.
> >>>
> >>> Jonathan  
> >>
> >> Right, the UCE recovery logic is not straightforward. The code can  be
> >> refactored to take advantage of reuse. I'm interested in your thoughts
> >> after I have provided some responses here.
> >>  
> >>>      
> >>>>   static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
> >>>> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> >>>> index 31090770fffc..de12f2eb19ef 100644
> >>>> --- a/drivers/pci/pcie/err.c
> >>>> +++ b/drivers/pci/pcie/err.c
> >>>> @@ -86,6 +86,63 @@ static int report_error_detected(struct pci_dev *dev,
> >>>>   	return 0;
> >>>>   }
> >>>>   
> >>>> +static int cxl_report_error_detected(struct pci_dev *dev,
> >>>> +				     pci_channel_state_t state,
> >>>> +				     enum pci_ers_result *result)
> >>>> +{
> >>>> +	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
> >>>> +	struct pci_driver *pdrv;
> >>>> +	pci_ers_result_t vote;
> >>>> +
> >>>> +	device_lock(&dev->dev);
> >>>> +	cxl_port_hndlrs = find_cxl_port_hndlrs();  
> >>>
> >>> Can we refactor to have a common function under this and report_error_detected()?
> >>>      
> >>
> >> Sure, this can be refactored.
> >>
> >> The difference between cxl_report_error_detected() and report_error_detected() is the
> >> handlers that are called.
> >>
> >> cxl_report_error_detected() calls the CXL driver's registered port error handler.
> >>
> >> report_error_detected() calls the pci_dev::err_handlers.
> >>
> >> Let me know if I should refactor for common code here?  
> > 
> > It certainly makes sense to do that somewhere in here.  Just have light
> > wrappers that provide callbacks so the bulk of the code is shared.
> >   
> 
> OK, I'll start on that. I have a v2 ready to go without the reuse changes.
> Do you want me to wait on sending v2 until it has the reuse refactoring?

I'd imagine we might have some time after v2, so go ahead - experiments
with refactoring can come later.


> >>>> + */
> >>>> +static void cxl_walk_bridge(struct pci_dev *bridge,
> >>>> +			    int (*cb)(struct pci_dev *, void *),
> >>>> +			    void *userdata)
> >>>> +{
> >>>> +	cb(bridge, userdata);
> >>>> +	if (bridge->subordinate)
> >>>> +		pci_walk_bus(bridge->subordinate, cb, userdata);  
> >>> The difference between this and pci_walk_bridge() is subtle and
> >>> I'd like to avoid having both if we can.
> >>>      
> >>
> >> The cxl_walk_bridge() was added because pci_walk_bridge() does not report
> >> CXL errors as needed. If the erroring device is a bridge then pci_walk_bridge()
> >> does not call report_error_detected() for the root port itself. If the bridge
> >> is a CXL root port then the CXL port error handler is not called. This has 2
> >> problems: 1. Error logging is not provided, 2. A result vote is not provided
> >> by the root port's CXL port handler.  
> > 
> > So what happens for PCIe errors on the root port?  How are they reported?
> > What I'm failing to understand is why these should be different.
> > Maybe there is something missing on the PCIe side though!
> > That code plays a game with what bridge and I thought that was there to handle
> > this case.
> >   
> 
> PCIe errors (not CXL errors) on a root port will be processed as they are today.
Sure, I was just failing to understand why the code didn't need to check
for error_detected on the root port, but the CXL code does.

> 
> An AER error is treated as a CXL error if *all* of the following are met:
> - The AER error is an internal error
>     - Check is in AER's is_internal_error(info) function.
> - The device is a CXL device
>     - Check is in AER's handles_cxl_errors() function.
> 
> Root port device PCIe error processing will not call the pci_dev::err_handlers::error_detected()
> because of the pci_walk_bridge() implementation. The result vote to direct handling
> is determined by downstream devices. This has probably been OK until now because ports have been
> fairly vanilla and standard until CXL.

Ah. Got it - Root ports didn't have the handler.
So is there any harm in making them run it? (well not as they don't have it -
actually they do.  There is one in portdrv)
That way the two codes look more similar.  Also, does this mean there were
runtime pm and other calls that didn't hit the root port for PCIe that should have
done?

Comes back to I don't want too complex bits of code.  I'm fine with changing
the PCIe one to add new handling needed for CXL.

Just to be clear I'm fine with totally separate call paths just with lots
of code reuse.   That may mean a few precursor patches touching only the
PCIe code to make it fit for reuse.

Jonathan



> 
> 
> >>  
> >>>> +}
> >>>> +
> >>>>   pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>>>   		pci_channel_state_t state,
> >>>>   		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> >>>> @@ -276,3 +355,74 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
> >>>>   
> >>>>   	return status;
> >>>>   }
> >>>> +
> >>>> +pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
> >>>> +				 pci_channel_state_t state,
> >>>> +				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
> >>>> +{
> >>>> +	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
> >>>> +	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
> >>>> +	int type = pci_pcie_type(bridge);
> >>>> +
> >>>> +	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
> >>>> +	    (type != PCI_EXP_TYPE_RC_EC) &&
> >>>> +	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
> >>>> +	    (type != PCI_EXP_TYPE_UPSTREAM)) {
> >>>> +		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
> >>>> +		return status;
> >>>> +	}
> >>>> +  
> >>>
> >>> Would similar trick to in pcie_do_recovery work here for the upstream
> >>> and downstream ports use pci_upstream_bridge() and for the others pass the dev into
> >>> pci_walk_bridge()?
> >>>      
> >>
> >> Yes, that would be a good starting point to begin reuse refactoring.
> >> I'm interested in getting your and others' feedback on the separation of the
> >> PCI and CXL protocol errors and how much separation is or is not needed.
> > 
> > Separation may make sense (I'm still thinking about it) for separate passes
> > through the topology and separate callbacks / handling when an error is seen.
> > What I don't want to see is two horribly complex separate walking codes if
> > we can possibly avoid it.  Long term to me that just means two sets of bugs
> > and problem corners instead of one.
> > 
> > Jonathan
> >   
> 
> I understand. I will look to make changes here for reuse.
> 
> Regards,
> Terry
>
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 79c8398f3938..d1f5b42fa48d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -632,6 +632,11 @@  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 		pci_channel_state_t state,
 		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev));
 
+/* CXL error reporting and recovery */
+pci_ers_result_t cxl_do_recovery(struct pci_dev *dev,
+		pci_channel_state_t state,
+		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev));
+
 bool pcie_wait_for_link(struct pci_dev *pdev, bool active);
 int pcie_retrain_link(struct pci_dev *pdev, bool use_lt);
 
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9b2872c8e20d..81a19028c4e7 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1060,7 +1060,10 @@  static void cxl_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 		if (cxl_port_hndlrs && cxl_port_hndlrs->cor_error_detected)
 			cxl_port_hndlrs->cor_error_detected(dev);
 		pcie_clear_device_status(dev);
-	}
+	} else if (info->severity == AER_NONFATAL)
+		cxl_do_recovery(dev, pci_channel_io_normal, aer_root_reset);
+	else if (info->severity == AER_FATAL)
+		cxl_do_recovery(dev, pci_channel_io_frozen, aer_root_reset);
 }
 
 static int handles_cxl_error_iter(struct pci_dev *dev, void *data)
diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 31090770fffc..de12f2eb19ef 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -86,6 +86,63 @@  static int report_error_detected(struct pci_dev *dev,
 	return 0;
 }
 
+static int cxl_report_error_detected(struct pci_dev *dev,
+				     pci_channel_state_t state,
+				     enum pci_ers_result *result)
+{
+	struct cxl_port_err_hndlrs *cxl_port_hndlrs;
+	struct pci_driver *pdrv;
+	pci_ers_result_t vote;
+
+	device_lock(&dev->dev);
+	cxl_port_hndlrs = find_cxl_port_hndlrs();
+	pdrv = dev->driver;
+	if (pci_dev_is_disconnected(dev)) {
+		vote = PCI_ERS_RESULT_DISCONNECT;
+	} else if (!pci_dev_set_io_state(dev, state)) {
+		pci_info(dev, "can't recover (state transition %u -> %u invalid)\n",
+			dev->error_state, state);
+		vote = PCI_ERS_RESULT_NONE;
+	} else if (!cxl_port_hndlrs || !cxl_port_hndlrs->error_detected) {
+		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
+			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
+			pci_info(dev, "can't recover (no error_detected callback)\n");
+		} else {
+			vote = PCI_ERS_RESULT_NONE;
+		}
+	} else {
+		vote = cxl_port_hndlrs->error_detected(dev, state);
+	}
+	pci_uevent_ers(dev, vote);
+	*result = merge_result(*result, vote);
+	device_unlock(&dev->dev);
+	return 0;
+}
+
+static int cxl_report_frozen_detected(struct pci_dev *dev, void *data)
+{
+	/*
+	 * CXL endpoints report using pci_dev::err_handlers.
+	 * CXL PCIe ports report using aer_rpc::cxl_port_err_handlers.
+	 */
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT)
+		return report_error_detected(dev, pci_channel_io_frozen, data);
+	else
+		return cxl_report_error_detected(dev, pci_channel_io_frozen, data);
+}
+
+static int cxl_report_normal_detected(struct pci_dev *dev, void *data)
+{
+	/*
+	 * CXL endpoints report using pci_dev::err_handlers.
+	 * CXL PCIe ports report using aer_rpc::cxl_port_err_handlers.
+	 */
+	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ENDPOINT)
+		return report_error_detected(dev, pci_channel_io_normal, data);
+	else
+		return cxl_report_error_detected(dev, pci_channel_io_normal, data);
+}
+
 static int pci_pm_runtime_get_sync(struct pci_dev *pdev, void *data)
 {
 	pm_runtime_get_sync(&pdev->dev);
@@ -188,6 +245,28 @@  static void pci_walk_bridge(struct pci_dev *bridge,
 		cb(bridge, userdata);
 }
 
+/**
+ * cxl_walk_bridge - walk bridges potentially AER affected
+ * @bridge:	bridge which may be a Port, an RCEC, or an RCiEP
+ * @cb:		callback to be called for each device found
+ * @userdata:	arbitrary pointer to be passed to callback
+ *
+ * If the device provided is a bridge, walk the subordinate bus, including
+ * the device itself and any bridged devices on buses under this bus.  Call
+ * the provided callback on each device found.
+ *
+ * If the device provided has no subordinate bus, e.g., an RCEC or RCiEP,
+ * call the callback on the device itself.
+ */
+static void cxl_walk_bridge(struct pci_dev *bridge,
+			    int (*cb)(struct pci_dev *, void *),
+			    void *userdata)
+{
+	cb(bridge, userdata);
+	if (bridge->subordinate)
+		pci_walk_bus(bridge->subordinate, cb, userdata);
+}
+
 pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 		pci_channel_state_t state,
 		pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
@@ -276,3 +355,74 @@  pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 
 	return status;
 }
+
+pci_ers_result_t cxl_do_recovery(struct pci_dev *bridge,
+				 pci_channel_state_t state,
+				 pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev))
+{
+	struct pci_host_bridge *host = pci_find_host_bridge(bridge->bus);
+	pci_ers_result_t status = PCI_ERS_RESULT_CAN_RECOVER;
+	int type = pci_pcie_type(bridge);
+
+	if ((type != PCI_EXP_TYPE_ROOT_PORT) &&
+	    (type != PCI_EXP_TYPE_RC_EC) &&
+	    (type != PCI_EXP_TYPE_DOWNSTREAM) &&
+	    (type != PCI_EXP_TYPE_UPSTREAM)) {
+		pci_dbg(bridge, "Unsupported device type (%x)\n", type);
+		return status;
+	}
+
+	cxl_walk_bridge(bridge, pci_pm_runtime_get_sync, NULL);
+
+	pci_dbg(bridge, "broadcast error_detected message\n");
+	if (state == pci_channel_io_frozen) {
+		cxl_walk_bridge(bridge, cxl_report_frozen_detected, &status);
+		if (reset_subordinates(bridge) != PCI_ERS_RESULT_RECOVERED) {
+			pci_warn(bridge, "subordinate device reset failed\n");
+			goto failed;
+		}
+	} else {
+		cxl_walk_bridge(bridge, cxl_report_normal_detected, &status);
+	}
+
+	if (status == PCI_ERS_RESULT_PANIC)
+		panic("CXL cachemem error. Invoking panic");
+
+	if (status == PCI_ERS_RESULT_CAN_RECOVER) {
+		status = PCI_ERS_RESULT_RECOVERED;
+		pci_dbg(bridge, "broadcast mmio_enabled message\n");
+		cxl_walk_bridge(bridge, report_mmio_enabled, &status);
+	}
+
+	if (status == PCI_ERS_RESULT_NEED_RESET) {
+		status = PCI_ERS_RESULT_RECOVERED;
+		pci_dbg(bridge, "broadcast slot_reset message\n");
+		report_slot_reset(bridge, &status);
+		pci_walk_bridge(bridge, report_slot_reset, &status);
+	}
+
+	if (status != PCI_ERS_RESULT_RECOVERED)
+		goto failed;
+
+	pci_dbg(bridge, "broadcast resume message\n");
+	cxl_walk_bridge(bridge, report_resume, &status);
+
+	if (host->native_aer || pcie_ports_native) {
+		pcie_clear_device_status(bridge);
+		pci_aer_clear_nonfatal_status(bridge);
+	}
+
+	cxl_walk_bridge(bridge, pci_pm_runtime_put, NULL);
+
+	pci_info(bridge, "device recovery successful\n");
+	return status;
+
+failed:
+	cxl_walk_bridge(bridge, pci_pm_runtime_put, NULL);
+
+	pci_uevent_ers(bridge, PCI_ERS_RESULT_DISCONNECT);
+
+	pci_info(bridge, "device recovery failed\n");
+
+	return status;
+}