diff mbox series

[v7,12/17] cxl/pci: Add error handler for CXL PCIe Port RAS errors

Message ID 20250211192444.2292833-13-terry.bowman@amd.com (mailing list archive)
State Handled Elsewhere
Headers show
Series Enable CXL PCIe port protocol error handling and logging | expand

Commit Message

Bowman, Terry Feb. 11, 2025, 7:24 p.m. UTC
Introduce correctable and uncorrectable (UCE) CXL PCIe Port Protocol Error
handlers.

The handlers will be called with a 'struct pci_dev' parameter
indicating the CXL Port device requiring handling. The CXL PCIe Port
device's underlying 'struct device' will match the port device in the
CXL topology.

Use the PCIe Port's device object to find the matching CXL Upstream Switch
Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
matching CXL Port device should contain a cached reference to the RAS
register block. The cached RAS block will be used in handling the error.

Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
a reference to the RAS registers as a parameter. These functions will use
the RAS register reference to indicate an error and clear the device's RAS
status.

Update __cxl_handle_ras() to return PCI_ERS_RESULT_PANIC in the case
an error is present in the RAS status. Otherwise, return
PCI_ERS_RESULT_NONE.

Signed-off-by: Terry Bowman <terry.bowman@amd.com>
---
 drivers/cxl/core/pci.c | 81 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 4 deletions(-)

Comments

Dave Jiang Feb. 12, 2025, 12:11 a.m. UTC | #1
On 2/11/25 12:24 PM, Terry Bowman wrote:
> Introduce correctable and uncorrectable (UCE) CXL PCIe Port Protocol Error
> handlers.
> 
> The handlers will be called with a 'struct pci_dev' parameter
> indicating the CXL Port device requiring handling. The CXL PCIe Port
> device's underlying 'struct device' will match the port device in the
> CXL topology.
> 
> Use the PCIe Port's device object to find the matching CXL Upstream Switch
> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
> matching CXL Port device should contain a cached reference to the RAS
> register block. The cached RAS block will be used in handling the error.
> 
> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
> a reference to the RAS registers as a parameter. These functions will use
> the RAS register reference to indicate an error and clear the device's RAS
> status.
> 
> Update __cxl_handle_ras() to return PCI_ERS_RESULT_PANIC in the case
> an error is present in the RAS status. Otherwise, return
> PCI_ERS_RESULT_NONE.

Maybe a comment on why the change?

> 
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 81 +++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 77 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index af809e7cbe3b..3f13d9dfb610 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -699,7 +699,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
>   * Log the state of the RAS status registers and prepare them to log the
>   * next error status. Return 1 if reset needed.
>   */
> -static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
> +static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  {
>  	u32 hl[CXL_HEADERLOG_SIZE_U32];
>  	void __iomem *addr;
> @@ -708,13 +708,13 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  
>  	if (!ras_base) {
>  		dev_warn_once(dev, "CXL RAS register block is not mapped");
> -		return false;
> +		return PCI_ERS_RESULT_NONE;
>  	}
>  
>  	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>  	status = readl(addr);
>  	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
> -		return false;
> +		return PCI_ERS_RESULT_NONE;
>  
>  	/* If multiple errors, log header points to first error from ctrl reg */
>  	if (hweight32(status) > 1) {
> @@ -733,7 +733,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  
>  	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>  
> -	return true;
> +	return PCI_ERS_RESULT_PANIC;
>  }
>  
>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> @@ -782,6 +782,79 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>  }
>  
> +static int match_uport(struct device *dev, const void *data)
> +{
> +	const struct device *uport_dev = data;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_port(dev))
> +		return 0;
> +
> +	port = to_cxl_port(dev);
> +
> +	return port->uport_dev == uport_dev;
> +}
> +
> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
> +{
> +	void __iomem *ras_base;
> +
> +	if (!pdev || !*dev) {
> +		pr_err("Failed, parameter is NULL");
> +		return NULL;
> +	}
> +
> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {

Can probably just do a switch block here for the type?

> +		struct cxl_port *port __free(put_cxl_port);
> +		struct cxl_dport *dport = NULL;
> +
> +		port = find_cxl_port(&pdev->dev, &dport);

Just declare port inline:

struct cxl_port *port __free(put_cxl_port) =
		find_cxl_port(&pdev->dev, &dport);

> +		if (!port) {
> +			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
> +			return NULL;
> +		}
> +
> +		ras_base = dport ? dport->regs.ras : NULL;
> +		*dev = &port->dev;
> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
> +		struct device *port_dev __free(put_device);

same comment here as above

DJ

> +		struct cxl_port *port;
> +
> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
> +					   match_uport);
> +		if (!port_dev || !is_cxl_port(port_dev)) {
> +			pci_err(pdev, "Failed to find uport in CXL topology\n");
> +			return NULL;
> +		}
> +
> +		port = to_cxl_port(port_dev);
> +		ras_base = port ? port->uport_regs.ras : NULL;
> +		*dev = port_dev;
> +	} else {
> +		pci_err(pdev, "Unsupported device type\n");
> +		ras_base = NULL;
> +	}
> +
> +	return ras_base;
> +}
> +
> +static void cxl_port_cor_error_detected(struct pci_dev *pdev)
> +{
> +	struct device *dev;
> +	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
> +
> +	__cxl_handle_cor_ras(dev, ras_base);
> +}
> +
> +static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
> +{
> +	struct device *dev;
> +	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
> +
> +	return __cxl_handle_ras(dev, ras_base);
> +}
> +
>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>  {
>
Bowman, Terry Feb. 12, 2025, 4:34 p.m. UTC | #2
On 2/11/2025 6:11 PM, Dave Jiang wrote:
>
> On 2/11/25 12:24 PM, Terry Bowman wrote:
>> Introduce correctable and uncorrectable (UCE) CXL PCIe Port Protocol Error
>> handlers.
>>
>> The handlers will be called with a 'struct pci_dev' parameter
>> indicating the CXL Port device requiring handling. The CXL PCIe Port
>> device's underlying 'struct device' will match the port device in the
>> CXL topology.
>>
>> Use the PCIe Port's device object to find the matching CXL Upstream Switch
>> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
>> matching CXL Port device should contain a cached reference to the RAS
>> register block. The cached RAS block will be used in handling the error.
>>
>> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
>> a reference to the RAS registers as a parameter. These functions will use
>> the RAS register reference to indicate an error and clear the device's RAS
>> status.
>>
>> Update __cxl_handle_ras() to return PCI_ERS_RESULT_PANIC in the case
>> an error is present in the RAS status. Otherwise, return
>> PCI_ERS_RESULT_NONE.
> Maybe a comment on why the change?

Ok.
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 81 +++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 77 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index af809e7cbe3b..3f13d9dfb610 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -699,7 +699,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
>>   * Log the state of the RAS status registers and prepare them to log the
>>   * next error status. Return 1 if reset needed.
>>   */
>> -static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>> +static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  {
>>  	u32 hl[CXL_HEADERLOG_SIZE_U32];
>>  	void __iomem *addr;
>> @@ -708,13 +708,13 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  
>>  	if (!ras_base) {
>>  		dev_warn_once(dev, "CXL RAS register block is not mapped");
>> -		return false;
>> +		return PCI_ERS_RESULT_NONE;
>>  	}
>>  
>>  	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>>  	status = readl(addr);
>>  	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>> -		return false;
>> +		return PCI_ERS_RESULT_NONE;
>>  
>>  	/* If multiple errors, log header points to first error from ctrl reg */
>>  	if (hweight32(status) > 1) {
>> @@ -733,7 +733,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  
>>  	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>>  
>> -	return true;
>> +	return PCI_ERS_RESULT_PANIC;
>>  }
>>  
>>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>> @@ -782,6 +782,79 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>>  }
>>  
>> +static int match_uport(struct device *dev, const void *data)
>> +{
>> +	const struct device *uport_dev = data;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_port(dev))
>> +		return 0;
>> +
>> +	port = to_cxl_port(dev);
>> +
>> +	return port->uport_dev == uport_dev;
>> +}
>> +
>> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
>> +{
>> +	void __iomem *ras_base;
>> +
>> +	if (!pdev || !*dev) {
>> +		pr_err("Failed, parameter is NULL");
>> +		return NULL;
>> +	}
>> +
>> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
>> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
> Can probably just do a switch block here for the type?
>
>> +		struct cxl_port *port __free(put_cxl_port);
>> +		struct cxl_dport *dport = NULL;
>> +
>> +		port = find_cxl_port(&pdev->dev, &dport);
> Just declare port inline:
>
> struct cxl_port *port __free(put_cxl_port) =
> 		find_cxl_port(&pdev->dev, &dport);
>
>> +		if (!port) {
>> +			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
>> +			return NULL;
>> +		}
>> +
>> +		ras_base = dport ? dport->regs.ras : NULL;
>> +		*dev = &port->dev;
>> +	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
>> +		struct device *port_dev __free(put_device);
> same comment here as above
>
> DJ

Thanks Dave, I'll incorporate these changes into v8.

Terry

>> +		struct cxl_port *port;
>> +
>> +		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
>> +					   match_uport);
>> +		if (!port_dev || !is_cxl_port(port_dev)) {
>> +			pci_err(pdev, "Failed to find uport in CXL topology\n");
>> +			return NULL;
>> +		}
>> +
>> +		port = to_cxl_port(port_dev);
>> +		ras_base = port ? port->uport_regs.ras : NULL;
>> +		*dev = port_dev;
>> +	} else {
>> +		pci_err(pdev, "Unsupported device type\n");
>> +		ras_base = NULL;
>> +	}
>> +
>> +	return ras_base;
>> +}
>> +
>> +static void cxl_port_cor_error_detected(struct pci_dev *pdev)
>> +{
>> +	struct device *dev;
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
>> +
>> +	__cxl_handle_cor_ras(dev, ras_base);
>> +}
>> +
>> +static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
>> +{
>> +	struct device *dev;
>> +	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
>> +
>> +	return __cxl_handle_ras(dev, ras_base);
>> +}
>> +
>>  void cxl_uport_init_ras_reporting(struct cxl_port *port)
>>  {
>>
Dan Williams Feb. 14, 2025, 2:18 a.m. UTC | #3
Terry Bowman wrote:
> Introduce correctable and uncorrectable (UCE) CXL PCIe Port Protocol Error
> handlers.
> 
> The handlers will be called with a 'struct pci_dev' parameter
> indicating the CXL Port device requiring handling. The CXL PCIe Port
> device's underlying 'struct device' will match the port device in the
> CXL topology.
> 
> Use the PCIe Port's device object to find the matching CXL Upstream Switch
> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
> matching CXL Port device should contain a cached reference to the RAS
> register block. The cached RAS block will be used in handling the error.
> 
> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
> a reference to the RAS registers as a parameter. These functions will use
> the RAS register reference to indicate an error and clear the device's RAS
> status.
> 
> Update __cxl_handle_ras() to return PCI_ERS_RESULT_PANIC in the case
> an error is present in the RAS status. Otherwise, return
> PCI_ERS_RESULT_NONE.

So I have been having this nagging feeling while reviewing this set that
perhaps the CXL error handlers should not be 'struct pci_error_handlers'
relative to a 'struct pci_driver', but instead 'struct
cxl_error_handlers' that are added to 'struct cxl_driver', in particular
'cxl_port_driver'.

See below for what I *think* are insurmountable problems when a PCI
error handler is tasked with looking up @ras_base in a race free manner.
Note I say "think" because I could be misreading or missing some other
circumstance that makes this ok, so do please challenge if you think I
missed something because what follows below is another major direction
change.
  
> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
> ---
>  drivers/cxl/core/pci.c | 81 +++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 77 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index af809e7cbe3b..3f13d9dfb610 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -699,7 +699,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
>   * Log the state of the RAS status registers and prepare them to log the
>   * next error status. Return 1 if reset needed.
>   */
> -static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
> +static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  {
>  	u32 hl[CXL_HEADERLOG_SIZE_U32];
>  	void __iomem *addr;
> @@ -708,13 +708,13 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  
>  	if (!ras_base) {
>  		dev_warn_once(dev, "CXL RAS register block is not mapped");
> -		return false;
> +		return PCI_ERS_RESULT_NONE;
>  	}
>  
>  	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>  	status = readl(addr);
>  	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
> -		return false;
> +		return PCI_ERS_RESULT_NONE;
>  
>  	/* If multiple errors, log header points to first error from ctrl reg */
>  	if (hweight32(status) > 1) {
> @@ -733,7 +733,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>  
>  	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>  
> -	return true;
> +	return PCI_ERS_RESULT_PANIC;
>  }
>  
>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
> @@ -782,6 +782,79 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>  }
>  
> +static int match_uport(struct device *dev, const void *data)
> +{
> +	const struct device *uport_dev = data;
> +	struct cxl_port *port;
> +
> +	if (!is_cxl_port(dev))
> +		return 0;
> +
> +	port = to_cxl_port(dev);
> +
> +	return port->uport_dev == uport_dev;
> +}
> +
> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
> +{
> +	void __iomem *ras_base;
> +
> +	if (!pdev || !*dev) {
> +		pr_err("Failed, parameter is NULL");
> +		return NULL;
> +	}
> +
> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
> +		struct cxl_port *port __free(put_cxl_port);
> +		struct cxl_dport *dport = NULL;
> +
> +		port = find_cxl_port(&pdev->dev, &dport);

side comment: please always declare and assign scope-based-cleanup
variables on the same line, i.e.:

        struct cxl_port *port __free(put_cxl_port) =
                find_cxl_port(&pdev->dev, &dport);

Yes, that means violating the coding-style rule of preferring variable
declarations at the top of blocks. This is for 2 reasons:

* The variable is uninitialized. If future refactoring injects an early
  exit then unitialized garbage gets passed to put_cxl_port().

* The cosmetic order of the declaration is not the unwind order. If
  future refactoring introduces other scope-based-cleanup variables it
  requires additional cleanup to move the declaration to satisfy unwind
  dependencies.

> +		if (!port) {
> +			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
> +			return NULL;
> +		}
> +
> +		ras_base = dport ? dport->regs.ras : NULL;
> +		*dev = &port->dev;

Ok, so here is where the trouble I was alluding to earlier begins. At
this point we leave this scope which means @port will have its reference
dropped and may be freed by the time the caller tries to use it.

Additionally, @ras_base is only valid while @port->dev.driver is set. In
this set, cxl_do_recovery() is only holding the device lock of @pdev
which means nothing synchronizes against @ras_base pointing to garbage
because a cxl_port was unbound / unplugged / disabled while error
recovery was running.

Both of those problems go away if upon entry to ->error_detected() it
can already be assumed that the context holds both a 'struct cxl_port'
object reference, and the device_lock() for that object.

As for how to fix it, one idea is to have the AER core post CXL events
to their own fifo for the CXL core to handle. Something like have
aer_isr_one_error(), upon detection of an internal error on a CXL port
device, post the 'struct aer_err_source' to a new kfifo and wake up a
CXL core thread to run cxl_do_recovery() against the CXL port device
topology instead of the PCI device topology.

Essentially, the main point of cxl_do_recovery() is the acknowledgement
that the PCI core does not have the context to judge the severity of
CXL events, or fully annotate events with all the potential kernel
objects impacted by an event. It is also the case that we need a common
landing spot for PCI AER notified CXL error events and ACPI GHES
notified CXL CPER records. So both PCI AER, and CPER notified errors
need to end up in the same cxl_do_recovery() path that walks the CXL
port topology.

The CXL Type-2 series is showing uptake on accelerators registering
'struct cxl_memdev' objects to report their CXL.mem capabilities. I
imagine that effort would eventually end up with a scheme that
accelerators can register a cxl_error_handler instance with that memdev
to get involved in potentially recovering CXL.mem errors. For example,
it may be the case that CXL error isolation finally has a viable use
case when the accelerator knows it is the only device impacted by an
isolation event and can safely reset that entire host-bridge to recover.
That is difficult to achieve in the PCI error handler context.
Bowman, Terry Feb. 14, 2025, 9:43 p.m. UTC | #4
On 2/13/2025 8:18 PM, Dan Williams wrote:
> Terry Bowman wrote:
>> Introduce correctable and uncorrectable (UCE) CXL PCIe Port Protocol Error
>> handlers.
>>
>> The handlers will be called with a 'struct pci_dev' parameter
>> indicating the CXL Port device requiring handling. The CXL PCIe Port
>> device's underlying 'struct device' will match the port device in the
>> CXL topology.
>>
>> Use the PCIe Port's device object to find the matching CXL Upstream Switch
>> Port, CXL Downstream Switch Port, or CXL Root Port in the CXL topology. The
>> matching CXL Port device should contain a cached reference to the RAS
>> register block. The cached RAS block will be used in handling the error.
>>
>> Invoke the existing __cxl_handle_ras() or __cxl_handle_cor_ras() using
>> a reference to the RAS registers as a parameter. These functions will use
>> the RAS register reference to indicate an error and clear the device's RAS
>> status.
>>
>> Update __cxl_handle_ras() to return PCI_ERS_RESULT_PANIC in the case
>> an error is present in the RAS status. Otherwise, return
>> PCI_ERS_RESULT_NONE.
> So I have been having this nagging feeling while reviewing this set that
> perhaps the CXL error handlers should not be 'struct pci_error_handlers'
> relative to a 'struct pci_driver', but instead 'struct
> cxl_error_handlers' that are added to 'struct cxl_driver', in particular
> 'cxl_port_driver'.
>
> See below for what I *think* are insurmountable problems when a PCI
> error handler is tasked with looking up @ras_base in a race free manner.
> Note I say "think" because I could be misreading or missing some other
> circumstance that makes this ok, so do please challenge if you think I
> missed something because what follows below is another major direction
> change.
>   
>> Signed-off-by: Terry Bowman <terry.bowman@amd.com>
>> ---
>>  drivers/cxl/core/pci.c | 81 +++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 77 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
>> index af809e7cbe3b..3f13d9dfb610 100644
>> --- a/drivers/cxl/core/pci.c
>> +++ b/drivers/cxl/core/pci.c
>> @@ -699,7 +699,7 @@ static void header_log_copy(void __iomem *ras_base, u32 *log)
>>   * Log the state of the RAS status registers and prepare them to log the
>>   * next error status. Return 1 if reset needed.
>>   */
>> -static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>> +static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  {
>>  	u32 hl[CXL_HEADERLOG_SIZE_U32];
>>  	void __iomem *addr;
>> @@ -708,13 +708,13 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  
>>  	if (!ras_base) {
>>  		dev_warn_once(dev, "CXL RAS register block is not mapped");
>> -		return false;
>> +		return PCI_ERS_RESULT_NONE;
>>  	}
>>  
>>  	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
>>  	status = readl(addr);
>>  	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
>> -		return false;
>> +		return PCI_ERS_RESULT_NONE;
>>  
>>  	/* If multiple errors, log header points to first error from ctrl reg */
>>  	if (hweight32(status) > 1) {
>> @@ -733,7 +733,7 @@ static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
>>  
>>  	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
>>  
>> -	return true;
>> +	return PCI_ERS_RESULT_PANIC;
>>  }
>>  
>>  static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
>> @@ -782,6 +782,79 @@ static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
>>  	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
>>  }
>>  
>> +static int match_uport(struct device *dev, const void *data)
>> +{
>> +	const struct device *uport_dev = data;
>> +	struct cxl_port *port;
>> +
>> +	if (!is_cxl_port(dev))
>> +		return 0;
>> +
>> +	port = to_cxl_port(dev);
>> +
>> +	return port->uport_dev == uport_dev;
>> +}
>> +
>> +static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
>> +{
>> +	void __iomem *ras_base;
>> +
>> +	if (!pdev || !*dev) {
>> +		pr_err("Failed, parameter is NULL");
>> +		return NULL;
>> +	}
>> +
>> +	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
>> +	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
>> +		struct cxl_port *port __free(put_cxl_port);
>> +		struct cxl_dport *dport = NULL;
>> +
>> +		port = find_cxl_port(&pdev->dev, &dport);
> side comment: please always declare and assign scope-based-cleanup
> variables on the same line, i.e.:
>
>         struct cxl_port *port __free(put_cxl_port) =
>                 find_cxl_port(&pdev->dev, &dport);
>
> Yes, that means violating the coding-style rule of preferring variable
> declarations at the top of blocks. This is for 2 reasons:
>
> * The variable is uninitialized. If future refactoring injects an early
>   exit then unitialized garbage gets passed to put_cxl_port().
>
> * The cosmetic order of the declaration is not the unwind order. If
>   future refactoring introduces other scope-based-cleanup variables it
>   requires additional cleanup to move the declaration to satisfy unwind
>   dependencies.
Got it. Thanks for pointing out.

>> +		if (!port) {
>> +			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
>> +			return NULL;
>> +		}
>> +
>> +		ras_base = dport ? dport->regs.ras : NULL;
>> +		*dev = &port->dev;
> Ok, so here is where the trouble I was alluding to earlier begins. At
> this point we leave this scope which means @port will have its reference
> dropped and may be freed by the time the caller tries to use it.
>
> Additionally, @ras_base is only valid while @port->dev.driver is set. In
> this set, cxl_do_recovery() is only holding the device lock of @pdev
> which means nothing synchronizes against @ras_base pointing to garbage
> because a cxl_port was unbound / unplugged / disabled while error
> recovery was running.
>
> Both of those problems go away if upon entry to ->error_detected() it
> can already be assumed that the context holds both a 'struct cxl_port'
> object reference, and the device_lock() for that object.

I think the question is will there be much gained by taking the lock
earlier? The difference between the current implementation and the
proposed would be when the reference (or lock) is taken: cxl_report_error()
or cxl_port_error_detected()/cxl_port_cor_error_detected(). It's only a
few function calls difference but the biggest difference is in the CXL
topology search without reference or lock protection (you point at here).

> As for how to fix it, one idea is to have the AER core post CXL events
> to their own fifo for the CXL core to handle. Something like have
> aer_isr_one_error(), upon detection of an internal error on a CXL port
> device, post the 'struct aer_err_source' to a new kfifo and wake up a
> CXL core thread to run cxl_do_recovery() against the CXL port device
> topology instead of the PCI device topology.
>
> Essentially, the main point of cxl_do_recovery() is the acknowledgement
> that the PCI core does not have the context to judge the severity of
> CXL events, or fully annotate events with all the potential kernel
> objects impacted by an event. It is also the case that we need a common
> landing spot for PCI AER notified CXL error events and ACPI GHES
> notified CXL CPER records. So both PCI AER, and CPER notified errors
> need to end up in the same cxl_do_recovery() path that walks the CXL
> port topology.
Understood, it would fold in the GHES CPER too.
> The CXL Type-2 series is showing uptake on accelerators registering
> 'struct cxl_memdev' objects to report their CXL.mem capabilities. I
> imagine that effort would eventually end up with a scheme that
> accelerators can register a cxl_error_handler instance with that memdev
> to get involved in potentially recovering CXL.mem errors. For example,
> it may be the case that CXL error isolation finally has a viable use
> case when the accelerator knows it is the only device impacted by an
> isolation event and can safely reset that entire host-bridge to recover.
> That is difficult to achieve in the PCI error handler context.
Which directory do you see the CXL error handling and support landing
in: pci/pcie/ or cxl/core/ or elsewhere ?

Should we consider submitting this patchset first and then adding the CXL
kfifo changes you mention? It sounds like we need this for FW-first and
could be reused to solve the OS-first issue (time without a lock).

Or, if you like I can start to add the CXL kfifo changes now.

Terry
Dan Williams Feb. 15, 2025, 12:20 a.m. UTC | #5
Bowman, Terry wrote:
[..]
> > Ok, so here is where the trouble I was alluding to earlier begins. At
> > this point we leave this scope which means @port will have its reference
> > dropped and may be freed by the time the caller tries to use it.
> >
> > Additionally, @ras_base is only valid while @port->dev.driver is set. In
> > this set, cxl_do_recovery() is only holding the device lock of @pdev
> > which means nothing synchronizes against @ras_base pointing to garbage
> > because a cxl_port was unbound / unplugged / disabled while error
> > recovery was running.
> >
> > Both of those problems go away if upon entry to ->error_detected() it
> > can already be assumed that the context holds both a 'struct cxl_port'
> > object reference, and the device_lock() for that object.
> 
> I think the question is will there be much gained by taking the lock
> earlier? The difference between the current implementation and the
> proposed would be when the reference (or lock) is taken: cxl_report_error()
> or cxl_port_error_detected()/cxl_port_cor_error_detected(). It's only a
> few function calls difference but the biggest difference is in the CXL
> topology search without reference or lock protection (you point at here).

My point is that this series is holding the *wrong* device_lock():

I.e.:

> +static int cxl_report_error_detected(struct pci_dev *dev, void *data)
> +{
> +       const struct cxl_error_handlers *cxl_err_handler;
> +       pci_ers_result_t vote, *result = data;
> +       struct pci_driver *pdrv;
> +
> +       device_lock(&dev->dev);

This lock against the PCI device does nothing to protect against unbind events for the cxl_port
object...

> +       pdrv = dev->driver; 
> +       if (!pdrv || !pdrv->cxl_err_handler ||
> +           !pdrv->cxl_err_handler->error_detected) 
> +               goto out;
> +
> +       cxl_err_handler = pdrv->cxl_err_handler;
> +       vote = cxl_err_handler->error_detected(dev);

...subsequently any usage of @ras_base in this ->error_detected() is
racy.

> +       *result = merge_result(*result, vote); 
> +out:
> +       device_unlock(&dev->dev); 

[..]
> Which directory do you see the CXL error handling and support landing
> in: pci/pcie/ or cxl/core/ or elsewhere ?

In cxl/core/, that's the only place that understands CXL port topology
and the lifetime rules for dport RAS register mappings.

> Should we consider submitting this patchset first and then adding the CXL
> kfifo changes you mention? It sounds like we need this for FW-first and
> could be reused to solve the OS-first issue (time without a lock).

The problem is that the PCI core is always built-in and the CXL core is
modular. Without a kfifo() and a registration scheme the CXL core could
not remain modular.

> Or, if you like I can start to add the CXL kfifo changes now.

I feel like there's enough examples of kfifo in error handling to make
this not too burdensome, but let me know if you disagree. Otherwise,
would need to spend the time to figure out how to keep the test
environment functioning (cxl-test depends on modular core builds).
Bowman, Terry Feb. 18, 2025, 3:33 p.m. UTC | #6
On 2/14/2025 6:20 PM, Dan Williams wrote:
> Bowman, Terry wrote:
> [..]
>>> Ok, so here is where the trouble I was alluding to earlier begins. At
>>> this point we leave this scope which means @port will have its reference
>>> dropped and may be freed by the time the caller tries to use it.
>>>
>>> Additionally, @ras_base is only valid while @port->dev.driver is set. In
>>> this set, cxl_do_recovery() is only holding the device lock of @pdev
>>> which means nothing synchronizes against @ras_base pointing to garbage
>>> because a cxl_port was unbound / unplugged / disabled while error
>>> recovery was running.
>>>
>>> Both of those problems go away if upon entry to ->error_detected() it
>>> can already be assumed that the context holds both a 'struct cxl_port'
>>> object reference, and the device_lock() for that object.
>> I think the question is will there be much gained by taking the lock
>> earlier? The difference between the current implementation and the
>> proposed would be when the reference (or lock) is taken: cxl_report_error()
>> or cxl_port_error_detected()/cxl_port_cor_error_detected(). It's only a
>> few function calls difference but the biggest difference is in the CXL
>> topology search without reference or lock protection (you point at here).
> My point is that this series is holding the *wrong* device_lock():
>
> I.e.:
>
>> +static int cxl_report_error_detected(struct pci_dev *dev, void *data)
>> +{
>> +       const struct cxl_error_handlers *cxl_err_handler;
>> +       pci_ers_result_t vote, *result = data;
>> +       struct pci_driver *pdrv;
>> +
>> +       device_lock(&dev->dev);
> This lock against the PCI device does nothing to protect against unbind events for the cxl_port
> object...

I'll update to use the CXL device instead of the PCI device.

>> +       pdrv = dev->driver; 
>> +       if (!pdrv || !pdrv->cxl_err_handler ||
>> +           !pdrv->cxl_err_handler->error_detected) 
>> +               goto out;
>> +
>> +       cxl_err_handler = pdrv->cxl_err_handler;
>> +       vote = cxl_err_handler->error_detected(dev);
> ...subsequently any usage of @ras_base in this ->error_detected() is
> racy.
>
>> +       *result = merge_result(*result, vote); 
>> +out:
>> +       device_unlock(&dev->dev); 
> [..]
>> Which directory do you see the CXL error handling and support landing
>> in: pci/pcie/ or cxl/core/ or elsewhere ?
> In cxl/core/, that's the only place that understands CXL port topology
> and the lifetime rules for dport RAS register mappings.
>
>> Should we consider submitting this patchset first and then adding the CXL
>> kfifo changes you mention? It sounds like we need this for FW-first and
>> could be reused to solve the OS-first issue (time without a lock).
> The problem is that the PCI core is always built-in and the CXL core is
> modular. Without a kfifo() and a registration scheme the CXL core could
> not remain modular.
>
>> Or, if you like I can start to add the CXL kfifo changes now.
> I feel like there's enough examples of kfifo in error handling to make
> this not too burdensome, but let me know if you disagree. Otherwise,
> would need to spend the time to figure out how to keep the test
> environment functioning (cxl-test depends on modular core builds).

Thanks for the feedback. Yes, there are several examples and Smita is using for
FW-first as well. Correct me if I'm wrong but the goal in this case is
for the FW-first and OS-first to use the same kfifo.

Terry
Dan Williams Feb. 18, 2025, 5:15 p.m. UTC | #7
Bowman, Terry wrote:
[..]
> >> Or, if you like I can start to add the CXL kfifo changes now.
> > I feel like there's enough examples of kfifo in error handling to make
> > this not too burdensome, but let me know if you disagree. Otherwise,
> > would need to spend the time to figure out how to keep the test
> > environment functioning (cxl-test depends on modular core builds).
> 
> Thanks for the feedback. Yes, there are several examples and Smita is using for
> FW-first as well. Correct me if I'm wrong but the goal in this case is
> for the FW-first and OS-first to use the same kfifo.

Not necessarily, the leaf handlers should unify around
cxl_do_recovery(), however you will notice that CPER PCIe AER events get
routed to the aer_recover_ring kfifo and native PCIe AER events get
collected by the aer_rpc.aer_fifo.  Both of those route to
pcie_do_recovery() on the backend.

The CXL kfifo is to allow the CXL subsystem to remain modular and only
minimally burden the PCIe AER flow with just enough logic to determine
"not a PCIe AER event".
diff mbox series

Patch

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index af809e7cbe3b..3f13d9dfb610 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -699,7 +699,7 @@  static void header_log_copy(void __iomem *ras_base, u32 *log)
  * Log the state of the RAS status registers and prepare them to log the
  * next error status. Return 1 if reset needed.
  */
-static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
+static pci_ers_result_t __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
 {
 	u32 hl[CXL_HEADERLOG_SIZE_U32];
 	void __iomem *addr;
@@ -708,13 +708,13 @@  static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
 
 	if (!ras_base) {
 		dev_warn_once(dev, "CXL RAS register block is not mapped");
-		return false;
+		return PCI_ERS_RESULT_NONE;
 	}
 
 	addr = ras_base + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
 	status = readl(addr);
 	if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
-		return false;
+		return PCI_ERS_RESULT_NONE;
 
 	/* If multiple errors, log header points to first error from ctrl reg */
 	if (hweight32(status) > 1) {
@@ -733,7 +733,7 @@  static bool __cxl_handle_ras(struct device *dev, void __iomem *ras_base)
 
 	writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
 
-	return true;
+	return PCI_ERS_RESULT_PANIC;
 }
 
 static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds)
@@ -782,6 +782,79 @@  static void cxl_disable_rch_root_ints(struct cxl_dport *dport)
 	writel(aer_cmd, aer_base + PCI_ERR_ROOT_COMMAND);
 }
 
+static int match_uport(struct device *dev, const void *data)
+{
+	const struct device *uport_dev = data;
+	struct cxl_port *port;
+
+	if (!is_cxl_port(dev))
+		return 0;
+
+	port = to_cxl_port(dev);
+
+	return port->uport_dev == uport_dev;
+}
+
+static void __iomem *cxl_pci_port_ras(struct pci_dev *pdev, struct device **dev)
+{
+	void __iomem *ras_base;
+
+	if (!pdev || !*dev) {
+		pr_err("Failed, parameter is NULL");
+		return NULL;
+	}
+
+	if ((pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT) ||
+	    (pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM)) {
+		struct cxl_port *port __free(put_cxl_port);
+		struct cxl_dport *dport = NULL;
+
+		port = find_cxl_port(&pdev->dev, &dport);
+		if (!port) {
+			pci_err(pdev, "Failed to find root/dport in CXL topology\n");
+			return NULL;
+		}
+
+		ras_base = dport ? dport->regs.ras : NULL;
+		*dev = &port->dev;
+	} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM) {
+		struct device *port_dev __free(put_device);
+		struct cxl_port *port;
+
+		port_dev = bus_find_device(&cxl_bus_type, NULL, &pdev->dev,
+					   match_uport);
+		if (!port_dev || !is_cxl_port(port_dev)) {
+			pci_err(pdev, "Failed to find uport in CXL topology\n");
+			return NULL;
+		}
+
+		port = to_cxl_port(port_dev);
+		ras_base = port ? port->uport_regs.ras : NULL;
+		*dev = port_dev;
+	} else {
+		pci_err(pdev, "Unsupported device type\n");
+		ras_base = NULL;
+	}
+
+	return ras_base;
+}
+
+static void cxl_port_cor_error_detected(struct pci_dev *pdev)
+{
+	struct device *dev;
+	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
+
+	__cxl_handle_cor_ras(dev, ras_base);
+}
+
+static pci_ers_result_t cxl_port_error_detected(struct pci_dev *pdev)
+{
+	struct device *dev;
+	void __iomem *ras_base = cxl_pci_port_ras(pdev, &dev);
+
+	return __cxl_handle_ras(dev, ras_base);
+}
+
 void cxl_uport_init_ras_reporting(struct cxl_port *port)
 {