Message ID | 20230622205523.85375-24-terry.bowman@amd.com |
---|---|
State | Superseded |
Headers | show |
Series | cxl/pci: Add support for RCH RAS error handling | expand |
On 6/22/23 13:55, Terry Bowman wrote: > RCH downstream port error logging is missing in the current CXL driver. The > missing AER and RAS error logging is needed for communicating driver error > details to userspace. Update the driver to include PCIe AER and CXL RAS > error logging. > > Add RCH downstream port error handling into the existing RCiEP handler. > The downstream port error handler is added to the RCiEP error handler > because the downstream port is implemented in a RCRB, is not PCI > enumerable, and as a result is not directly accessible to the PCI AER > root port driver. The AER root port driver calls the RCiEP handler for > handling RCD errors and RCH downstream port protocol errors. > > Update existing RCiEP correctable and uncorrectable handlers to also call > the RCH handler. The RCH handler will read the RCH AER registers, check for > error severity, and if an error exists will log using an existing kernel > AER trace routine. The RCH handler will also log downstream port RAS errors > if they exist. > > Co-developed-by: Robert Richter <rrichter@amd.com> > Signed-off-by: Robert Richter <rrichter@amd.com> > Signed-off-by: Terry Bowman <terry.bowman@amd.com> > Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> Reviewed-by: Dave Jiang <dave.jiang@intel.com> > --- > drivers/cxl/core/pci.c | 101 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 101 insertions(+) > > diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c > index 9cb39835e154..9e0eba5ccfc4 100644 > --- a/drivers/cxl/core/pci.c > +++ b/drivers/cxl/core/pci.c > @@ -5,6 +5,7 @@ > #include <linux/delay.h> > #include <linux/pci.h> > #include <linux/pci-doe.h> > +#include <linux/aer.h> > #include <cxlpci.h> > #include <cxlmem.h> > #include <cxl.h> > @@ -747,10 +748,107 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) > return __cxl_handle_ras(cxlds, cxlds->regs.ras); > } > > +#ifdef CONFIG_PCIEAER_CXL > + > +static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, > + struct cxl_dport *dport) > +{ > + return __cxl_handle_cor_ras(cxlds, dport->regs.ras); > +} > + > +static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, > + struct cxl_dport *dport) > +{ > + return __cxl_handle_ras(cxlds, dport->regs.ras); > +} > + > +/* > + * Copy the AER capability registers using 32 bit read accesses. > + * This is necessary because RCRB AER capability is MMIO mapped. Clear the > + * status after copying. > + * > + * @aer_base: base address of AER capability block in RCRB > + * @aer_regs: destination for copying AER capability > + */ > +static bool cxl_rch_get_aer_info(void __iomem *aer_base, > + struct aer_capability_regs *aer_regs) > +{ > + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); > + u32 *aer_regs_buf = (u32 *)aer_regs; > + int n; > + > + if (!aer_base) > + return false; > + > + /* Use readl() to guarantee 32-bit accesses */ > + for (n = 0; n < read_cnt; n++) > + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); > + > + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); > + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); > + > + return true; > +} > + > +/* Get AER severity. Return false if there is no error. */ > +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, > + int *severity) > +{ > + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { > + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) > + *severity = AER_FATAL; > + else > + *severity = AER_NONFATAL; > + return true; > + } > + > + if (aer_regs->cor_status & ~aer_regs->cor_mask) { > + *severity = AER_CORRECTABLE; > + return true; > + } > + > + return false; > +} > + > +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) > +{ > + struct pci_dev *pdev = to_pci_dev(cxlds->dev); > + struct aer_capability_regs aer_regs; > + struct cxl_dport *dport; > + struct cxl_port *port; > + int severity; > + > + port = cxl_pci_find_port(pdev, &dport); > + if (!port) > + return; > + > + put_device(&port->dev); > + > + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) > + return; > + > + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) > + return; > + > + pci_print_aer(pdev, severity, &aer_regs); > + > + if (severity == AER_CORRECTABLE) > + cxl_handle_rdport_cor_ras(cxlds, dport); > + else > + cxl_handle_rdport_ras(cxlds, dport); > +} > + > +#else > +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } > +#endif > + > void cxl_cor_error_detected(struct pci_dev *pdev) > { > struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); > > + if (cxlds->rcd) > + cxl_handle_rdport_errors(cxlds); > + > cxl_handle_endpoint_cor_ras(cxlds); > } > EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); > @@ -763,6 +861,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, > struct device *dev = &cxlmd->dev; > bool ue; > > + if (cxlds->rcd) > + cxl_handle_rdport_errors(cxlds); > + > /* > * A frozen channel indicates an impending reset which is fatal to > * CXL.mem operation, and will likely crash the system. On the off
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 9cb39835e154..9e0eba5ccfc4 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -5,6 +5,7 @@ #include <linux/delay.h> #include <linux/pci.h> #include <linux/pci-doe.h> +#include <linux/aer.h> #include <cxlpci.h> #include <cxlmem.h> #include <cxl.h> @@ -747,10 +748,107 @@ static bool cxl_handle_endpoint_ras(struct cxl_dev_state *cxlds) return __cxl_handle_ras(cxlds, cxlds->regs.ras); } +#ifdef CONFIG_PCIEAER_CXL + +static void cxl_handle_rdport_cor_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *dport) +{ + return __cxl_handle_cor_ras(cxlds, dport->regs.ras); +} + +static bool cxl_handle_rdport_ras(struct cxl_dev_state *cxlds, + struct cxl_dport *dport) +{ + return __cxl_handle_ras(cxlds, dport->regs.ras); +} + +/* + * Copy the AER capability registers using 32 bit read accesses. + * This is necessary because RCRB AER capability is MMIO mapped. Clear the + * status after copying. + * + * @aer_base: base address of AER capability block in RCRB + * @aer_regs: destination for copying AER capability + */ +static bool cxl_rch_get_aer_info(void __iomem *aer_base, + struct aer_capability_regs *aer_regs) +{ + int read_cnt = sizeof(struct aer_capability_regs) / sizeof(u32); + u32 *aer_regs_buf = (u32 *)aer_regs; + int n; + + if (!aer_base) + return false; + + /* Use readl() to guarantee 32-bit accesses */ + for (n = 0; n < read_cnt; n++) + aer_regs_buf[n] = readl(aer_base + n * sizeof(u32)); + + writel(aer_regs->uncor_status, aer_base + PCI_ERR_UNCOR_STATUS); + writel(aer_regs->cor_status, aer_base + PCI_ERR_COR_STATUS); + + return true; +} + +/* Get AER severity. Return false if there is no error. */ +static bool cxl_rch_get_aer_severity(struct aer_capability_regs *aer_regs, + int *severity) +{ + if (aer_regs->uncor_status & ~aer_regs->uncor_mask) { + if (aer_regs->uncor_status & PCI_ERR_ROOT_FATAL_RCV) + *severity = AER_FATAL; + else + *severity = AER_NONFATAL; + return true; + } + + if (aer_regs->cor_status & ~aer_regs->cor_mask) { + *severity = AER_CORRECTABLE; + return true; + } + + return false; +} + +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) +{ + struct pci_dev *pdev = to_pci_dev(cxlds->dev); + struct aer_capability_regs aer_regs; + struct cxl_dport *dport; + struct cxl_port *port; + int severity; + + port = cxl_pci_find_port(pdev, &dport); + if (!port) + return; + + put_device(&port->dev); + + if (!cxl_rch_get_aer_info(dport->regs.dport_aer, &aer_regs)) + return; + + if (!cxl_rch_get_aer_severity(&aer_regs, &severity)) + return; + + pci_print_aer(pdev, severity, &aer_regs); + + if (severity == AER_CORRECTABLE) + cxl_handle_rdport_cor_ras(cxlds, dport); + else + cxl_handle_rdport_ras(cxlds, dport); +} + +#else +static void cxl_handle_rdport_errors(struct cxl_dev_state *cxlds) { } +#endif + void cxl_cor_error_detected(struct pci_dev *pdev) { struct cxl_dev_state *cxlds = pci_get_drvdata(pdev); + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + cxl_handle_endpoint_cor_ras(cxlds); } EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL); @@ -763,6 +861,9 @@ pci_ers_result_t cxl_error_detected(struct pci_dev *pdev, struct device *dev = &cxlmd->dev; bool ue; + if (cxlds->rcd) + cxl_handle_rdport_errors(cxlds); + /* * A frozen channel indicates an impending reset which is fatal to * CXL.mem operation, and will likely crash the system. On the off