diff mbox series

[2/4] pci/aer: Handle Advisory Non-Fatal properly

Message ID 20240111073227.31488-3-qingshun.wang@linux.intel.com (mailing list archive)
State Superseded
Delegated to: Bjorn Helgaas
Headers show
Series pci/aer: Handle Advisory Non-Fatal properly | expand

Commit Message

Wang, Qingshun Jan. 11, 2024, 7:32 a.m. UTC
If we are processing an Advisory Non-Fatal Error, first check the Device
Status. If either the Fatal or the Non-Fatal Error Detected bit is set,
leave the UE status bits to the uncorrectable error handler, which should
be executed right after the CE handler in this case, to clear.

Otherwise, filter out the uncorrectable errors that cannot trigger an
Advisory Non-Fatal Error, then clear all the remaining status bits.

Reviewed-by: "Tsaur, Erwin" <erwin.tsaur@intel.com>
Signed-off-by: "Wang, Qingshun" <qingshun.wang@linux.intel.com>
---
 drivers/pci/pcie/aer.c | 58 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

Comments

Bjorn Helgaas Jan. 12, 2024, 4:35 p.m. UTC | #1
On Thu, Jan 11, 2024 at 03:32:17PM +0800, Wang, Qingshun wrote:
> If we are processing an Advisory Non-Fatal Error, first check the Device
> Status. If any of Fatal/Non-Fatal Error Detected bits is set, leave it
> to uncorrectable error handler to clear the UE status bit, which should
> be executed right after the CE handler in this case.
> 
> Otherwise, filter out uncorrectable errors that is not possible to
> trigger an Advisory Non-Fatal Error, then clear all the rest status bits.

> +static int anfe_get_related_err(struct aer_err_info *info)
> +{
> +	/*
> +	 * Take the most conservative route here. If there are
> +	 * Non-Fatal/Fatal errors detected, do not assume any
> +	 * bit in uncor_status is set by ANFE.
> +	 */
> +	if (info->device_status & (PCI_EXP_DEVSTA_NFED | PCI_EXP_DEVSTA_FED))
> +		return 0;
> +	/*
> +	 * An UNCOR error may cause Advisory Non-Fatal error if:
> +	 *	a. The severity of the error is Non-Fatal.
> +	 *	b. The error is one of the following:
> +	 *		1. Poisoned TLP
> +	 *		2. Completion Timeout
> +	 *		3. Completer Abort
> +	 *		4. Unexpected Completion
> +	 *		5. Unsupported Request

This could benefit from a reference to the spec that outlines these
conditions.

Bjorn
Wang, Qingshun Jan. 16, 2024, 8:42 a.m. UTC | #2
On Fri, Jan 12, 2024 at 10:35:26AM -0600, Bjorn Helgaas wrote:
> On Thu, Jan 11, 2024 at 03:32:17PM +0800, Wang, Qingshun wrote:
> > If we are processing an Advisory Non-Fatal Error, first check the Device
> > Status. If any of Fatal/Non-Fatal Error Detected bits is set, leave it
> > to uncorrectable error handler to clear the UE status bit, which should
> > be executed right after the CE handler in this case.
> > 
> > Otherwise, filter out uncorrectable errors that is not possible to
> > trigger an Advisory Non-Fatal Error, then clear all the rest status bits.
> 
> > +static int anfe_get_related_err(struct aer_err_info *info)
> > +{
> > +	/*
> > +	 * Take the most conservative route here. If there are
> > +	 * Non-Fatal/Fatal errors detected, do not assume any
> > +	 * bit in uncor_status is set by ANFE.
> > +	 */
> > +	if (info->device_status & (PCI_EXP_DEVSTA_NFED | PCI_EXP_DEVSTA_FED))
> > +		return 0;
> > +	/*
> > +	 * An UNCOR error may cause Advisory Non-Fatal error if:
> > +	 *	a. The severity of the error is Non-Fatal.
> > +	 *	b. The error is one of the following:
> > +	 *		1. Poisoned TLP
> > +	 *		2. Completion Timeout
> > +	 *		3. Completer Abort
> > +	 *		4. Unexpected Completion
> > +	 *		5. Unsupported Request
> 
> This could benefit from a reference to the spec that outlines these
> conditions.
Thanks for the suggestion. I will add a reference to the latest spec.
> 
> Bjorn

Best regards
Wang, Qingshun
diff mbox series

Patch

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9311323a2391..86e7cfd71f23 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -107,6 +107,12 @@  struct aer_stats {
 					PCI_ERR_ROOT_MULTI_COR_RCV |	\
 					PCI_ERR_ROOT_MULTI_UNCOR_RCV)
 
+#define AER_ERR_ANFE_UNC_MASK		(PCI_ERR_UNC_POISON_TLP |	\
+					PCI_ERR_UNC_COMP_TIME |		\
+					PCI_ERR_UNC_COMP_ABORT |	\
+					PCI_ERR_UNC_UNX_COMP |		\
+					PCI_ERR_UNC_UNSUP) /* UE bits that may signal ANFE */
+
 static int pcie_aer_disable;
 static pci_ers_result_t aer_root_reset(struct pci_dev *dev);
 
@@ -612,6 +618,29 @@  const struct attribute_group aer_stats_attr_group = {
 	.is_visible = aer_stats_attrs_are_visible,
 };
 
+static int anfe_get_related_err(struct aer_err_info *info)
+{
+	/*
+	 * Return the uncor_status bits that may have caused this Advisory
+	 * Non-Fatal error. Be conservative: if Non-Fatal/Fatal errors were
+	 * detected, do not assume any bit in uncor_status is set by ANFE.
+	 */
+	if (info->device_status & (PCI_EXP_DEVSTA_NFED | PCI_EXP_DEVSTA_FED))
+		return 0;
+	/*
+	 * An unmasked UNCOR error may cause Advisory Non-Fatal error if:
+	 *	a. The severity of the error is Non-Fatal.
+	 *	b. The error is one of the AER_ERR_ANFE_UNC_MASK bits:
+	 *	   Poisoned TLP, Completion Timeout, Completer Abort,
+	 *	   Unexpected Completion, Unsupported Request.
+	 * NOTE(review): __aer_print_error() compares info->severity with
+	 * AER_CORRECTABLE, so it may not hold the per-bit UE severity that
+	 * condition (a) needs -- confirm what ~info->severity filters here.
+	 */
+	return info->uncor_status & ~info->uncor_mask
+		& AER_ERR_ANFE_UNC_MASK & ~info->severity;
+}
+
 static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
 				   struct aer_err_info *info)
 {
@@ -678,6 +707,7 @@  static void __aer_print_error(struct pci_dev *dev,
 			      struct aer_err_info *info)
 {
 	unsigned long status;
+	unsigned long anfe_status;
 	const char **strings;
 	const char *level, *errmsg;
 	int i;
@@ -700,6 +730,21 @@  static void __aer_print_error(struct pci_dev *dev,
 		pci_printk(level, dev, "   [%2d] %-22s%s\n", i, errmsg,
 				info->first_error == i ? " (First)" : "");
 	}
+
+	if (info->severity == AER_CORRECTABLE && (status & PCI_ERR_COR_ADV_NFAT)) {
+		anfe_status = anfe_get_related_err(info);
+		if (anfe_status) {
+			pci_printk(level, dev, "Uncorrectable errors that may cause Advisory Non-Fatal:");
+			for_each_set_bit(i, &anfe_status, 32) {
+				errmsg = aer_uncorrectable_error_string[i];
+				if (!errmsg)
+					errmsg = "Unknown Error Bit";
+
+				pci_printk(level, dev, "   [%2d] %-22s\n", i, errmsg);
+			}
+		}
+	}
+
 	pci_dev_aer_stats_incr(dev, info);
 }
 
@@ -1092,6 +1137,14 @@  static inline void cxl_rch_handle_error(struct pci_dev *dev,
 					struct aer_err_info *info) { }
 #endif
 
+static void handle_advisory_nonfatal(struct pci_dev *dev, struct aer_err_info *info)
+{
+	int aer = dev->aer_cap;	/* base offset for the AER register access */
+
+	pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS,
+			       anfe_get_related_err(info)); /* ANFE-related bits only */
+}
+
 /**
  * pci_aer_handle_error - handle logging error into an event log
  * @dev: pointer to pci_dev data structure of error source device
@@ -1108,9 +1161,12 @@  static void pci_aer_handle_error(struct pci_dev *dev, struct aer_err_info *info)
 		 * Correctable error does not need software intervention.
 		 * No need to go through error recovery process.
 		 */
-		if (aer)
+		if (aer) {
 			pci_write_config_dword(dev, aer + PCI_ERR_COR_STATUS,
 					info->cor_status);
+			if (info->cor_status & PCI_ERR_COR_ADV_NFAT)
+				handle_advisory_nonfatal(dev, info);
+		}
 		if (pcie_aer_is_native(dev)) {
 			struct pci_driver *pdrv = dev->driver;