Message ID | 20180918235702.26573-6-keith.busch@intel.com (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | PCI error handling | expand |
On Tue, Sep 18, 2018 at 05:56:57PM -0600, Keith Busch wrote: > We don't need to be paranoid about the topology changing while handling an > error. If the device has changed in a hotplug capable slot, we can rely > on the presence detection handling to react to a changing topology. This > patch restores the fatal error handling behavior that existed before > merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with > removal and re-enumeration of devices"). 7e9084b3674 updated Documentation/PCI/pci-error-recovery.txt; doesn't this require another update to keep it matching the code? > Signed-off-by: Keith Busch <keith.busch@intel.com> > --- > drivers/pci/pci.h | 4 +-- > drivers/pci/pcie/aer.c | 12 +++++--- > drivers/pci/pcie/dpc.c | 4 +-- > drivers/pci/pcie/err.c | 75 ++++---------------------------------------------- > 4 files changed, 18 insertions(+), 77 deletions(-) > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 21bfa30db18d..5a96978d3403 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -425,8 +425,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) > #endif > > /* PCI error reporting and recovery */ > -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service); > -void pcie_do_nonfatal_recovery(struct pci_dev *dev); > +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, > + u32 service); > > bool pcie_wait_for_link(struct pci_dev *pdev, bool active); > #ifdef CONFIG_PCIEASPM > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index b4d14acee66d..6b59a23568f8 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -1010,9 +1010,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) > info->status); > pci_aer_clear_device_status(dev); > } else if (info->severity == AER_NONFATAL) > - pcie_do_nonfatal_recovery(dev); > + pcie_do_recovery(dev, pci_channel_io_normal, > + PCIE_PORT_SERVICE_AER); > else if (info->severity == AER_FATAL) > - pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER); > + pcie_do_recovery(dev, pci_channel_io_frozen, > + PCIE_PORT_SERVICE_AER); > pci_dev_put(dev); > } > > @@ -1048,9 +1050,11 @@ static void aer_recover_work_func(struct work_struct *work) > } > cper_print_aer(pdev, entry.severity, entry.regs); > if (entry.severity == AER_NONFATAL) > - pcie_do_nonfatal_recovery(pdev); > + pcie_do_recovery(pdev, pci_channel_io_normal, > + PCIE_PORT_SERVICE_AER); > else if (entry.severity == AER_FATAL) > - pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER); > + pcie_do_recovery(pdev, pci_channel_io_frozen, > + PCIE_PORT_SERVICE_AER); > pci_dev_put(pdev); > } > } > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index aacfb50eccfc..1ed07db8ea7d 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -169,7 +169,7 @@ static irqreturn_t dpc_handler(int irq, void *context) > > reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1; > ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5; > - dev_warn(dev, "DPC %s detected, remove downstream devices\n", > + dev_warn(dev, "DPC %s detected\n", > (reason == 0) ? "unmasked uncorrectable error" : > (reason == 1) ? "ERR_NONFATAL" : > (reason == 2) ? "ERR_FATAL" : > @@ -186,7 +186,7 @@ static irqreturn_t dpc_handler(int irq, void *context) > } > > /* We configure DPC so it only triggers on ERR_FATAL */ > - pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC); > + pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC); > > return IRQ_HANDLED; > } > diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c > index 62ab665f0f03..644f3f725ef0 100644 > --- a/drivers/pci/pcie/err.c > +++ b/drivers/pci/pcie/err.c > @@ -271,83 +271,20 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, > return result_data.result; > } > > -/** > - * pcie_do_fatal_recovery - handle fatal error recovery process > - * @dev: pointer to a pci_dev data structure of agent detecting an error > - * > - * Invoked when an error is fatal. Once being invoked, removes the devices > - * beneath this AER agent, followed by reset link e.g. secondary bus reset > - * followed by re-enumeration of devices. > - */ > -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) > -{ > - struct pci_dev *udev; > - struct pci_bus *parent; > - struct pci_dev *pdev, *temp; > - pci_ers_result_t result; > - > - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) > - udev = dev; > - else > - udev = dev->bus->self; > - > - parent = udev->subordinate; > - pci_walk_bus(parent, pci_dev_set_disconnected, NULL); > - > - pci_lock_rescan_remove(); > - pci_dev_get(dev); > - list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, > - bus_list) { > - pci_stop_and_remove_bus_device(pdev); > - } > - > - result = reset_link(udev, service); > - > - if ((service == PCIE_PORT_SERVICE_AER) && > - (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { > - /* > - * If the error is reported by a bridge, we think this error > - * is related to the downstream link of the bridge, so we > - * do error recovery on all subordinates of the bridge instead > - * of the bridge and clear the error status of the bridge. > - */ > - pci_aer_clear_fatal_status(dev); > - pci_aer_clear_device_status(dev); > - } > - > - if (result == PCI_ERS_RESULT_RECOVERED) { > - if (pcie_wait_for_link(udev, true)) > - pci_rescan_bus(udev->bus); > - pci_info(dev, "Device recovery from fatal error successful\n"); > - } else { > - pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); > - pci_info(dev, "Device recovery from fatal error failed\n"); > - } > - > - pci_dev_put(dev); > - pci_unlock_rescan_remove(); > -} > - > -/** > - * pcie_do_nonfatal_recovery - handle nonfatal error recovery process > - * @dev: pointer to a pci_dev data structure of agent detecting an error > - * > - * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast > - * error detected message to all downstream drivers within a hierarchy in > - * question and return the returned code. > - */ > -void pcie_do_nonfatal_recovery(struct pci_dev *dev) > +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, > + u32 service) > { > pci_ers_result_t status; > - enum pci_channel_state state; > - > - state = pci_channel_io_normal; > > status = broadcast_error_message(dev, > state, > "error_detected", > report_error_detected); > > + if (state == pci_channel_io_frozen && > + reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) > + goto failed; > + > if (status == PCI_ERS_RESULT_CAN_RECOVER) > status = broadcast_error_message(dev, > state, > -- > 2.14.4 >
On Wed, Sep 19, 2018 at 10:46:51AM -0500, Bjorn Helgaas wrote: > On Tue, Sep 18, 2018 at 05:56:57PM -0600, Keith Busch wrote: > > We don't need to be paranoid about the topology changing while handling an > > error. If the device has changed in a hotplug capable slot, we can rely > > on the presence detection handling to react to a changing topology. This > > patch restores the fatal error handling behavior that existed before > > merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with > > removal and re-enumeration of devices"). > > 7e9084b3674 updated Documentation/PCI/pci-error-recovery.txt; doesn't > this require another update to keep it matching the code? Indeed, I will fix that.
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 21bfa30db18d..5a96978d3403 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -425,8 +425,8 @@ static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) #endif /* PCI error reporting and recovery */ -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service); -void pcie_do_nonfatal_recovery(struct pci_dev *dev); +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, + u32 service); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); #ifdef CONFIG_PCIEASPM diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index b4d14acee66d..6b59a23568f8 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1010,9 +1010,11 @@ static void handle_error_source(struct pci_dev *dev, struct aer_err_info *info) info->status); pci_aer_clear_device_status(dev); } else if (info->severity == AER_NONFATAL) - pcie_do_nonfatal_recovery(dev); + pcie_do_recovery(dev, pci_channel_io_normal, + PCIE_PORT_SERVICE_AER); else if (info->severity == AER_FATAL) - pcie_do_fatal_recovery(dev, PCIE_PORT_SERVICE_AER); + pcie_do_recovery(dev, pci_channel_io_frozen, + PCIE_PORT_SERVICE_AER); pci_dev_put(dev); } @@ -1048,9 +1050,11 @@ static void aer_recover_work_func(struct work_struct *work) } cper_print_aer(pdev, entry.severity, entry.regs); if (entry.severity == AER_NONFATAL) - pcie_do_nonfatal_recovery(pdev); + pcie_do_recovery(pdev, pci_channel_io_normal, + PCIE_PORT_SERVICE_AER); else if (entry.severity == AER_FATAL) - pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_AER); + pcie_do_recovery(pdev, pci_channel_io_frozen, + PCIE_PORT_SERVICE_AER); pci_dev_put(pdev); } } diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index aacfb50eccfc..1ed07db8ea7d 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -169,7 +169,7 @@ static irqreturn_t dpc_handler(int irq, void *context) reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1; ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5; - dev_warn(dev, "DPC %s detected, remove downstream devices\n", + dev_warn(dev, "DPC %s detected\n", (reason == 0) ? "unmasked uncorrectable error" : (reason == 1) ? "ERR_NONFATAL" : (reason == 2) ? "ERR_FATAL" : @@ -186,7 +186,7 @@ static irqreturn_t dpc_handler(int irq, void *context) } /* We configure DPC so it only triggers on ERR_FATAL */ - pcie_do_fatal_recovery(pdev, PCIE_PORT_SERVICE_DPC); + pcie_do_recovery(pdev, pci_channel_io_frozen, PCIE_PORT_SERVICE_DPC); return IRQ_HANDLED; } diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c index 62ab665f0f03..644f3f725ef0 100644 --- a/drivers/pci/pcie/err.c +++ b/drivers/pci/pcie/err.c @@ -271,83 +271,20 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev, return result_data.result; } -/** - * pcie_do_fatal_recovery - handle fatal error recovery process - * @dev: pointer to a pci_dev data structure of agent detecting an error - * - * Invoked when an error is fatal. Once being invoked, removes the devices - * beneath this AER agent, followed by reset link e.g. secondary bus reset - * followed by re-enumeration of devices. - */ -void pcie_do_fatal_recovery(struct pci_dev *dev, u32 service) -{ - struct pci_dev *udev; - struct pci_bus *parent; - struct pci_dev *pdev, *temp; - pci_ers_result_t result; - - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) - udev = dev; - else - udev = dev->bus->self; - - parent = udev->subordinate; - pci_walk_bus(parent, pci_dev_set_disconnected, NULL); - - pci_lock_rescan_remove(); - pci_dev_get(dev); - list_for_each_entry_safe_reverse(pdev, temp, &parent->devices, - bus_list) { - pci_stop_and_remove_bus_device(pdev); - } - - result = reset_link(udev, service); - - if ((service == PCIE_PORT_SERVICE_AER) && - (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) { - /* - * If the error is reported by a bridge, we think this error - * is related to the downstream link of the bridge, so we - * do error recovery on all subordinates of the bridge instead - * of the bridge and clear the error status of the bridge. - */ - pci_aer_clear_fatal_status(dev); - pci_aer_clear_device_status(dev); - } - - if (result == PCI_ERS_RESULT_RECOVERED) { - if (pcie_wait_for_link(udev, true)) - pci_rescan_bus(udev->bus); - pci_info(dev, "Device recovery from fatal error successful\n"); - } else { - pci_uevent_ers(dev, PCI_ERS_RESULT_DISCONNECT); - pci_info(dev, "Device recovery from fatal error failed\n"); - } - - pci_dev_put(dev); - pci_unlock_rescan_remove(); -} - -/** - * pcie_do_nonfatal_recovery - handle nonfatal error recovery process - * @dev: pointer to a pci_dev data structure of agent detecting an error - * - * Invoked when an error is nonfatal/fatal. Once being invoked, broadcast - * error detected message to all downstream drivers within a hierarchy in - * question and return the returned code. - */ -void pcie_do_nonfatal_recovery(struct pci_dev *dev) +void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state, + u32 service) { pci_ers_result_t status; - enum pci_channel_state state; - - state = pci_channel_io_normal; status = broadcast_error_message(dev, state, "error_detected", report_error_detected); + if (state == pci_channel_io_frozen && + reset_link(dev, service) != PCI_ERS_RESULT_RECOVERED) + goto failed; + if (status == PCI_ERS_RESULT_CAN_RECOVER) status = broadcast_error_message(dev, state,
We don't need to be paranoid about the topology changing while handling an error. If the device has changed in a hotplug capable slot, we can rely on the presence detection handling to react to a changing topology. This patch restores the fatal error handling behavior that existed before merging DPC with AER with 7e9084b3674 ("PCI/AER: Handle ERR_FATAL with removal and re-enumeration of devices"). Signed-off-by: Keith Busch <keith.busch@intel.com> --- drivers/pci/pci.h | 4 +-- drivers/pci/pcie/aer.c | 12 +++++--- drivers/pci/pcie/dpc.c | 4 +-- drivers/pci/pcie/err.c | 75 ++++---------------------------------------------- 4 files changed, 18 insertions(+), 77 deletions(-)