| Message ID | 59cb30f5e5ac6d65427ceaadf1012b2ba8dbf66c.1615606143.git.sathyanarayanan.kuppuswamy@linux.intel.com (mailing list archive) |
|---|---|
| State | New, archived |
| Delegated to | Bjorn Helgaas |
| Series | [v2,1/1] PCI: pciehp: Skip DLLSC handling if DPC is triggered |
On 3/12/21 7:32 PM, sathyanarayanan.kuppuswamy@linux.intel.com wrote:
> From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
>
> When hotplug and DPC are both enabled on a Root Port or Downstream Port,
> DPC events that take the link down and back up also generate DLLSC
> events. Such DLLSC events must be suppressed to let the DPC driver own
> the recovery path.
>
> When DPC is present and enabled, hardware puts the port into the
> containment state so that software can recover from the error condition
> in a seamless manner. But because the link is disabled during DPC error
> recovery, a DLLSC event is also raised. In the Linux kernel, DPC events
> are handled by the DPC driver and DLLSC events are handled by the
> hotplug driver. If the hotplug driver is allowed to handle a DLLSC event
> triggered by DPC containment, there is a race between the error recovery
> handler (in the DPC driver) and the hotplug handler in recovering the
> contained port. Allowing such a race leads to numerous stability issues
> while recovering the device. So skip DLLSC handling in the hotplug
> driver when the PCIe port associated with the hotplug event is in the
> DPC-triggered state, and let the DPC driver be responsible for the port
> recovery.
>
> Following is a sample dmesg log which shows the contention between the
> hotplug handler and the error recovery handler. In this case, the
> hotplug handler won the race and the error recovery handler reported
> failure.
>
> pcieport 0000:97:02.0: pciehp: Slot(4): Link Down
> pcieport 0000:97:02.0: DPC: containment event, status:0x1f01 source:0x0000
> pcieport 0000:97:02.0: DPC: unmasked uncorrectable error detected
> pcieport 0000:97:02.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID)
> pcieport 0000:97:02.0: device [8086:347a] error status/mask=00004000/00100020
> pcieport 0000:97:02.0: [14] CmpltTO (First)
> pci 0000:98:00.0: AER: can't recover (no error_detected callback)
> pcieport 0000:97:02.0: pciehp: Slot(4): Card present
> pcieport 0000:97:02.0: DPC: Data Link Layer Link Active not set in 1000 msec
> pcieport 0000:97:02.0: AER: subordinate device reset failed
> pcieport 0000:97:02.0: AER: device recovery failed
> pci 0000:98:00.0: [8086:0953] type 00 class 0x010802
> nvme nvme1: pci function 0000:98:00.0
> nvme 0000:98:00.0: enabling device (0140 -> 0142)
> nvme nvme1: 31/0/0 default/read/poll queues
> nvme1n2: p1
>
> Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
> Reviewed-by: Dan Williams <dan.j.williams@intel.com>
> Reviewed-by: Raj Ashok <ashok.raj@intel.com>
> ---

I missed adding the change log; I will include it in the next version.

Changes since v1:
* Trimmed down the kernel log in the commit history.
* Removed usage of !! in is_dpc_reset_active().
* Addressed other minor comments from Bjorn.
> drivers/pci/hotplug/pciehp_hpc.c | 19 +++++++++++++++++ > drivers/pci/pci.h | 2 ++ > drivers/pci/pcie/dpc.c | 36 ++++++++++++++++++++++++++++++-- > include/linux/pci.h | 1 + > 4 files changed, 56 insertions(+), 2 deletions(-) > > diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c > index fb3840e222ad..55da5208c7e5 100644 > --- a/drivers/pci/hotplug/pciehp_hpc.c > +++ b/drivers/pci/hotplug/pciehp_hpc.c > @@ -691,6 +691,25 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) > goto out; > } > > + /* > + * If the DLLSC link up/down event is generated due to DPC containment > + * in the PCIe port, skip the DLLSC event handling and let the DPC > + * driver own the port recovery. Allowing both hotplug DLLSC event > + * handler and DPC event trigger handler to attempt recovery on the > + * same port leads to stability issues. If DPC recovery is successful, > + * is_dpc_reset_active() will return false and the hotplug handler will > + * not suppress the DLLSC event. If DPC recovery fails and the link is > + * left in disabled state, once the user changes the faulty card, the > + * hotplug handler can still handle the PRESENCE change event and bring > + * the device back up. > + */ > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > + slot_name(ctrl)); > + ret = IRQ_HANDLED; > + goto out; > + } > + > /* Check Attention Button Pressed */ > if (events & PCI_EXP_SLTSTA_ABP) { > ctrl_info(ctrl, "Slot(%s): Attention button pressed\n", > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index ef7c4661314f..cee7095483bd 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -446,10 +446,12 @@ void pci_restore_dpc_state(struct pci_dev *dev); > void pci_dpc_init(struct pci_dev *pdev); > void dpc_process_error(struct pci_dev *pdev); > pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); > +bool is_dpc_reset_active(struct pci_dev *pdev); > #else > static inline void pci_save_dpc_state(struct pci_dev *dev) {} > static inline void pci_restore_dpc_state(struct pci_dev *dev) {} > static inline void pci_dpc_init(struct pci_dev *pdev) {} > +static inline bool is_dpc_reset_active(struct pci_dev *pdev) { return false; } > #endif > > #ifdef CONFIG_PCIEPORTBUS > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index e05aba86a317..9157d70ebe21 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -71,6 +71,33 @@ void pci_restore_dpc_state(struct pci_dev *dev) > pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); > } > > +bool is_dpc_reset_active(struct pci_dev *dev) > +{ > + struct pci_host_bridge *host = pci_find_host_bridge(dev->bus); > + u16 status; > + > + if (!dev->dpc_cap) > + return false; > + > + /* > + * If DPC is owned by firmware and EDR is not supported, there is > + * no race between hotplug and DPC recovery handler. So return > + * false. 
> + */ > + if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR)) > + return false; > + > + if (atomic_read_acquire(&dev->dpc_reset_active)) > + return true; > + > + pci_read_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_STATUS, &status); > + > + if (status & PCI_EXP_DPC_STATUS_TRIGGER) > + return true; > + > + return false; > +} > + > static int dpc_wait_rp_inactive(struct pci_dev *pdev) > { > unsigned long timeout = jiffies + HZ; > @@ -91,6 +118,7 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev) > > pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) > { > + pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED; > u16 cap; > > /* > @@ -109,15 +137,19 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) > if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev)) > return PCI_ERS_RESULT_DISCONNECT; > > + atomic_inc_return_acquire(&pdev->dpc_reset_active); > + > pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS, > PCI_EXP_DPC_STATUS_TRIGGER); > > if (!pcie_wait_for_link(pdev, true)) { > pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); > - return PCI_ERS_RESULT_DISCONNECT; > + status = PCI_ERS_RESULT_DISCONNECT; > } > > - return PCI_ERS_RESULT_RECOVERED; > + atomic_dec_return_release(&pdev->dpc_reset_active); > + > + return status; > } > > static void dpc_process_rp_pio_error(struct pci_dev *pdev) > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 86c799c97b77..3314f616520d 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -479,6 +479,7 @@ struct pci_dev { > u16 dpc_cap; > unsigned int dpc_rp_extensions:1; > u8 dpc_rp_log_size; > + atomic_t dpc_reset_active; /* DPC trigger is active */ > #endif > #ifdef CONFIG_PCI_ATS > union { >
On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > + slot_name(ctrl)); > + ret = IRQ_HANDLED; > + goto out; > + } Two problems here: (1) If recovery fails, the link will *remain* down, so there'll be no Link Up event. You've filtered the Link Down event, thus the slot will remain in ON_STATE even though the device in the slot is no longer accessible. That's not good, the slot should be brought down in this case. (2) If recovery succeeds, there's a race where pciehp may call is_dpc_reset_active() *after* dpc_reset_link() has finished. So both the DPC Trigger Status bit as well as pdev->dpc_reset_active will be cleared. Thus, the Link Up event is not filtered by pciehp and the slot is brought down and back up even though DPC recovery was succesful, which seems undesirable. The only viable solution I see is to wait until DPC has completed. Sinan (+cc) proposed something along those lines a couple years back: https://patchwork.ozlabs.org/project/linux-pci/patch/20180818065126.77912-1-okaya@kernel.org/ Included below please find my suggestion for how to fix this. I've sort of combined yours and Sinan's approach, but I'm using a waitqueue (Sinan used polling) and I'm using atomic bitops on pdev->priv_flags (you're using an atomic_t instead, which needs additionally space in struct pci_dev). Note: It's compile-tested only, I don't have any DPC-capable hardware at my disposal. Would this work for you? If so, I can add a commit message to the patch and submit it properly. Let me know what you think. Thanks! -- >8 -- Subject: [PATCH] PCI: pciehp: Ignore Link Down/Up caused by DPC Signed-off-by: Lukas Wunner <lukas@wunner.de> --- drivers/pci/hotplug/pciehp_hpc.c | 11 +++++++++ drivers/pci/pci.h | 4 ++++ drivers/pci/pcie/dpc.c | 51 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index fb3840e..bcc018e 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -707,6 +707,17 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) } /* + * Ignore Link Down/Up caused by Downstream Port Containment + * if recovery from the error succeeded. + */ + if ((events & PCI_EXP_SLTSTA_DLLSC) && pci_dpc_recovered(pdev) && + ctrl->state == ON_STATE) { + atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); + if (pciehp_check_link_active(ctrl) > 0) + events &= ~PCI_EXP_SLTSTA_DLLSC; + } + + /* * Disable requests have higher priority than Presence Detect Changed * or Data Link Layer State Changed events. 
*/ diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 9684b46..e5ae5e8 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -392,6 +392,8 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 +#define PCI_DPC_RECOVERED 1 +#define PCI_DPC_RECOVERING 2 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { @@ -446,10 +448,12 @@ struct rcec_ea { void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); +bool pci_dpc_recovered(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) {} static inline void pci_restore_dpc_state(struct pci_dev *dev) {} static inline void pci_dpc_init(struct pci_dev *pdev) {} +static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEPORTBUS diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index e05aba8..7328d9c4 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -71,6 +71,44 @@ void pci_restore_dpc_state(struct pci_dev *dev) pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); } +static bool dpc_completed(struct pci_dev *pdev) +{ + u16 status; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status); + if (status & PCI_EXP_DPC_STATUS_TRIGGER) + return false; + + if (test_bit(PCI_DPC_RECOVERING, &pdev->priv_flags)) + return false; + + return true; +} + +static DECLARE_WAIT_QUEUE_HEAD(dpc_completed_waitqueue); + +bool pci_dpc_recovered(struct pci_dev *pdev) +{ + struct pci_host_bridge *host; + + if (!pdev->dpc_cap) + return false; + + /* + * If DPC is owned by firmware and EDR is not supported, there is + * no race between hotplug and DPC recovery handler. So return + * false. + */ + host = pci_find_host_bridge(pdev->bus); + if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR)) + return false; + + wait_event_timeout(dpc_completed_waitqueue, dpc_completed(pdev), + msecs_to_jiffies(5000)); + + return test_and_clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); +} + static int dpc_wait_rp_inactive(struct pci_dev *pdev) { unsigned long timeout = jiffies + HZ; @@ -91,8 +129,12 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev) pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) { + pci_ers_result_t ret; u16 cap; + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + set_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + /* * DPC disables the Link automatically in hardware, so it has * already been reset by the time we get here. @@ -114,10 +156,15 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) if (!pcie_wait_for_link(pdev, true)) { pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); - return PCI_ERS_RESULT_DISCONNECT; + ret = PCI_ERS_RESULT_DISCONNECT; + } else { + set_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); + ret = PCI_ERS_RESULT_RECOVERED; } - return PCI_ERS_RESULT_RECOVERED; + clear_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); + wake_up_all(&dpc_completed_waitqueue); + return ret; } static void dpc_process_rp_pio_error(struct pci_dev *pdev)
On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > + slot_name(ctrl)); > > + ret = IRQ_HANDLED; > > + goto out; > > + } > > Two problems here: > > (1) If recovery fails, the link will *remain* down, so there'll be > no Link Up event. You've filtered the Link Down event, thus the > slot will remain in ON_STATE even though the device in the slot is > no longer accessible. That's not good, the slot should be brought > down in this case. Can you elaborate on why that is "not good" from the end user perspective? From a driver perspective the device driver context is lost and the card needs servicing. The service event starts a new cycle of slot-attention being triggered and that syncs the slot-down state at that time. > > (2) If recovery succeeds, there's a race where pciehp may call > is_dpc_reset_active() *after* dpc_reset_link() has finished. > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > will be cleared. Thus, the Link Up event is not filtered by pciehp > and the slot is brought down and back up even though DPC recovery > was succesful, which seems undesirable. The hotplug driver never saw the Link Down, so what does it do when the slot transitions from Link Up to Link Up? Do you mean the Link Down might fire after the dpc recovery has completed if the hotplug notification was delayed?
On Tue, Mar 16, 2021 at 10:08:31PM -0700, Dan Williams wrote: > On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > > + slot_name(ctrl)); > > > + ret = IRQ_HANDLED; > > > + goto out; > > > + } > > > > Two problems here: > > > > (1) If recovery fails, the link will *remain* down, so there'll be > > no Link Up event. You've filtered the Link Down event, thus the > > slot will remain in ON_STATE even though the device in the slot is > > no longer accessible. That's not good, the slot should be brought > > down in this case. > > Can you elaborate on why that is "not good" from the end user > perspective? From a driver perspective the device driver context is > lost and the card needs servicing. The service event starts a new > cycle of slot-attention being triggered and that syncs the slot-down > state at that time. All of pciehp's code assumes that if the link is down, the slot must be off. A slot which is in ON_STATE for a prolonged period of time even though the link is down is an oddity the code doesn't account for. If the link goes down, the slot should be brought into OFF_STATE. (It's okay though to delay bringdown until DPC recovery has completed unsuccessfully, which is what the patch I'm proposing does.) I don't understand what you mean by "service event". Someone unplugging and replugging the NVMe drive? > > (2) If recovery succeeds, there's a race where pciehp may call > > is_dpc_reset_active() *after* dpc_reset_link() has finished. > > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > > will be cleared. Thus, the Link Up event is not filtered by pciehp > > and the slot is brought down and back up even though DPC recovery > > was succesful, which seems undesirable. > > The hotplug driver never saw the Link Down, so what does it do when > the slot transitions from Link Up to Link Up? Do you mean the Link > Down might fire after the dpc recovery has completed if the hotplug > notification was delayed? If the Link Down is filtered and the Link Up is not, pciehp will bring down the slot and then bring it back up. That's because pciehp can't really tell whether a DLLSC event is Link Up or Link Down. It just knows that the link was previously up, is now up again, but must have been down intermittently, so transactions to the device in the slot may have been lost and the slot is therefore brought down for safety. Because the link is up, it is then brought back up. Thanks, Lukas
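For readers unfamiliar with pciehp's state machine, here is a condensed, illustrative sketch of the behavior Lukas describes. The real logic lives in pciehp_handle_presence_or_link_change() in drivers/pci/hotplug/pciehp_ctrl.c; the function below is simplified (state bookkeeping and locking omitted) and is not the upstream code.

```c
/*
 * Simplified sketch only: on a DLLSC event with the slot in ON_STATE,
 * pciehp assumes the link bounced, brings the slot down, and then --
 * because the link is currently up again -- brings it back up as if a
 * new card had been inserted.
 */
static void handle_dllsc_sketch(struct controller *ctrl)
{
	/* The link was down at some point: the device's state can't be trusted. */
	if (ctrl->state == ON_STATE)
		pciehp_disable_slot(ctrl, SURPRISE_REMOVAL);

	/* The link (or presence) is up again: treat it as a fresh hot-add. */
	if (pciehp_check_link_active(ctrl) > 0)
		pciehp_enable_slot(ctrl);
}
```

This is why a lone Link Up after a filtered Link Down still results in a bringdown/bringup cycle: pciehp reacts to "the link must have been down at some point", not to the edge direction of the event.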
On Tue, Mar 16, 2021 at 10:31 PM Lukas Wunner <lukas@wunner.de> wrote: > > On Tue, Mar 16, 2021 at 10:08:31PM -0700, Dan Williams wrote: > > On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > > > + slot_name(ctrl)); > > > > + ret = IRQ_HANDLED; > > > > + goto out; > > > > + } > > > > > > Two problems here: > > > > > > (1) If recovery fails, the link will *remain* down, so there'll be > > > no Link Up event. You've filtered the Link Down event, thus the > > > slot will remain in ON_STATE even though the device in the slot is > > > no longer accessible. That's not good, the slot should be brought > > > down in this case. > > > > Can you elaborate on why that is "not good" from the end user > > perspective? From a driver perspective the device driver context is > > lost and the card needs servicing. The service event starts a new > > cycle of slot-attention being triggered and that syncs the slot-down > > state at that time. > > All of pciehp's code assumes that if the link is down, the slot must be > off. A slot which is in ON_STATE for a prolonged period of time even > though the link is down is an oddity the code doesn't account for. > > If the link goes down, the slot should be brought into OFF_STATE. > (It's okay though to delay bringdown until DPC recovery has completed > unsuccessfully, which is what the patch I'm proposing does.) > > I don't understand what you mean by "service event". Someone unplugging > and replugging the NVMe drive? Yes, service meaning a technician physically removes the card. > > > > > (2) If recovery succeeds, there's a race where pciehp may call > > > is_dpc_reset_active() *after* dpc_reset_link() has finished. > > > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > > > will be cleared. Thus, the Link Up event is not filtered by pciehp > > > and the slot is brought down and back up even though DPC recovery > > > was succesful, which seems undesirable. > > > > The hotplug driver never saw the Link Down, so what does it do when > > the slot transitions from Link Up to Link Up? Do you mean the Link > > Down might fire after the dpc recovery has completed if the hotplug > > notification was delayed? > > If the Link Down is filtered and the Link Up is not, pciehp will > bring down the slot and then bring it back up. That's because pciehp > can't really tell whether a DLLSC event is Link Up or Link Down. > > It just knows that the link was previously up, is now up again, > but must have been down intermittently, so transactions to the > device in the slot may have been lost and the slot is therefore > brought down for safety. Because the link is up, it is then > brought back up. I wonder why we're not seeing that effect in testing?
Hi, On Wed, Mar 17, 2021 at 9:31 AM Dan Williams <dan.j.williams@intel.com> wrote: > > On Tue, Mar 16, 2021 at 10:31 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > On Tue, Mar 16, 2021 at 10:08:31PM -0700, Dan Williams wrote: > > > On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > > > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > > > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > > > > + slot_name(ctrl)); > > > > > + ret = IRQ_HANDLED; > > > > > + goto out; > > > > > + } > > > > > > > > Two problems here: > > > > > > > > (1) If recovery fails, the link will *remain* down, so there'll be > > > > no Link Up event. You've filtered the Link Down event, thus the > > > > slot will remain in ON_STATE even though the device in the slot is > > > > no longer accessible. That's not good, the slot should be brought > > > > down in this case. > > > > > > Can you elaborate on why that is "not good" from the end user > > > perspective? From a driver perspective the device driver context is > > > lost and the card needs servicing. The service event starts a new > > > cycle of slot-attention being triggered and that syncs the slot-down > > > state at that time. > > > > All of pciehp's code assumes that if the link is down, the slot must be > > off. A slot which is in ON_STATE for a prolonged period of time even > > though the link is down is an oddity the code doesn't account for. > > > > If the link goes down, the slot should be brought into OFF_STATE. > > (It's okay though to delay bringdown until DPC recovery has completed > > unsuccessfully, which is what the patch I'm proposing does.) > > > > I don't understand what you mean by "service event". Someone unplugging > > and replugging the NVMe drive? > > Yes, service meaning a technician physically removes the card. > > > > > > > > > (2) If recovery succeeds, there's a race where pciehp may call > > > > is_dpc_reset_active() *after* dpc_reset_link() has finished. > > > > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > > > > will be cleared. Thus, the Link Up event is not filtered by pciehp > > > > and the slot is brought down and back up even though DPC recovery > > > > was succesful, which seems undesirable. > > > > > > The hotplug driver never saw the Link Down, so what does it do when > > > the slot transitions from Link Up to Link Up? Do you mean the Link > > > Down might fire after the dpc recovery has completed if the hotplug > > > notification was delayed? > > > > If the Link Down is filtered and the Link Up is not, pciehp will > > bring down the slot and then bring it back up. That's because pciehp > > can't really tell whether a DLLSC event is Link Up or Link Down. > > > > It just knows that the link was previously up, is now up again, > > but must have been down intermittently, so transactions to the > > device in the slot may have been lost and the slot is therefore > > brought down for safety. Because the link is up, it is then > > brought back up. > > I wonder why we're not seeing that effect in testing? In our test case, there is a good chance that the LINK UP event is also filtered. We change the dpc_reset_active status only after we verify the link is up. So if hotplug handler handles the LINK UP event before we change the status of dpc_reset_active, then it will not lead to the issue mentioned by Lukas. 
 	if (!pcie_wait_for_link(pdev, true)) {
 		pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n");
-		return PCI_ERS_RESULT_DISCONNECT;
+		status = PCI_ERS_RESULT_DISCONNECT;
 	}

-	return PCI_ERS_RESULT_RECOVERED;
+	atomic_dec_return_release(&pdev->dpc_reset_active);
On Wed, Mar 17, 2021 at 10:20 AM Sathyanarayanan Kuppuswamy Natarajan <sathyanarayanan.nkuppuswamy@gmail.com> wrote: > > Hi, > > On Wed, Mar 17, 2021 at 9:31 AM Dan Williams <dan.j.williams@intel.com> wrote: > > > > On Tue, Mar 16, 2021 at 10:31 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > On Tue, Mar 16, 2021 at 10:08:31PM -0700, Dan Williams wrote: > > > > On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > > > > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > > > > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > > > > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > > > > > + slot_name(ctrl)); > > > > > > + ret = IRQ_HANDLED; > > > > > > + goto out; > > > > > > + } > > > > > > > > > > Two problems here: > > > > > > > > > > (1) If recovery fails, the link will *remain* down, so there'll be > > > > > no Link Up event. You've filtered the Link Down event, thus the > > > > > slot will remain in ON_STATE even though the device in the slot is > > > > > no longer accessible. That's not good, the slot should be brought > > > > > down in this case. > > > > > > > > Can you elaborate on why that is "not good" from the end user > > > > perspective? From a driver perspective the device driver context is > > > > lost and the card needs servicing. The service event starts a new > > > > cycle of slot-attention being triggered and that syncs the slot-down > > > > state at that time. > > > > > > All of pciehp's code assumes that if the link is down, the slot must be > > > off. A slot which is in ON_STATE for a prolonged period of time even > > > though the link is down is an oddity the code doesn't account for. > > > > > > If the link goes down, the slot should be brought into OFF_STATE. > > > (It's okay though to delay bringdown until DPC recovery has completed > > > unsuccessfully, which is what the patch I'm proposing does.) > > > > > > I don't understand what you mean by "service event". Someone unplugging > > > and replugging the NVMe drive? > > > > Yes, service meaning a technician physically removes the card. > > > > > > > > > > > > > (2) If recovery succeeds, there's a race where pciehp may call > > > > > is_dpc_reset_active() *after* dpc_reset_link() has finished. > > > > > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > > > > > will be cleared. Thus, the Link Up event is not filtered by pciehp > > > > > and the slot is brought down and back up even though DPC recovery > > > > > was succesful, which seems undesirable. > > > > > > > > The hotplug driver never saw the Link Down, so what does it do when > > > > the slot transitions from Link Up to Link Up? Do you mean the Link > > > > Down might fire after the dpc recovery has completed if the hotplug > > > > notification was delayed? > > > > > > If the Link Down is filtered and the Link Up is not, pciehp will > > > bring down the slot and then bring it back up. That's because pciehp > > > can't really tell whether a DLLSC event is Link Up or Link Down. > > > > > > It just knows that the link was previously up, is now up again, > > > but must have been down intermittently, so transactions to the > > > device in the slot may have been lost and the slot is therefore > > > brought down for safety. Because the link is up, it is then > > > brought back up. > > > > I wonder why we're not seeing that effect in testing? 
> > In our test case, there is a good chance that the LINK UP event is also > filtered. We change the dpc_reset_active status only after we verify > the link is up. So if hotplug handler handles the LINK UP event before > we change the status of dpc_reset_active, then it will not lead to the > issue mentioned by Lukas. > Ah, ok, we're missing a flush of the hotplug event handler after the link is up to make sure the hotplug handler does not see the Link Up. I'm not immediately seeing how the new proposal ensures that there is no Link Up event still in flight after DPC completes its work. Wouldn't it be required to throw away Link Up to Link Up transitions?
Hi, On Wed, Mar 17, 2021 at 10:45 AM Dan Williams <dan.j.williams@intel.com> wrote: > > On Wed, Mar 17, 2021 at 10:20 AM Sathyanarayanan Kuppuswamy Natarajan > <sathyanarayanan.nkuppuswamy@gmail.com> wrote: > > > > Hi, > > > > On Wed, Mar 17, 2021 at 9:31 AM Dan Williams <dan.j.williams@intel.com> wrote: > > > > > > On Tue, Mar 16, 2021 at 10:31 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > > > On Tue, Mar 16, 2021 at 10:08:31PM -0700, Dan Williams wrote: > > > > > On Tue, Mar 16, 2021 at 9:14 PM Lukas Wunner <lukas@wunner.de> wrote: > > > > > > > > > > > > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: > > > > > > > + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { > > > > > > > + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", > > > > > > > + slot_name(ctrl)); > > > > > > > + ret = IRQ_HANDLED; > > > > > > > + goto out; > > > > > > > + } > > > > > > > > > > > > Two problems here: > > > > > > > > > > > > (1) If recovery fails, the link will *remain* down, so there'll be > > > > > > no Link Up event. You've filtered the Link Down event, thus the > > > > > > slot will remain in ON_STATE even though the device in the slot is > > > > > > no longer accessible. That's not good, the slot should be brought > > > > > > down in this case. > > > > > > > > > > Can you elaborate on why that is "not good" from the end user > > > > > perspective? From a driver perspective the device driver context is > > > > > lost and the card needs servicing. The service event starts a new > > > > > cycle of slot-attention being triggered and that syncs the slot-down > > > > > state at that time. > > > > > > > > All of pciehp's code assumes that if the link is down, the slot must be > > > > off. A slot which is in ON_STATE for a prolonged period of time even > > > > though the link is down is an oddity the code doesn't account for. > > > > > > > > If the link goes down, the slot should be brought into OFF_STATE. > > > > (It's okay though to delay bringdown until DPC recovery has completed > > > > unsuccessfully, which is what the patch I'm proposing does.) > > > > > > > > I don't understand what you mean by "service event". Someone unplugging > > > > and replugging the NVMe drive? > > > > > > Yes, service meaning a technician physically removes the card. > > > > > > > > > > > > > > > > > (2) If recovery succeeds, there's a race where pciehp may call > > > > > > is_dpc_reset_active() *after* dpc_reset_link() has finished. > > > > > > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > > > > > > will be cleared. Thus, the Link Up event is not filtered by pciehp > > > > > > and the slot is brought down and back up even though DPC recovery > > > > > > was succesful, which seems undesirable. > > > > > > > > > > The hotplug driver never saw the Link Down, so what does it do when > > > > > the slot transitions from Link Up to Link Up? Do you mean the Link > > > > > Down might fire after the dpc recovery has completed if the hotplug > > > > > notification was delayed? > > > > > > > > If the Link Down is filtered and the Link Up is not, pciehp will > > > > bring down the slot and then bring it back up. That's because pciehp > > > > can't really tell whether a DLLSC event is Link Up or Link Down. 
> > > > > > > > It just knows that the link was previously up, is now up again, > > > > but must have been down intermittently, so transactions to the > > > > device in the slot may have been lost and the slot is therefore > > > > brought down for safety. Because the link is up, it is then > > > > brought back up. > > > > > > I wonder why we're not seeing that effect in testing? > > > > In our test case, there is a good chance that the LINK UP event is also > > filtered. We change the dpc_reset_active status only after we verify > > the link is up. So if hotplug handler handles the LINK UP event before > > we change the status of dpc_reset_active, then it will not lead to the > > issue mentioned by Lukas. > > > > Ah, ok, we're missing a flush of the hotplug event handler after the > link is up to make sure the hotplug handler does not see the Link Up. Flush of hotplug event after successful recovery, and a simulated hotplug link down event after link recovery fails should solve the problems raised by Lukas. I assume Lukas' proposal adds this support. I will check his patch shortly. > I'm not immediately seeing how the new proposal ensures that there is > no Link Up event still in flight after DPC completes its work. > Wouldn't it be required to throw away Link Up to Link Up transitions?
On Wed, Mar 17, 2021 at 10:54:09AM -0700, Sathyanarayanan Kuppuswamy Natarajan wrote:
> Flush of hotplug event after successful recovery, and a simulated
> hotplug link down event after link recovery fails should solve the
> problems raised by Lukas. I assume Lukas' proposal adds this support.
> I will check his patch shortly.

Thank you!

I'd like to get a better understanding of the issues around hotplug/DPC, specifically I'm wondering:

If DPC recovery was successful, what is the desired behavior by pciehp: should it ignore the Link Down/Up, or bring the slot down and back up after DPC recovery?

If the events are ignored, the driver of the device in the hotplug slot is not unbound and rebound. So the driver must be able to cope with loss of TLPs during DPC recovery, and it must be able to cope with whatever state the endpoint device is in after DPC recovery. Is this really safe? How does the nvme driver deal with it?

Also, if DPC is handled by firmware, your patch does not ignore the Link Down/Up events, so pciehp brings down the slot when DPC is triggered, then brings it up after successful recovery. In a code comment, you write that this behavior is okay because there's "no race between hotplug and DPC recovery". However, Sinan wrote in 2018 that one of the issues with hotplug versus DPC is that pciehp may turn off slot power and thereby foil DPC recovery. (Power off = cold reset, whereas DPC recovery = warm reset.) This can occur as well if DPC is handled by firmware.

So I guess pciehp should make an attempt to await DPC recovery even if it's handled by firmware? Or am I missing something? We may be able to achieve that by polling the DPC Trigger Status bit and the DLLLA bit, but it won't work as perfectly as with native DPC support.

Finally, you write in your commit message that there are "a lot of stability issues" if pciehp and DPC are allowed to recover freely without proper serialization. What are these issues exactly? (Beyond the slot power issue mentioned above, and that the endpoint device's driver should presumably not be unbound if DPC recovery was successful.)

Thanks!

Lukas
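For illustration, the polling Lukas floats for firmware-handled DPC could look roughly like the sketch below. This is purely hypothetical (no such helper exists in the tree, and the 4-second timeout is an arbitrary choice); it only relies on the standard DPC Status and Link Status registers.

```c
/*
 * Hypothetical sketch: poll until firmware-handled DPC recovery appears
 * complete, i.e. the DPC Trigger Status bit is clear and Data Link Layer
 * Link Active is set again, or give up after a timeout.
 */
static bool dpc_poll_recovery_done(struct pci_dev *pdev)
{
	unsigned long timeout = jiffies + msecs_to_jiffies(4000);
	u16 dpc_status, lnksta;

	do {
		pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS,
				     &dpc_status);
		pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lnksta);
		if (!(dpc_status & PCI_EXP_DPC_STATUS_TRIGGER) &&
		    (lnksta & PCI_EXP_LNKSTA_DLLLA))
			return true;
		msleep(20);
	} while (time_before(jiffies, timeout));

	return false;
}
```

As Lukas notes, polling like this is inherently less precise than the native case, where the OS knows exactly when dpc_reset_link() has finished.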
On Wed, Mar 17, 2021 at 10:45:21AM -0700, Dan Williams wrote: > Ah, ok, we're missing a flush of the hotplug event handler after the > link is up to make sure the hotplug handler does not see the Link Up. > I'm not immediately seeing how the new proposal ensures that there is > no Link Up event still in flight after DPC completes its work. > Wouldn't it be required to throw away Link Up to Link Up transitions? If you look at the new code added to pciehp_ist() by my patch... atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); if (pciehp_check_link_active(ctrl) > 0) events &= ~PCI_EXP_SLTSTA_DLLSC; ... the atomic_and() ignores the Link Up event which was picked up by the hardirq handler pciehp_isr() while pciehp_ist() waited for link recovery. Afterwards, the Link Down event is only ignored if the link is still up: If the link has gone down again before the call to pciehp_check_link_active(), that event is honored immediately (because the DLLSC event is then not filtered). If it goes down after the call, that event will be picked up by pciehp_isr(). Thus, only the DLLSC events caused by DPC are ignored, but no others. A DLLSC event caused by surprise removal during DPC may be incorrectly ignored, but the expectation is that the ensuing Presence Detect Changed event will still cause bringdown of the slot after DPC has completed. Hardware does exist which erroneously hardwires Presence Detect to zero, but that's rare and DPC-capable systems are likely not affected. I've realized today that I forgot to add a "synchronize_hardirq(irq);" before the call to atomic_and(). Sorry, that will be fixed in the next iteration. Thanks, Lukas
On Wed, Mar 17, 2021 at 08:09:52PM +0100, Lukas Wunner wrote: > On Wed, Mar 17, 2021 at 10:45:21AM -0700, Dan Williams wrote: > > Ah, ok, we're missing a flush of the hotplug event handler after the > > link is up to make sure the hotplug handler does not see the Link Up. > > I'm not immediately seeing how the new proposal ensures that there is > > no Link Up event still in flight after DPC completes its work. > > Wouldn't it be required to throw away Link Up to Link Up transitions? > > If you look at the new code added to pciehp_ist() by my patch... > > atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); > if (pciehp_check_link_active(ctrl) > 0) > events &= ~PCI_EXP_SLTSTA_DLLSC; When you have a Surprise Link Down and without any DPC, the link trains back up. Aren't we expected to take the slot down and add it as if a remove and add happens? without this change if slot-status == ON_STATE, DLLSC means we would power the slot off. Then we check link_active and bring the slot back on isn't it?
On Wed, Mar 17, 2021 at 12:22:41PM -0700, Raj, Ashok wrote: > On Wed, Mar 17, 2021 at 08:09:52PM +0100, Lukas Wunner wrote: > > On Wed, Mar 17, 2021 at 10:45:21AM -0700, Dan Williams wrote: > > > Ah, ok, we're missing a flush of the hotplug event handler after the > > > link is up to make sure the hotplug handler does not see the Link Up. > > > I'm not immediately seeing how the new proposal ensures that there is > > > no Link Up event still in flight after DPC completes its work. > > > Wouldn't it be required to throw away Link Up to Link Up transitions? > > > > If you look at the new code added to pciehp_ist() by my patch... > > > > atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); > > if (pciehp_check_link_active(ctrl) > 0) > > events &= ~PCI_EXP_SLTSTA_DLLSC; > > When you have a Surprise Link Down and without any DPC, the link trains > back up. Aren't we expected to take the slot down and add it as if a remove > and add happens? > > without this change if slot-status == ON_STATE, DLLSC means we would power > the slot off. Then we check link_active and bring the slot back on isn't > it? Yes to both questions. That behavior should still be the same with the patch. Do you think it's not?
On 3/17/21 12:01 PM, Lukas Wunner wrote:
> On Wed, Mar 17, 2021 at 10:54:09AM -0700, Sathyanarayanan Kuppuswamy Natarajan wrote:
>> Flush of hotplug event after successful recovery, and a simulated
>> hotplug link down event after link recovery fails should solve the
>> problems raised by Lukas. I assume Lukas' proposal adds this support.
>> I will check his patch shortly.
>
> Thank you!
>
> I'd like to get a better understanding of the issues around hotplug/DPC,
> specifically I'm wondering:
>
> If DPC recovery was successful, what is the desired behavior by pciehp,
> should it ignore the Link Down/Up or bring the slot down and back up
> after DPC recovery?
>
> If the events are ignored, the driver of the device in the hotplug slot
> is not unbound and rebound. So the driver must be able to cope with
> loss of TLPs during DPC recovery and it must be able to cope with
> whatever state the endpoint device is in after DPC recovery.
> Is this really safe? How does the nvme driver deal with it?

During DPC recovery, in the pcie_do_recovery() function, we use report_frozen_detected() to notify all devices attached to the port about the fatal error. After this notification, we expect all affected devices to halt their IO transactions.

Regarding state restoration, after successful recovery we use report_slot_reset() to notify about the slot/link reset. So device drivers are expected to restore the device to a working state after this notification.

> Also, if DPC is handled by firmware, your patch does not ignore the
> Link Down/Up events,

Only for the pure firmware model. For the EDR case, we still ignore the Link Down/Up events.

> so pciehp brings down the slot when DPC is
> triggered, then brings it up after successful recovery. In a code
> comment, you write that this behavior is okay because there's "no
> race between hotplug and DPC recovery".

My point is, there is no race in the OS handlers (pciehp_ist() vs pcie_do_recovery()).

> However, Sinan wrote in
> 2018 that one of the issues with hotplug versus DPC is that pciehp
> may turn off slot power and thereby foil DPC recovery. (Power off =
> cold reset, whereas DPC recovery = warm reset.) This can occur
> as well if DPC is handled by firmware.

I am not sure how pure firmware DPC recovery works. Is there a platform which uses this combination? For the firmware DPC model, the spec does not clarify the following points:

1. Who will notify the affected device drivers to halt the IO transactions.
2. Who is responsible for restoring the state of the device after link reset.

IMO, pure firmware DPC does not support seamless recovery. I think after it clears the DPC trigger status, it might expect the hotplug handler to be responsible for device recovery. I don't want to add a fix to a code path that I don't understand. This is the reason for not extending this logic to the pure firmware DPC case.

> So I guess pciehp should make an attempt to await DPC recovery even
> if it's handled by firmware? Or am I missing something? We may be
> able to achieve that by polling the DPC Trigger Status bit and
> DLLLA bit, but it won't work as perfectly as with native DPC support.
>
> Finally, you write in your commit message that there are "a lot of
> stability issues" if pciehp and DPC are allowed to recover freely
> without proper serialization. What are these issues exactly?

In most cases, I see failure of the DPC recovery handler (you can see the example dmesg in the commit log). Other than this, we also noticed some extended delay or failure in link retraining (while waiting for Link Up in pcie_wait_for_link(pdev, true)). In some cases, we noticed slot power-on failures, and the card will not be detected when running lspci.

The above cases are just observations we have made; we did not dig deeper into why the race between pciehp and DPC leads to such issues.

> (Beyond the slot power issue mentioned above, and that the endpoint
> device's driver should presumably not be unbound if DPC recovery
> was successful.)
>
> Thanks!
>
> Lukas
>
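For reference, the driver-side contract Sathyanarayanan describes above (halt I/O when the error is reported, restore state after the link reset) maps onto the standard pci_error_handlers callbacks. Below is a minimal sketch for a hypothetical endpoint driver "foo"; foo_quiesce() and foo_restart_io() are made-up helpers standing in for driver-specific teardown and restart.

```c
/* Sketch of the error-handling callbacks pcie_do_recovery() invokes. */
static pci_ers_result_t foo_error_detected(struct pci_dev *pdev,
					   pci_channel_state_t state)
{
	/* Stop issuing new MMIO/DMA; the link is contained by DPC. */
	foo_quiesce(pci_get_drvdata(pdev));		/* hypothetical helper */

	return state == pci_channel_io_perm_failure ?
	       PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t foo_slot_reset(struct pci_dev *pdev)
{
	/* Re-program config space and device state lost across the reset. */
	pci_restore_state(pdev);
	return PCI_ERS_RESULT_RECOVERED;
}

static void foo_resume(struct pci_dev *pdev)
{
	/* Recovery succeeded: restart I/O. */
	foo_restart_io(pci_get_drvdata(pdev));		/* hypothetical helper */
}

static const struct pci_error_handlers foo_err_handlers = {
	.error_detected	= foo_error_detected,
	.slot_reset	= foo_slot_reset,
	.resume		= foo_resume,
};
```

A driver that implements these callbacks can survive DPC recovery without being unbound; one that does not (as in the "no error_detected callback" line of the dmesg log above) cannot.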
On 3/17/2021 4:02 PM, Kuppuswamy, Sathyanarayanan wrote:
> My point is, there is no race in OS handlers (pciehp_ist() vs
> pcie_do_recovery())
>
>> However, Sinan wrote in
>> 2018 that one of the issues with hotplug versus DPC is that pciehp
>> may turn off slot power and thereby foil DPC recovery. (Power off =
>> cold reset, whereas DPC recovery = warm reset.) This can occur
>> as well if DPC is handled by firmware.

It has been a while... If I remember correctly, there is no race condition if the platform handles the DPC and HP interrupts on the same MSI vector. If the HP and DPC interrupts are handled as separate MSI-X interrupts, they can fire out of order and cause problems for each other.

I hope it helps.
On 3/16/21 9:13 PM, Lukas Wunner wrote: > On Fri, Mar 12, 2021 at 07:32:08PM -0800, sathyanarayanan.kuppuswamy@linux.intel.com wrote: >> + if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) { >> + ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n", >> + slot_name(ctrl)); >> + ret = IRQ_HANDLED; >> + goto out; >> + } > > Two problems here: > > (1) If recovery fails, the link will *remain* down, so there'll be > no Link Up event. You've filtered the Link Down event, thus the > slot will remain in ON_STATE even though the device in the slot is > no longer accessible. That's not good, the slot should be brought > down in this case. > > (2) If recovery succeeds, there's a race where pciehp may call > is_dpc_reset_active() *after* dpc_reset_link() has finished. > So both the DPC Trigger Status bit as well as pdev->dpc_reset_active > will be cleared. Thus, the Link Up event is not filtered by pciehp > and the slot is brought down and back up even though DPC recovery > was succesful, which seems undesirable. > > The only viable solution I see is to wait until DPC has completed. > Sinan (+cc) proposed something along those lines a couple years back: > > https://patchwork.ozlabs.org/project/linux-pci/patch/20180818065126.77912-1-okaya@kernel.org/ > > Included below please find my suggestion for how to fix this. > I've sort of combined yours and Sinan's approach, but I'm > using a waitqueue (Sinan used polling) and I'm using atomic bitops > on pdev->priv_flags (you're using an atomic_t instead, which needs > additionally space in struct pci_dev). Note: It's compile-tested > only, I don't have any DPC-capable hardware at my disposal. > > Would this work for you? If so, I can add a commit message to the > patch and submit it properly. Let me know what you think. Thanks! > > -- >8 -- > Subject: [PATCH] PCI: pciehp: Ignore Link Down/Up caused by DPC > > Signed-off-by: Lukas Wunner <lukas@wunner.de> > --- > drivers/pci/hotplug/pciehp_hpc.c | 11 +++++++++ > drivers/pci/pci.h | 4 ++++ > drivers/pci/pcie/dpc.c | 51 ++++++++++++++++++++++++++++++++++++++-- > 3 files changed, 64 insertions(+), 2 deletions(-) > > diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c > index fb3840e..bcc018e 100644 > --- a/drivers/pci/hotplug/pciehp_hpc.c > +++ b/drivers/pci/hotplug/pciehp_hpc.c > @@ -707,6 +707,17 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) > } > > /* > + * Ignore Link Down/Up caused by Downstream Port Containment > + * if recovery from the error succeeded. > + */ > + if ((events & PCI_EXP_SLTSTA_DLLSC) && pci_dpc_recovered(pdev) && > + ctrl->state == ON_STATE) { > + atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); Why modify pending_events here. It should be already be zero right? > + if (pciehp_check_link_active(ctrl) > 0) > + events &= ~PCI_EXP_SLTSTA_DLLSC; > + } > + > + /* > * Disable requests have higher priority than Presence Detect Changed > * or Data Link Layer State Changed events. 
> */ > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 9684b46..e5ae5e8 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -392,6 +392,8 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) > > /* pci_dev priv_flags */ > #define PCI_DEV_ADDED 0 > +#define PCI_DPC_RECOVERED 1 > +#define PCI_DPC_RECOVERING 2 > > static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) > { > @@ -446,10 +448,12 @@ struct rcec_ea { > void pci_dpc_init(struct pci_dev *pdev); > void dpc_process_error(struct pci_dev *pdev); > pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); > +bool pci_dpc_recovered(struct pci_dev *pdev); > #else > static inline void pci_save_dpc_state(struct pci_dev *dev) {} > static inline void pci_restore_dpc_state(struct pci_dev *dev) {} > static inline void pci_dpc_init(struct pci_dev *pdev) {} > +static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } > #endif > > #ifdef CONFIG_PCIEPORTBUS > diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c > index e05aba8..7328d9c4 100644 > --- a/drivers/pci/pcie/dpc.c > +++ b/drivers/pci/pcie/dpc.c > @@ -71,6 +71,44 @@ void pci_restore_dpc_state(struct pci_dev *dev) > pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap); > } > > +static bool dpc_completed(struct pci_dev *pdev) > +{ > + u16 status; > + > + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status); > + if (status & PCI_EXP_DPC_STATUS_TRIGGER) > + return false; > + > + if (test_bit(PCI_DPC_RECOVERING, &pdev->priv_flags)) > + return false; > + > + return true; > +} > + > +static DECLARE_WAIT_QUEUE_HEAD(dpc_completed_waitqueue); > + > +bool pci_dpc_recovered(struct pci_dev *pdev) > +{ > + struct pci_host_bridge *host; > + > + if (!pdev->dpc_cap) > + return false; > + > + /* > + * If DPC is owned by firmware and EDR is not supported, there is > + * no race between hotplug and DPC recovery handler. So return > + * false. > + */ > + host = pci_find_host_bridge(pdev->bus); > + if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR)) > + return false; > + > + wait_event_timeout(dpc_completed_waitqueue, dpc_completed(pdev), > + msecs_to_jiffies(5000)); > + > + return test_and_clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); > +} > + > static int dpc_wait_rp_inactive(struct pci_dev *pdev) > { > unsigned long timeout = jiffies + HZ; > @@ -91,8 +129,12 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev) > > pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) > { > + pci_ers_result_t ret; > u16 cap; > > + clear_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); > + set_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); > + > /* > * DPC disables the Link automatically in hardware, so it has > * already been reset by the time we get here. > @@ -114,10 +156,15 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev) > > if (!pcie_wait_for_link(pdev, true)) { > pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n"); > - return PCI_ERS_RESULT_DISCONNECT; > + ret = PCI_ERS_RESULT_DISCONNECT; > + } else { > + set_bit(PCI_DPC_RECOVERED, &pdev->priv_flags); > + ret = PCI_ERS_RESULT_RECOVERED; > } > > - return PCI_ERS_RESULT_RECOVERED; > + clear_bit(PCI_DPC_RECOVERING, &pdev->priv_flags); > + wake_up_all(&dpc_completed_waitqueue); > + return ret; > } > > static void dpc_process_rp_pio_error(struct pci_dev *pdev) >
On Sat, Mar 27, 2021 at 10:49:45PM -0700, Kuppuswamy, Sathyanarayanan wrote: > On 3/16/21 9:13 PM, Lukas Wunner wrote: > > --- a/drivers/pci/hotplug/pciehp_hpc.c > > +++ b/drivers/pci/hotplug/pciehp_hpc.c > > @@ -707,6 +707,17 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) > > } > > /* > > + * Ignore Link Down/Up caused by Downstream Port Containment > > + * if recovery from the error succeeded. > > + */ > > + if ((events & PCI_EXP_SLTSTA_DLLSC) && pci_dpc_recovered(pdev) && > > + ctrl->state == ON_STATE) { > > + atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events); > > Why modify pending_events here. It should be already be zero right? "pending_events" is expected to contain the Link Up event after successful recovery, whereas "events" contains the Link Down event (if DPC was triggered). pciehp is structured around the generic irq core's separation of hardirq handler (runs in interrupt context) and irq thread (runs in task context). The hardirq handler pciehp_isr() picks up events from the Slot Status register and stores them in "pending_events" for later consumption by the irq thread pciehp_ist(). The irq thread performs long running tasks such as slot bringup and bringdown. The irq thread is also allowed to sleep. While pciehp_ist() awaits completion of DPC recovery, a DLLSC event will be picked up by pciehp_isr() which is caused by link retraining. That event is contained in "pending_events", so after successful recovery, pciehp_ist() can just delete it. Thanks, Lukas
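The hardirq/irq-thread split Lukas describes is the generic threaded-interrupt pattern. A stripped-down sketch follows; the names are illustrative, not the actual pciehp code, and demo_read_and_clear_events() is a hypothetical hardware accessor.

```c
#include <linux/interrupt.h>
#include <linux/atomic.h>

struct demo_ctrl {
	atomic_t pending_events;	/* bits produced by the hardirq handler */
};

static u32 demo_read_and_clear_events(struct demo_ctrl *ctrl);	/* hypothetical */

static irqreturn_t demo_isr(int irq, void *data)	/* hardirq context */
{
	struct demo_ctrl *ctrl = data;
	u32 events = demo_read_and_clear_events(ctrl);

	if (!events)
		return IRQ_NONE;

	atomic_or(events, &ctrl->pending_events);
	return IRQ_WAKE_THREAD;		/* run demo_ist() in task context */
}

static irqreturn_t demo_ist(int irq, void *data)	/* irq thread, may sleep */
{
	struct demo_ctrl *ctrl = data;
	u32 events = atomic_xchg(&ctrl->pending_events, 0);

	/* long-running work such as slot bringup/bringdown, based on "events" */
	return IRQ_HANDLED;
}

/*
 * Registration, e.g. in probe():
 *	request_threaded_irq(irq, demo_isr, demo_ist, IRQF_SHARED, "demo", ctrl);
 */
```

Because the hardirq handler keeps running while the thread sleeps (here, while pciehp_ist() waits for DPC recovery), any event that arrives in the meantime lands in pending_events, which is exactly what the atomic_and() in the patch cleans up afterwards.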
On Wed, Mar 17, 2021 at 01:02:07PM -0700, Kuppuswamy, Sathyanarayanan wrote: > On 3/17/21 12:01 PM, Lukas Wunner wrote: > > If the events are ignored, the driver of the device in the hotplug slot > > is not unbound and rebound. So the driver must be able to cope with > > loss of TLPs during DPC recovery and it must be able to cope with > > whatever state the endpoint device is in after DPC recovery. > > Is this really safe? How does the nvme driver deal with it? > > During DPC recovery, in pcie_do_recovery() function, we use > report_frozen_detected() to notify all devices attached to the port > about the fatal error. After this notification, we expect all > affected devices to halt its IO transactions. > > Regarding state restoration, after successful recovery, we use > report_slot_reset() to notify about the slot/link reset. So device > drivers are expected to restore the device to working state after this > notification. Thanks a lot for the explanation. > I am not sure how pure firmware DPC recovery works. Is there a platform > which uses this combination? For firmware DPC model, spec does not clarify > following points. > > 1. Who will notify the affected device drivers to halt the IO transactions. > 2. Who is responsible to restore the state of the device after link reset. > > IMO, pure firmware DPC does not support seamless recovery. I think after it > clears the DPC trigger status, it might expect hotplug handler be responsible > for device recovery. > > I don't want to add fix to the code path that I don't understand. This is the > reason for extending this logic to pure firmware DPC case. I agree, let's just declare synchronization of pciehp with pure firmware DPC recovery as unsupported for now. I've just submitted a refined version of my patch to the list: https://lore.kernel.org/linux-pci/b70e19324bbdded90b728a5687aa78dc17c20306.1616921228.git.lukas@wunner.de/ If you could give this new version a whirl I'd be grateful. This version contains more code comments and kernel-doc. There's now a check in dpc_completed() whether the DPC Status register contains "all ones", which can happen when a DPC-capable hotplug port is hot-removed, i.e. for cascaded DPC-capable hotplug ports. I've also realized that the previous version was prone to races which are theoretical but should nonetheless be avoided: E.g., previously the DLLSC event was only removed from "events" if the link is still up after DPC recovery. However if DPC triggers and recovers multiple times in a row, the link may happen to be up but a new DLLSC event may have been picked up in "pending_events" which should be ignored. I've solved this by inverting the logic such that DLLSC is *always* removed from "events", and if the link is unexpectedly *down* after successful recovery, a DLLSC event is synthesized. Thanks, Lukas
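A rough sketch of the inverted logic described in the last paragraph: always drop the DPC-induced DLLSC, then re-inject one if the link is unexpectedly down afterwards. This paraphrases the approach described above; it is not copied from the linked v2 patch.

```c
	/* Sketch only -- see the linked v2 patch for the real version. */
	if ((events & PCI_EXP_SLTSTA_DLLSC) && ctrl->state == ON_STATE &&
	    pci_dpc_recovered(pdev)) {
		events &= ~PCI_EXP_SLTSTA_DLLSC;	/* always ignore DPC's DLLSC */

		synchronize_hardirq(irq);
		atomic_and(~PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events);

		if (pciehp_check_link_active(ctrl) <= 0) {
			/*
			 * Recovery was reported successful but the link is
			 * down again (e.g. the card was yanked meanwhile):
			 * synthesize a DLLSC event so the slot is brought
			 * down through the normal path.
			 */
			atomic_or(PCI_EXP_SLTSTA_DLLSC, &ctrl->pending_events);
			irq_wake_thread(irq, ctrl);
		}
	}
```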
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index fb3840e222ad..55da5208c7e5 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -691,6 +691,25 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id)
 		goto out;
 	}
 
+	/*
+	 * If the DLLSC link up/down event is generated due to DPC containment
+	 * in the PCIe port, skip the DLLSC event handling and let the DPC
+	 * driver own the port recovery. Allowing both hotplug DLLSC event
+	 * handler and DPC event trigger handler to attempt recovery on the
+	 * same port leads to stability issues. If DPC recovery is successful,
+	 * is_dpc_reset_active() will return false and the hotplug handler will
+	 * not suppress the DLLSC event. If DPC recovery fails and the link is
+	 * left in disabled state, once the user changes the faulty card, the
+	 * hotplug handler can still handle the PRESENCE change event and bring
+	 * the device back up.
+	 */
+	if ((events == PCI_EXP_SLTSTA_DLLSC) && is_dpc_reset_active(pdev)) {
+		ctrl_info(ctrl, "Slot(%s): DLLSC event(DPC), skipped\n",
+			  slot_name(ctrl));
+		ret = IRQ_HANDLED;
+		goto out;
+	}
+
 	/* Check Attention Button Pressed */
 	if (events & PCI_EXP_SLTSTA_ABP) {
 		ctrl_info(ctrl, "Slot(%s): Attention button pressed\n",
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index ef7c4661314f..cee7095483bd 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -446,10 +446,12 @@ void pci_restore_dpc_state(struct pci_dev *dev);
 void pci_dpc_init(struct pci_dev *pdev);
 void dpc_process_error(struct pci_dev *pdev);
 pci_ers_result_t dpc_reset_link(struct pci_dev *pdev);
+bool is_dpc_reset_active(struct pci_dev *pdev);
 #else
 static inline void pci_save_dpc_state(struct pci_dev *dev) {}
 static inline void pci_restore_dpc_state(struct pci_dev *dev) {}
 static inline void pci_dpc_init(struct pci_dev *pdev) {}
+static inline bool is_dpc_reset_active(struct pci_dev *pdev) { return false; }
 #endif
 
 #ifdef CONFIG_PCIEPORTBUS
diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index e05aba86a317..9157d70ebe21 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -71,6 +71,33 @@ void pci_restore_dpc_state(struct pci_dev *dev)
 	pci_write_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_CTL, *cap);
 }
 
+bool is_dpc_reset_active(struct pci_dev *dev)
+{
+	struct pci_host_bridge *host = pci_find_host_bridge(dev->bus);
+	u16 status;
+
+	if (!dev->dpc_cap)
+		return false;
+
+	/*
+	 * If DPC is owned by firmware and EDR is not supported, there is
+	 * no race between hotplug and DPC recovery handler. So return
+	 * false.
+	 */
+	if (!host->native_dpc && !IS_ENABLED(CONFIG_PCIE_EDR))
+		return false;
+
+	if (atomic_read_acquire(&dev->dpc_reset_active))
+		return true;
+
+	pci_read_config_word(dev, dev->dpc_cap + PCI_EXP_DPC_STATUS, &status);
+
+	if (status & PCI_EXP_DPC_STATUS_TRIGGER)
+		return true;
+
+	return false;
+}
+
 static int dpc_wait_rp_inactive(struct pci_dev *pdev)
 {
 	unsigned long timeout = jiffies + HZ;
@@ -91,6 +118,7 @@ static int dpc_wait_rp_inactive(struct pci_dev *pdev)
 
 pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 {
+	pci_ers_result_t status = PCI_ERS_RESULT_RECOVERED;
 	u16 cap;
 
 	/*
@@ -109,15 +137,19 @@ pci_ers_result_t dpc_reset_link(struct pci_dev *pdev)
 	if (pdev->dpc_rp_extensions && dpc_wait_rp_inactive(pdev))
 		return PCI_ERS_RESULT_DISCONNECT;
 
+	atomic_inc_return_acquire(&pdev->dpc_reset_active);
+
 	pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
 			      PCI_EXP_DPC_STATUS_TRIGGER);
 
 	if (!pcie_wait_for_link(pdev, true)) {
 		pci_info(pdev, "Data Link Layer Link Active not set in 1000 msec\n");
-		return PCI_ERS_RESULT_DISCONNECT;
+		status = PCI_ERS_RESULT_DISCONNECT;
 	}
 
-	return PCI_ERS_RESULT_RECOVERED;
+	atomic_dec_return_release(&pdev->dpc_reset_active);
+
+	return status;
 }
 
 static void dpc_process_rp_pio_error(struct pci_dev *pdev)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 86c799c97b77..3314f616520d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -479,6 +479,7 @@ struct pci_dev {
 	u16		dpc_cap;
 	unsigned int	dpc_rp_extensions:1;
 	u8		dpc_rp_log_size;
+	atomic_t	dpc_reset_active;	/* DPC trigger is active */
 #endif
 #ifdef CONFIG_PCI_ATS
 	union {