
[3/6] nvme: Move all IO out of controller reset

Message ID 20180518163823.27820-3-keith.busch@intel.com (mailing list archive)
State New, archived

Commit Message

Keith Busch May 18, 2018, 4:38 p.m. UTC
IO may be retryable, so don't wait for it in the reset path. A command
that expires without a completion may itself trigger a reset, which
places it on the requeue list; waiting for such IO would then deadlock
the reset handler.

To fix the theoretical deadlock, this patch unblocks IO submission from
the reset_work as before, but moves the waiting to the IO-safe scan_work
so that the reset_work may proceed to completion. Since the unfreezing
now happens in the controller LIVE state, the nvme device has to track
whether the queues were frozen to prevent incorrect freeze depths.

This patch also renames the function 'nvme_dev_add' to a name that
better describes what it actually does: nvme_alloc_io_tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/core.c |  3 +++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
 3 files changed, 37 insertions(+), 13 deletions(-)
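
To make the freeze-depth concern above concrete, here is a toy,
userspace-only sketch (an analogy added here, not code from this
patch): blk-mq freezing is counted, so a queue only thaws once every
start-freeze has been matched by exactly one unfreeze, and the
queues_froze flag keeps a repeated disable from adding a freeze that
nothing will ever pair.

#include <stdio.h>
#include <stdbool.h>

static int freeze_depth;        /* stands in for the queue's freeze count */
static bool queues_froze;       /* the flag this patch adds to nvme_dev */

static void start_freeze(void) { freeze_depth++; }
static void unfreeze(void)     { freeze_depth--; }

static void dev_disable(void)
{
	/* Without this guard, a second disable before the scan-time
	 * unfreeze would leave the depth permanently above zero. */
	if (!queues_froze) {
		start_freeze();
		queues_froze = true;
	}
}

static void update_hw_ctx(void)
{
	if (queues_froze) {
		unfreeze();
		queues_froze = false;
	}
}

int main(void)
{
	dev_disable();          /* 1st timeout: controller disabled */
	dev_disable();          /* 2nd disable before scan_work ran */
	update_hw_ctx();        /* one unfreeze still restores depth 0 */
	printf("freeze depth: %d (0 means unfrozen)\n", freeze_depth);
	return 0;
}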

Comments

Ming Lei May 18, 2018, 11:03 p.m. UTC | #1
On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> IO may be retryable, so don't wait for them in the reset path. These
> commands may trigger a reset if that IO expires without a completion,
> placing it on the requeue list. Waiting for these would then deadlock
> the reset handler.
> 
> To fix the theoretical deadlock, this patch unblocks IO submission from
> the reset_work as before, but moves the waiting to the IO safe scan_work
> so that the reset_work may proceed to completion. Since the unfreezing
> happens in the controller LIVE state, the nvme device has to track if
> the queues were frozen now to prevent incorrect freeze depths.
> 
> This patch is also renaming the function 'nvme_dev_add' to a
> more appropriate name that describes what it's actually doing:
> nvme_alloc_io_tags.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/nvme/host/core.c |  3 +++
>  drivers/nvme/host/nvme.h |  1 +
>  drivers/nvme/host/pci.c  | 46 +++++++++++++++++++++++++++++++++-------------
>  3 files changed, 37 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 1de68b56b318..34d7731f1419 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -214,6 +214,7 @@ static inline bool nvme_req_needs_retry(struct request *req)
>  	if (blk_noretry_request(req))
>  		return false;
>  	if (nvme_req(req)->status & NVME_SC_DNR)
> +
>  		return false;
>  	if (nvme_req(req)->retries >= nvme_max_retries)
>  		return false;
> @@ -3177,6 +3178,8 @@ static void nvme_scan_work(struct work_struct *work)
>  	struct nvme_id_ctrl *id;
>  	unsigned nn;
>  
> +	if (ctrl->ops->update_hw_ctx)
> +		ctrl->ops->update_hw_ctx(ctrl);
>  	if (ctrl->state != NVME_CTRL_LIVE)
>  		return;
>  
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index c15c2ee7f61a..230c5424b197 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -320,6 +320,7 @@ struct nvme_ctrl_ops {
>  	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
>  	int (*reinit_request)(void *data, struct request *rq);
>  	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
> +	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
>  };
>  
>  #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 2bd9d84f58d0..6a7cbc631d92 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -99,6 +99,7 @@ struct nvme_dev {
>  	u32 cmbloc;
>  	struct nvme_ctrl ctrl;
>  	struct completion ioq_wait;
> +	bool queues_froze;
>  
>  	/* shadow doorbell buffer support: */
>  	u32 *dbbuf_dbs;
> @@ -2065,10 +2066,33 @@ static void nvme_disable_io_queues(struct nvme_dev *dev)
>  	}
>  }
>  
> +static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_dev *dev = to_nvme_dev(ctrl);
> +	bool unfreeze;
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	unfreeze = dev->queues_froze;
> +	mutex_unlock(&dev->shutdown_lock);

What if nvme_dev_disable() just sets the .queues_froze flag and
userspace sends a RESCAN command at the same time?

> +
> +	if (unfreeze)
> +		nvme_wait_freeze(&dev->ctrl);
> +

A timeout may come just before or during blk_mq_update_nr_hw_queues() or
the above nvme_wait_freeze(); then either of them may hang forever.

> +	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
> +	nvme_free_queues(dev, dev->online_queues);
> +
> +	if (unfreeze)
> +		nvme_unfreeze(&dev->ctrl);
> +
> +	mutex_lock(&dev->shutdown_lock);
> +	dev->queues_froze = false;
> +	mutex_unlock(&dev->shutdown_lock);

If the running scan work was triggered from user space, the above code
may clear the .queues_froze flag incorrectly.

Thanks,
Ming
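
The lost-update interleaving Ming points at can be sketched with a toy,
strictly sequential userspace program (an illustration added here, not
driver code): scan_work samples queues_froze, a concurrent
nvme_dev_disable() starts a freeze and sets the flag inside that window,
and the tail of scan_work then clears the flag, so the new freeze is
never paired with an unfreeze.

#include <stdio.h>
#include <stdbool.h>

static int freeze_depth;
static bool queues_froze;

int main(void)
{
	/* scan_work (e.g. triggered from user space): flag sampled false */
	bool unfreeze = queues_froze;

	/* concurrent nvme_dev_disable(): freeze started, flag set */
	freeze_depth++;
	queues_froze = true;

	/* tail of scan_work: skips the unfreeze, then clears the flag */
	if (unfreeze)
		freeze_depth--;
	queues_froze = false;

	printf("freeze depth %d, queues_froze %d -> that freeze is lost\n",
	       freeze_depth, queues_froze);
	return 0;
}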
Keith Busch May 21, 2018, 2:22 p.m. UTC | #2
On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > +
> > +	if (unfreeze)
> > +		nvme_wait_freeze(&dev->ctrl);
> > +
> 
> timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> the above nvme_wait_freeze(), then both two may hang forever.

Why would it hang forever? The scan_work doesn't stop a timeout from
triggering a reset to reclaim requests necessary to complete a freeze.
Ming Lei May 21, 2018, 2:58 p.m. UTC | #3
On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > +
> > > +	if (unfreeze)
> > > +		nvme_wait_freeze(&dev->ctrl);
> > > +
> > 
> > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > the above nvme_wait_freeze(), then both two may hang forever.
> 
> Why would it hang forever? The scan_work doesn't stop a timeout from
> triggering a reset to reclaim requests necessary to complete a freeze.

nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
blk_mq_update_nr_hw_queues() may hang forever.

Thanks,
Ming
Keith Busch May 21, 2018, 3:03 p.m. UTC | #4
On Mon, May 21, 2018 at 10:58:51PM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> > On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > > +
> > > > +	if (unfreeze)
> > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > +
> > > 
> > > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > > the above nvme_wait_freeze(), then both two may hang forever.
> > 
> > Why would it hang forever? The scan_work doesn't stop a timeout from
> > triggering a reset to reclaim requests necessary to complete a freeze.
> 
> nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> blk_mq_update_nr_hw_queues() may hang forever.

nvme_dev_disable is just the first part of the timeout sequence. You
have to follow it through to the reset_work that either restarts or
kills the queues.
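
The sequence Keith is describing can be modelled with a toy program (a
simplification added here, not the actual driver; all names below are
stand-ins): the timeout handler first disables the controller, and
reset_work then either brings the queues back (with the waiting now
deferred to scan_work by this patch) or kills them, so requests are
reclaimed either way.

#include <stdio.h>
#include <stdbool.h>

static void dev_disable(void)  { puts("quiesce + freeze queues, reclaim in-flight IO"); }
static bool reinit_ctrl(void)  { return true; }  /* pretend re-init succeeded */
static void start_queues(void) { puts("unquiesce queues, requeued IO is resubmitted"); }
static void kill_queues(void)  { puts("fail all IO, controller is dead"); }
static void scan_work(void)    { puts("wait for freeze, update nr_hw_queues, unfreeze"); }

static void reset_work(void)
{
	if (reinit_ctrl()) {
		start_queues();
		scan_work();    /* scheduled asynchronously in reality */
	} else {
		kill_queues();
	}
}

static void timeout_handler(void)
{
	dev_disable();          /* first part of the timeout sequence */
	reset_work();           /* scheduled asynchronously in reality */
}

int main(void)
{
	timeout_handler();
	return 0;
}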
Ming Lei May 21, 2018, 3:34 p.m. UTC | #5
On Mon, May 21, 2018 at 09:03:31AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018 at 10:58:51PM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 08:22:19AM -0600, Keith Busch wrote:
> > > On Sat, May 19, 2018 at 07:03:58AM +0800, Ming Lei wrote:
> > > > On Fri, May 18, 2018 at 10:38:20AM -0600, Keith Busch wrote:
> > > > > +
> > > > > +	if (unfreeze)
> > > > > +		nvme_wait_freeze(&dev->ctrl);
> > > > > +
> > > > 
> > > > timeout may comes just before&during blk_mq_update_nr_hw_queues() or
> > > > the above nvme_wait_freeze(), then both two may hang forever.
> > > 
> > > Why would it hang forever? The scan_work doesn't stop a timeout from
> > > triggering a reset to reclaim requests necessary to complete a freeze.
> > 
> > nvme_dev_disable() will quiesce queues, then nvme_wait_freeze() or
> > blk_mq_update_nr_hw_queues() may hang forever.
> 
> nvme_dev_disable is just the first part of the timeout sequence. You
> have to follow it through to the reset_work that either restarts or
> kills the queues.

nvme_dev_disable() quiesces the queues before killing them.

If the queues are quiesced before or while nvme_wait_freeze() runs in
the 2nd part of the reset, that 2nd part can't move on and an IO hang
results. In the end no reset can be scheduled at all.

Thanks,
Ming
Keith Busch May 21, 2018, 3:44 p.m. UTC | #6
On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> nvme_dev_disable() quiesces queues first before killing queues.
> 
> If queues are quiesced during or before nvme_wait_freeze() is run
> from the 2nd part of reset, the 2nd part can't move on, and IO hang
> is caused. Finally no reset can be scheduled at all.

But this patch moves nvme_wait_freeze outside the reset path, so I'm
afraid I'm unable to follow how you've concluded the wait freeze is
somehow part of the reset.
Ming Lei May 21, 2018, 4:04 p.m. UTC | #7
On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > nvme_dev_disable() quiesces queues first before killing queues.
> > 
> > If queues are quiesced during or before nvme_wait_freeze() is run
> > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > is caused. Finally no reset can be scheduled at all.
> 
> But this patch moves nvme_wait_freeze outside the reset path, so I'm
> afraid I'm unable to follow how you've concluded the wait freeze is
> somehow part of the reset.

For example:

1) the 1st timeout event:

- nvme_dev_disable()
- reset
- scan_work

2) the 2nd timeout event:

nvme_dev_disable() may come just after nvme_start_queues() in the
reset from the 1st timeout above, and nvme_timeout() won't schedule a
new reset since the controller state is NVME_CTRL_CONNECTING.

Then the scan_work from the 1st timeout may still hang forever.

Thanks,
Ming
Keith Busch May 21, 2018, 4:23 p.m. UTC | #8
On Tue, May 22, 2018 at 12:04:53AM +0800, Ming Lei wrote:
> On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> > On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > > nvme_dev_disable() quiesces queues first before killing queues.
> > > 
> > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > is caused. Finally no reset can be scheduled at all.
> > 
> > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > afraid I'm unable to follow how you've concluded the wait freeze is
> > somehow part of the reset.
> 
> For example:
> 
> 1) the 1st timeout event:
> 
> - nvme_dev_disable()
> - reset
> - scan_work
> 
> 2) the 2nd timeout event:
> 
> nvme_dev_disable() may come just after nvme_start_queues() in
> the above reset of the 1st timeout. And nvme_timeout() won't
> schedule a new reset since the controller state is NVME_CTRL_CONNECTING.

Let me get this straight -- you're saying nvme_start_queues is going
to somehow immediately trigger timeout work? I can't see how that could
possibly happen in real life, but we can just remove it and use the existing
nvme_start_ctrl to handle that in the LIVE state.
Ming Lei May 22, 2018, 1:46 a.m. UTC | #9
On Mon, May 21, 2018 at 10:23:55AM -0600, Keith Busch wrote:
> On Tue, May 22, 2018 at 12:04:53AM +0800, Ming Lei wrote:
> > On Mon, May 21, 2018 at 09:44:33AM -0600, Keith Busch wrote:
> > > On Mon, May 21, 2018 at 11:34:27PM +0800, Ming Lei wrote:
> > > > nvme_dev_disable() quiesces queues first before killing queues.
> > > > 
> > > > If queues are quiesced during or before nvme_wait_freeze() is run
> > > > from the 2nd part of reset, the 2nd part can't move on, and IO hang
> > > > is caused. Finally no reset can be scheduled at all.
> > > 
> > > But this patch moves nvme_wait_freeze outside the reset path, so I'm
> > > afraid I'm unable to follow how you've concluded the wait freeze is
> > > somehow part of the reset.
> > 
> > For example:
> > 
> > 1) the 1st timeout event:
> > 
> > - nvme_dev_disable()
> > - reset
> > - scan_work
> > 
> > 2) the 2nd timeout event:
> > 
> > nvme_dev_disable() may come just after nvme_start_queues() in
> > the above reset of the 1st timeout. And nvme_timeout() won't
> > schedule a new reset since the controller state is NVME_CTRL_CONNECTING.
> 
> Let me get this straight -- you're saying nvme_start_queues is going
> to somehow immediately trigger timeout work? I can't see how that could

It may be difficult to trigger in reality, but it isn't impossible,
since the timeout value can be adjusted via a module parameter and
scheduling delays can occur on a busy system.

It isn't good practice to rely on timing to avoid a race, IMO.

> possibly happen in real life, but we can just remove it and use the existing
> nvme_start_ctrl to handle that in the LIVE state.

OK, please also fix the other issues mentioned in the following comment;
then I will review further and see if it can work well.

https://marc.info/?l=linux-block&m=152668465710591&w=2

Thanks,
Ming
Keith Busch May 22, 2018, 2:03 p.m. UTC | #10
On Tue, May 22, 2018 at 09:46:32AM +0800, Ming Lei wrote:
> It isn't a good practice to rely on timing for avoiding race, IMO.

Sure thing, and it's easy enough to avoid this one.
 
> OK, please fix other issues mentioned in the following comment together,
> then I will review further and see if it can work well.
> 
> https://marc.info/?l=linux-block&m=152668465710591&w=2

Will do, v2 coming up by end of today.

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1de68b56b318..34d7731f1419 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -214,6 +214,7 @@  static inline bool nvme_req_needs_retry(struct request *req)
 	if (blk_noretry_request(req))
 		return false;
 	if (nvme_req(req)->status & NVME_SC_DNR)
+
 		return false;
 	if (nvme_req(req)->retries >= nvme_max_retries)
 		return false;
@@ -3177,6 +3178,8 @@  static void nvme_scan_work(struct work_struct *work)
 	struct nvme_id_ctrl *id;
 	unsigned nn;
 
+	if (ctrl->ops->update_hw_ctx)
+		ctrl->ops->update_hw_ctx(ctrl);
 	if (ctrl->state != NVME_CTRL_LIVE)
 		return;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c15c2ee7f61a..230c5424b197 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -320,6 +320,7 @@  struct nvme_ctrl_ops {
 	int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
 	int (*reinit_request)(void *data, struct request *rq);
 	void (*stop_ctrl)(struct nvme_ctrl *ctrl);
+	void (*update_hw_ctx)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2bd9d84f58d0..6a7cbc631d92 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -99,6 +99,7 @@  struct nvme_dev {
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
 	struct completion ioq_wait;
+	bool queues_froze;
 
 	/* shadow doorbell buffer support: */
 	u32 *dbbuf_dbs;
@@ -2065,10 +2066,33 @@  static void nvme_disable_io_queues(struct nvme_dev *dev)
 	}
 }
 
+static void nvme_pci_update_hw_ctx(struct nvme_ctrl *ctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(ctrl);
+	bool unfreeze;
+
+	mutex_lock(&dev->shutdown_lock);
+	unfreeze = dev->queues_froze;
+	mutex_unlock(&dev->shutdown_lock);
+
+	if (unfreeze)
+		nvme_wait_freeze(&dev->ctrl);
+
+	blk_mq_update_nr_hw_queues(ctrl->tagset, dev->online_queues - 1);
+	nvme_free_queues(dev, dev->online_queues);
+
+	if (unfreeze)
+		nvme_unfreeze(&dev->ctrl);
+
+	mutex_lock(&dev->shutdown_lock);
+	dev->queues_froze = false;
+	mutex_unlock(&dev->shutdown_lock);
+}
+
 /*
  * return error value only when tagset allocation failed
  */
-static int nvme_dev_add(struct nvme_dev *dev)
+static int nvme_alloc_io_tags(struct nvme_dev *dev)
 {
 	int ret;
 
@@ -2097,10 +2121,7 @@  static int nvme_dev_add(struct nvme_dev *dev)
 
 		nvme_dbbuf_set(dev);
 	} else {
-		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
-
-		/* Free previously allocated queues that are no longer usable */
-		nvme_free_queues(dev, dev->online_queues);
+		nvme_start_queues(&dev->ctrl);
 	}
 
 	return 0;
@@ -2201,7 +2222,10 @@  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	     dev->ctrl.state == NVME_CTRL_RESETTING)) {
 		u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
-		nvme_start_freeze(&dev->ctrl);
+		if (!dev->queues_froze)	{
+			nvme_start_freeze(&dev->ctrl);
+			dev->queues_froze = true;
+		}
 		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
 			pci_channel_offline(pdev) || !pci_is_enabled(pdev));
 	}
@@ -2375,13 +2399,8 @@  static void nvme_reset_work(struct work_struct *work)
 		nvme_kill_queues(&dev->ctrl);
 		nvme_remove_namespaces(&dev->ctrl);
 		new_state = NVME_CTRL_ADMIN_ONLY;
-	} else {
-		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
-		/* hit this only when allocate tagset fails */
-		if (nvme_dev_add(dev))
-			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
+	} else if (nvme_alloc_io_tags(dev)) {
+		new_state = NVME_CTRL_ADMIN_ONLY;
 	}
 
 	/*
@@ -2446,6 +2465,7 @@  static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.reg_read64		= nvme_pci_reg_read64,
 	.free_ctrl		= nvme_pci_free_ctrl,
 	.submit_async_event	= nvme_pci_submit_async_event,
+	.update_hw_ctx		= nvme_pci_update_hw_ctx,
 	.get_address		= nvme_pci_get_address,
 };