| Message ID | 20230530094322.258090-2-ming.lei@redhat.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | nvme: add nvme_delete_dead_ctrl for avoiding io deadlock |
> When driver confirms that the controller is dead, this controller should
> be deleted with marking as DEAD. Otherwise, upper layer may wait forever
> in __bio_queue_enter() since the disk won't be marked as DEAD.
> Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
> sync & invalidate returns. If any writeback IO waits in
> __bio_queue_enter(), IO deadlock is caused.
>
> Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
>  drivers/nvme/host/nvme.h |  1 +
>  2 files changed, 24 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index ccb6eb1282f8..413213cfa417 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
>  	nvme_do_delete_ctrl(ctrl);
>  }
>
> -int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
> +static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
> +			      enum nvme_ctrl_state state)
>  {
> +	if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
> +		return -EINVAL;
> +
>  	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
>  		return -EBUSY;
> +	if (state == NVME_CTRL_DEAD) {
> +		if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
> +			return -EBUSY;
> +	}
>  	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
>  		return -EBUSY;
>  	return 0;
>  }

the user can trigger a delete in exactly the same condition as
the transport (say a nanosecond before the transport exhausts
the max_reconnects).

I think that we'd want something like
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 841f155fe0b3..6c718ad46e06 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
 {
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 		return -EBUSY;
+
+	if (ctrl->ops->transport_disconnected &&
+	    ctrl->ops->transport_disconnected(ctrl))
+		nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
+
 	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 		return -EBUSY;
 	return 0;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 054bf2f8b1a1..657d3f79953d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2828,6 +2828,13 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
 	return dma_pci_p2pdma_supported(dev->dev);
 }
 
+static bool nvme_pci_disconnected(struct nvme_ctrl *nctrl)
+{
+	struct nvme_dev *dev = to_nvme_dev(nctrl);
+
+	return !pci_device_is_present(to_pci_dev(dev->dev));
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.name			= "pcie",
 	.module			= THIS_MODULE,
@@ -2841,6 +2848,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
 	.get_address		= nvme_pci_get_address,
 	.print_device_info	= nvme_pci_print_device_info,
 	.supports_pci_p2pdma	= nvme_pci_supports_pci_p2pdma,
+	.transport_disconnected	= nvme_pci_disconnected,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..2a03df693b0e 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -2235,6 +2235,18 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 	nvme_rdma_reconnect_or_remove(ctrl);
 }
 
+static bool nvme_rdma_disconnected(struct nvme_ctrl *nctrl)
+{
+	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
+	int i;
+
+	for (i = 0; i < ctrl->ctrl.queue_count; i++) {
+		if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags))
+			return false;
+	}
+	return true;
+}
+
 static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.name			= "rdma",
 	.module			= THIS_MODULE,
@@ -2247,6 +2259,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
 	.delete_ctrl		= nvme_rdma_delete_ctrl,
 	.get_address		= nvmf_get_address,
 	.stop_ctrl		= nvme_rdma_stop_ctrl,
+	.transport_disconnected	= nvme_rdma_disconnected,
 };
 
 /*
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index fe01ba75570d..043ce9878560 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2536,6 +2536,18 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
 	return len;
 }
 
+static bool nvme_tcp_disconnected(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	int i;
+
+	for (i = 0; i < ctrl->ctrl.queue_count; i++) {
+		if (test_bit(NVME_TCP_Q_LIVE, &ctrl->queues[i].flags))
+			return false;
+	}
+	return true;
+}
+
 static const struct blk_mq_ops nvme_tcp_mq_ops = {
 	.queue_rq	= nvme_tcp_queue_rq,
 	.commit_rqs	= nvme_tcp_commit_rqs,
@@ -2569,6 +2581,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
 	.delete_ctrl		= nvme_tcp_delete_ctrl,
 	.get_address		= nvme_tcp_get_address,
 	.stop_ctrl		= nvme_tcp_stop_ctrl,
+	.transport_disconnected	= nvme_tcp_disconnected,
 };
 
 static bool
--
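The diff above wires a new .transport_disconnected callback into each transport's nvme_ctrl_ops, but the corresponding nvme.h hunk declaring the op is not shown. A minimal sketch of what that declaration might look like (placement next to supports_pci_p2pdma is assumed, not part of the posted proposal):

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ struct nvme_ctrl_ops {
 	void (*print_device_info)(struct nvme_ctrl *ctrl);
 	bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
+	/* optional: return true if the transport has lost all paths to the controller */
+	bool (*transport_disconnected)(struct nvme_ctrl *ctrl);
 };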
On Tue, Jun 06, 2023 at 12:48:32AM +0300, Sagi Grimberg wrote:
>
> > When driver confirms that the controller is dead, this controller should
> > be deleted with marking as DEAD. Otherwise, upper layer may wait forever
> > in __bio_queue_enter() since the disk won't be marked as DEAD.
> > Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
> > sync & invalidate returns. If any writeback IO waits in
> > __bio_queue_enter(), IO deadlock is caused.
> >
> > Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
> >  drivers/nvme/host/nvme.h |  1 +
> >  2 files changed, 24 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index ccb6eb1282f8..413213cfa417 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
> >  	nvme_do_delete_ctrl(ctrl);
> >  }
> >
> > -int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
> > +static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
> > +			      enum nvme_ctrl_state state)
> >  {
> > +	if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
> > +		return -EINVAL;
> > +
> >  	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
> >  		return -EBUSY;
> > +	if (state == NVME_CTRL_DEAD) {
> > +		if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
> > +			return -EBUSY;
> > +	}
> >  	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
> >  		return -EBUSY;
> >  	return 0;
> >  }
>
> the user can trigger a delete in exactly the same condition as
> the transport (say a nanosecond before the transport exhausts
> the max_reconnects).

Yeah, the problem can be triggered when removing any NS whose request
queue is frozen.

It is too fragile to call freeze_queue and unfreeze_queue in different
contexts, since the two should always be done in a strict pair, and
meantime I don't think freezing queues in nvme_rdma_teardown_io_queues()
is needed, because quiescing is supposed to be enough for the driver to
recover the controller.

And I guess the following approach (rdma only) should address the issue cleanly:

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 0eb79696fb73..59352671aeb7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -919,6 +919,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 
 	if (!new) {
 		nvme_unquiesce_io_queues(&ctrl->ctrl);
+		nvme_start_freeze(&ctrl->ctrl);
 		if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
 			/*
 			 * If we timed out waiting for freeze we are likely to
@@ -926,6 +927,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 			 * to be safe.
 			 */
 			ret = -ENODEV;
+			nvme_unfreeze(&ctrl->ctrl);
 			goto out_wait_freeze_timed_out;
 		}
 		blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
@@ -975,7 +977,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 		bool remove)
 {
 	if (ctrl->ctrl.queue_count > 1) {
-		nvme_start_freeze(&ctrl->ctrl);
 		nvme_quiesce_io_queues(&ctrl->ctrl);
 		nvme_sync_io_queues(&ctrl->ctrl);
 		nvme_rdma_stop_io_queues(ctrl);

>
> I think that we'd want something like
> --
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 841f155fe0b3..6c718ad46e06 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -231,6 +231,11 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
>  {
>  	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
>  		return -EBUSY;
> +
> +	if (ctrl->ops->transport_disconnected &&
> +	    ctrl->ops->transport_disconnected(ctrl))
> +		nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD);
> +
>  	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
>  		return -EBUSY;
>  	return 0;
[...]
> +	.transport_disconnected	= nvme_tcp_disconnected,

This way is too violent. The current queue/device status does not mean
the controller/queue is really dead, because recovery is still in
progress, and we should only call blk_mark_disk_dead() once the
controller is confirmed as DEAD.

Thanks,
Ming
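The pairing rule Ming appeals to here is that a freeze must be started and ended in the same recovery context; otherwise a reconnect that never completes leaves the queues frozen and new submitters block in __bio_queue_enter(). A minimal sketch of the pattern his diff produces (the helper names are the real nvme core ones, but the function itself is illustrative only and not taken from the patch):

static int example_recover_io_queues(struct nvme_ctrl *ctrl)
{
	nvme_unquiesce_io_queues(ctrl);	/* let requeued requests flow again */
	nvme_start_freeze(ctrl);	/* block new submitters at the bio level */

	if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
		/* keep freeze strictly paired even on the error path */
		nvme_unfreeze(ctrl);
		return -ENODEV;
	}

	blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1);
	nvme_unfreeze(ctrl);		/* pairs with nvme_start_freeze() above */
	return 0;
}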
On 6/6/23 03:51, Ming Lei wrote:
> On Tue, Jun 06, 2023 at 12:48:32AM +0300, Sagi Grimberg wrote:
>>
>>> When driver confirms that the controller is dead, this controller should
>>> be deleted with marking as DEAD. Otherwise, upper layer may wait forever
>>> in __bio_queue_enter() since the disk won't be marked as DEAD.
[...]
>>
>> the user can trigger a delete in exactly the same condition as
>> the transport (say a nanosecond before the transport exhausts
>> the max_reconnects).
>
> Yeah, the problem can be triggered when removing any NS whose request
> queue is frozen.
>
> It is too fragile to call freeze_queue and unfreeze_queue in different
> contexts, since the two should always be done in a strict pair, and
> meantime I don't think freezing queues in nvme_rdma_teardown_io_queues()
> is needed, because quiescing is supposed to be enough for the driver to
> recover the controller.
>
> And I guess the following approach (rdma only) should address the issue cleanly:
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 0eb79696fb73..59352671aeb7 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -919,6 +919,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
>
>  	if (!new) {
>  		nvme_unquiesce_io_queues(&ctrl->ctrl);
> +		nvme_start_freeze(&ctrl->ctrl);
>  		if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
[...]
> @@ -975,7 +977,6 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
>  		bool remove)
>  {
>  	if (ctrl->ctrl.queue_count > 1) {
> -		nvme_start_freeze(&ctrl->ctrl);
>  		nvme_quiesce_io_queues(&ctrl->ctrl);
>  		nvme_sync_io_queues(&ctrl->ctrl);
>  		nvme_rdma_stop_io_queues(ctrl);

I agree this should work.

>
>>
>> I think that we'd want something like
>> --
[...]
>> +	.transport_disconnected	= nvme_tcp_disconnected,
>
> This way is too violent. The current queue/device status does not mean
> the controller/queue is really dead, because recovery is still in
> progress, and we should only call blk_mark_disk_dead() once the
> controller is confirmed as DEAD.

Well, the controller is going away, and the queues are not LIVE, so
I think the logic makes sense. However I do agree that your other
suggestion is cleaner.
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ccb6eb1282f8..413213cfa417 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -227,16 +227,38 @@ static void nvme_delete_ctrl_work(struct work_struct *work)
 	nvme_do_delete_ctrl(ctrl);
 }
 
-int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+static int __nvme_delete_ctrl(struct nvme_ctrl *ctrl,
+			      enum nvme_ctrl_state state)
 {
+	if (state != NVME_CTRL_DELETING && state != NVME_CTRL_DEAD)
+		return -EINVAL;
+
 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
 		return -EBUSY;
+	if (state == NVME_CTRL_DEAD) {
+		if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DEAD))
+			return -EBUSY;
+	}
 	if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
 		return -EBUSY;
 	return 0;
 }
+
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	return __nvme_delete_ctrl(ctrl, NVME_CTRL_DELETING);
+}
 EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
 
+/*
+ * Called when driver confirmed that the controller is really dead
+ */
+int nvme_delete_dead_ctrl(struct nvme_ctrl *ctrl)
+{
+	return __nvme_delete_ctrl(ctrl, NVME_CTRL_DEAD);
+}
+EXPORT_SYMBOL_GPL(nvme_delete_dead_ctrl);
+
 static void nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
 {
 	/*
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bf46f122e9e1..8f62246a85be 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -828,6 +828,7 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
+int nvme_delete_dead_ctrl(struct nvme_ctrl *ctrl);
 void nvme_queue_scan(struct nvme_ctrl *ctrl);
 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
 		 void *log, size_t size, u64 offset);
When driver confirms that the controller is dead, this controller should
be deleted with marking as DEAD. Otherwise, upper layer may wait forever
in __bio_queue_enter() since the disk won't be marked as DEAD.
Especially, in del_gendisk(), disk won't be marked as DEAD unless bdev
sync & invalidate returns. If any writeback IO waits in
__bio_queue_enter(), IO deadlock is caused.

Add nvme_delete_dead_ctrl() for avoiding such kind of io deadlock.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c | 24 +++++++++++++++++++++++-
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 24 insertions(+), 1 deletion(-)
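As a usage illustration of the new helper: a fabrics transport that has exhausted its reconnect attempts has effectively confirmed the controller is dead, so it could switch from nvme_delete_ctrl() to nvme_delete_dead_ctrl() at that point. The sketch below is a hypothetical call site, not part of this patch; only nvmf_should_reconnect() and the new helper are real interfaces, the function name and surrounding logic are assumed.

/*
 * Illustrative only: simplified stand-in for a transport's
 * reconnect_or_remove path once nvme_delete_dead_ctrl() exists.
 */
static void example_reconnect_or_remove(struct nvme_ctrl *ctrl)
{
	if (nvmf_should_reconnect(ctrl)) {
		/* schedule another reconnect attempt, as the transports do today */
		return;
	}

	/*
	 * Reconnect attempts are exhausted, so the controller is confirmed
	 * dead: delete it via the DEAD state so that waiters in
	 * __bio_queue_enter() are failed instead of sleeping forever.
	 */
	nvme_delete_dead_ctrl(ctrl);
}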