diff mbox series

[v2] scsi: libsas: abort all inflight requests when device is gone

Message ID 20230329124357.3746533-1-yanaijie@huawei.com (mailing list archive)
State Superseded
Headers show
Series [v2] scsi: libsas: abort all inflight requests when device is gone | expand

Commit Message

Jason Yan March 29, 2023, 12:43 p.m. UTC
When a disk is removed with inflight IO, the userspace application need
to wait for 30 senconds(depends on the timeout configuration) to getback
from the kernel. Xingui tried to fix this issue by aborting the ata link
for SATA devices[1]. However this approach left the SAS devices unresolved.

This patch try to fix this issue by aborting all inflight requests while
the device is gone. This is implemented by itering the tagset.

[1] https://lore.kernel.org/lkml/234e04db-7539-07e4-a6b8-c6b05f78193d@opensource.wdc.com/T/

Cc: Xingui Yang <yangxingui@huawei.com>
Cc: John Garry <john.g.garry@oracle.com>
Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jason Yan <yanaijie@huawei.com>
---

 v1->v2:
   1. Rename sas_abort_domain_cmds() to sas_abort_device_scsi_cmds().
   2. Don't do the aborting for expanders and for devices not completely initialinzed.
   3. Add a comment to explain why we need to abort these commands.

 drivers/scsi/libsas/sas_discover.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

Comments

John Garry March 30, 2023, 7:28 a.m. UTC | #1
On 29/03/2023 13:43, Jason Yan wrote:
> When a disk is removed with inflight IO, the userspace application need
> to wait for 30 senconds(depends on the timeout configuration) to getback

I think that you mean "get back", but this is not really clear in 
meaning. As below, everything the kernel does affects userspace in a 
way, so I don't think that we need to explicitly mention userspace.

> from the kernel. Xingui tried to fix this issue by aborting the ata link

/s/ata/ATA/

> for SATA devices[1]. However this approach left the SAS devices unresolved.
> 
> This patch try to fix this issue by aborting all inflight requests while
> the device is gone. This is implemented by itering the tagset.
> 
> [1] https://urldefense.com/v3/__https://lore.kernel.org/lkml/234e04db-7539-07e4-a6b8-c6b05f78193d@opensource.wdc.com/T/__;!!ACWV5N9M2RV99hQ!NCSE1y-PvbYiLiO35roetxUhmyG3r2_H86b8XasnaSrb0WuZvfhRDTHnwmn6tKYyV774bzF31KYhlZbzsWry$
> 
> Cc: Xingui Yang <yangxingui@huawei.com>
> Cc: John Garry <john.g.garry@oracle.com>
> Cc: Damien Le Moal <damien.lemoal@opensource.wdc.com>
> Cc: Hannes Reinecke <hare@suse.com>
> Signed-off-by: Jason Yan <yanaijie@huawei.com>

Apart from comments on documentation,

Reviewed-by: John Garry <john.g.garry@oracle.com>

> ---
> 
>   v1->v2:
>     1. Rename sas_abort_domain_cmds() to sas_abort_device_scsi_cmds().
>     2. Don't do the aborting for expanders and for devices not completely initialinzed.
>     3. Add a comment to explain why we need to abort these commands.
> 
>   drivers/scsi/libsas/sas_discover.c | 29 +++++++++++++++++++++++++++++
>   1 file changed, 29 insertions(+)
> 
> diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
> index 72fdb2e5d047..8c6afe724944 100644
> --- a/drivers/scsi/libsas/sas_discover.c
> +++ b/drivers/scsi/libsas/sas_discover.c
> @@ -360,6 +360,33 @@ static void sas_destruct_ports(struct asd_sas_port *port)
>   	}
>   }
>   
> +static bool sas_abort_cmd(struct request *req, void *data)
> +{
> +	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
> +	struct domain_device *dev = data;
> +
> +	if (dev == cmd_to_domain_dev(cmd))
> +		blk_abort_request(req);
> +	return true;
> +}
> +
> +static void sas_abort_device_scsi_cmds(struct domain_device *dev)
> +{
> +	struct sas_ha_struct *sas_ha = dev->port->ha;
> +	struct Scsi_Host *shost = sas_ha->core.shost;
> +
> +	if (dev_is_expander(dev->dev_type))
> +		return;
> +
> +	/*
> +	 * For removed device with active IOs, the user space applications have
> +	 * to spend very long time waiting for the timeout.

I don't think that you need to mention userspace at all. Just mention 
that we want to accelerate the removal process by starting EH early.

* This is not
> +	 * necessary because a removed device will not return the IOs.
> +	 * Abort the inflight IOs here so that EH can be quickly kicked in.
> +	 */
> +	blk_mq_tagset_busy_iter(&shost->tag_set, sas_abort_cmd, dev);
> +}
> +
>   void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
>   {
>   	if (!test_bit(SAS_DEV_DESTROY, &dev->state) &&
> @@ -372,6 +399,8 @@ void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
>   	}
>   
>   	if (!test_and_set_bit(SAS_DEV_DESTROY, &dev->state)) {
> +		if (test_bit(SAS_DEV_GONE, &dev->state))
> +			sas_abort_device_scsi_cmds(dev);
>   		sas_rphy_unlink(dev->rphy);
>   		list_move_tail(&dev->disco_list_node, &port->destroy_list);
>   	}
diff mbox series

Patch

diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index 72fdb2e5d047..8c6afe724944 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -360,6 +360,33 @@  static void sas_destruct_ports(struct asd_sas_port *port)
 	}
 }
 
+static bool sas_abort_cmd(struct request *req, void *data)
+{
+	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
+	struct domain_device *dev = data;
+
+	if (dev == cmd_to_domain_dev(cmd))
+		blk_abort_request(req);
+	return true;
+}
+
+static void sas_abort_device_scsi_cmds(struct domain_device *dev)
+{
+	struct sas_ha_struct *sas_ha = dev->port->ha;
+	struct Scsi_Host *shost = sas_ha->core.shost;
+
+	if (dev_is_expander(dev->dev_type))
+		return;
+
+	/*
+	 * For removed device with active IOs, the user space applications have
+	 * to spend very long time waiting for the timeout. This is not
+	 * necessary because a removed device will not return the IOs.
+	 * Abort the inflight IOs here so that EH can be quickly kicked in.
+	 */
+	blk_mq_tagset_busy_iter(&shost->tag_set, sas_abort_cmd, dev);
+}
+
 void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
 {
 	if (!test_bit(SAS_DEV_DESTROY, &dev->state) &&
@@ -372,6 +399,8 @@  void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
 	}
 
 	if (!test_and_set_bit(SAS_DEV_DESTROY, &dev->state)) {
+		if (test_bit(SAS_DEV_GONE, &dev->state))
+			sas_abort_device_scsi_cmds(dev);
 		sas_rphy_unlink(dev->rphy);
 		list_move_tail(&dev->disco_list_node, &port->destroy_list);
 	}