
[v16,05/26] blk-zoned: Fix a deadlock triggered by unaligned writes

Message ID 20241119002815.600608-6-bvanassche@acm.org (mailing list archive)
State Not Applicable
Series Improve write performance for zoned UFS devices

Commit Message

Bart Van Assche Nov. 19, 2024, 12:27 a.m. UTC
If the queue is filled with unaligned writes then the following
deadlock occurs:

Call Trace:
 <TASK>
 __schedule+0x8cc/0x2190
 schedule+0xdd/0x2b0
 blk_queue_enter+0x2ce/0x4f0
 blk_mq_alloc_request+0x303/0x810
 scsi_execute_cmd+0x3f4/0x7b0
 sd_zbc_do_report_zones+0x19e/0x4c0
 sd_zbc_report_zones+0x304/0x920
 disk_zone_wplug_handle_error+0x237/0x920
 disk_zone_wplugs_work+0x17e/0x430
 process_one_work+0xdd0/0x1490
 worker_thread+0x5eb/0x1010
 kthread+0x2e5/0x3b0
 ret_from_fork+0x3a/0x80
 </TASK>

Fix this deadlock by removing the disk->fops->report_zones() call and by
deriving the write pointer information from successfully completed zoned
writes.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-zoned.c | 69 +++++++++++++++++++----------------------------
 block/blk.h       |  4 ++-
 2 files changed, 31 insertions(+), 42 deletions(-)
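
In outline, the fix stops consulting the device in the error handler and
instead tracks the highest end offset among successfully completed zoned
writes. A condensed sketch of that logic (field names follow the patch; this
is not the complete kernel code):

	/* Per-zone write plug state, reduced to the two relevant fields. */
	struct zwplug_sketch {
		unsigned int wp_offset;		/* assumed wp, in 512B sectors */
		unsigned int wp_offset_compl;	/* highest completed write end */
	};

	/* Completion path: called for each successful zoned write BIO. */
	static void zwplug_note_completion(struct zwplug_sketch *zwplug,
					   unsigned int end_sector)
	{
		if (end_sector > zwplug->wp_offset_compl)
			zwplug->wp_offset_compl = end_sector;
	}

	/* Error handler: recover the wp without issuing a report zones. */
	static void zwplug_recover_wp(struct zwplug_sketch *zwplug)
	{
		zwplug->wp_offset = zwplug->wp_offset_compl;
	}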

Comments

Damien Le Moal Nov. 19, 2024, 2:57 a.m. UTC | #1
On 11/19/24 9:27 AM, Bart Van Assche wrote:
> If the queue is filled with unaligned writes then the following
> deadlock occurs:
> 
> Call Trace:
>  <TASK>
>  __schedule+0x8cc/0x2190
>  schedule+0xdd/0x2b0
>  blk_queue_enter+0x2ce/0x4f0
>  blk_mq_alloc_request+0x303/0x810
>  scsi_execute_cmd+0x3f4/0x7b0
>  sd_zbc_do_report_zones+0x19e/0x4c0
>  sd_zbc_report_zones+0x304/0x920
>  disk_zone_wplug_handle_error+0x237/0x920
>  disk_zone_wplugs_work+0x17e/0x430
>  process_one_work+0xdd0/0x1490
>  worker_thread+0x5eb/0x1010
>  kthread+0x2e5/0x3b0
>  ret_from_fork+0x3a/0x80
>  </TASK>
> 
> Fix this deadlock by removing the disk->fops->report_zones() call and by
> deriving the write pointer information from successfully completed zoned
> writes.
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>

Doesn't this need a Fixes tag and CC stable, and come earlier in the series (or
be sent separately)?

Overall, this patch seems wrong anyway, as a zone reset or zone finish may be
done between two writes, failing the next one, but the recovery done here will
use the end position of the previous successful write as the wp, which is NOT
correct as the reset or finish changed that... And we also have the possibility
of torn writes (partial writes) with SAS SMR drives. So I really think that you
cannot avoid doing a report zones to recover from errors.
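
Concretely, the stale write pointer case raised above looks like this (an
illustrative sequence, not taken from a log; field names follow the patch):

	1) WRITE sectors 0..7 completes OK  -> wp_offset_compl = 8
	2) ZONE RESET completes             -> device wp = 0, wp_offset_compl still 8
	3) WRITE at sector 8 fails          (unaligned: the device expects sector 0)
	4) error recovery sets wp_offset = wp_offset_compl = 8, while the real
	   write pointer is 0, so plugged writes are resumed at the wrong offset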

> [...]
Bart Van Assche Nov. 19, 2024, 9:04 p.m. UTC | #2
On 11/18/24 6:57 PM, Damien Le Moal wrote:
> On 11/19/24 9:27 AM, Bart Van Assche wrote:
>> [...]
> 
> Doesn't this need a Fixes tag and CC stable, and come earlier in the series (or
> be sent separately)?

I will add Fixes: and Cc: stable tags.

I'm not sure how to move this patch earlier since it depends on the
previous patch in this series ("blk-zoned: Only handle errors after
pending zoned writes have completed"). Without that patch, it is not
safe to use zwplug->wp_offset_compl in the error handler.

> Overall, this patch seems wrong anyway, as a zone reset or zone finish may be
> done between two writes, failing the next one, but the recovery done here will
> use the end position of the previous successful write as the wp, which is NOT
> correct as the reset or finish changed that...

I will add support for the zone reset and zone finish commands in this
patch.

> And we also have the possibility of torn writes
> (partial writes) with SAS SMR drives. So I really think that you cannot avoid
> doing a report zones to recover from errors.

Thanks for having brought this up. This is something I was not aware of.

disk_zone_wplug_handle_error() submits a new request to retrieve zone
information while handling an error triggered by other requests. This
can easily lead to a deadlock, as the above call trace shows. How about
introducing a queue flag for the "report zones" approach in
disk_zone_wplug_handle_error() such that this approach is
only used for SAS SMR drives?
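
A minimal sketch of that suggestion (both helpers below are hypothetical and
do not exist in the kernel; they stand in for a flag set by drivers whose
devices may tear writes):

	static void disk_zone_wplug_recover_wp(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
	{
		if (disk_needs_report_zones_recovery(disk)) {
			/* e.g. SAS SMR: torn writes possible, ask the device */
			disk_zone_wplug_report_zones_recovery(disk, zwplug);
		} else {
			/* writes complete either fully or not at all */
			zwplug->wp_offset = zwplug->wp_offset_compl;
		}
	}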

Thanks,

Bart.
Damien Le Moal Nov. 21, 2024, 3:32 a.m. UTC | #3
On 11/20/24 06:04, Bart Van Assche wrote:
> On 11/18/24 6:57 PM, Damien Le Moal wrote:
> [...]
> 
>> Overall, this patch seems wrong anyway, as a zone reset or zone finish may be
>> done between two writes, failing the next one, but the recovery done here will
>> use the end position of the previous successful write as the wp, which is NOT
>> correct as the reset or finish changed that...
> 
> I will add support for the zone reset and zone finish commands in this
> patch.
> 
>> And we also have the possibility of torn writes
>> (partial writes) with SAS SMR drives. So I really think that you cannot avoid
>> doing a report zones to recover from errors.
> 
> Thanks for having brought this up. This is something I was not aware of.
> 
> disk_zone_wplug_handle_error() submits a new request to retrieve zone
> information while handling an error triggered by other requests. This
> can easily lead to a deadlock, as the above call trace shows. How about
> introducing a queue flag for the "report zones" approach in
> disk_zone_wplug_handle_error() such that this approach is
> only used for SAS SMR drives?

Sure, but how would that solve the potential deadlock problem? Also, I am not
entirely clear on how the deadlock can happen, given that zone write plugs are
queueing/blocking BIOs, not requests. So even assuming you have a large number
of BIOs plugged in a zone write plug, the error handler work should still be
able to issue a request to do a report zones, no? On which resource can the
deadlock happen? Plugged BIOs do not yet use a tag, right?

What am I missing here? Or is it maybe something that can happen with your
modifications, because you changed the zone write plug behavior to allow more
than one BIO at a time to be unplugged and issued to the device?

Note that if you do have a test case for this triggering the deadlock, we
definitely need to solve this and ideally have a blktest case checking it.
Bart Van Assche Nov. 21, 2024, 5:51 p.m. UTC | #4
On 11/20/24 7:32 PM, Damien Le Moal wrote:
> [...]
> 
> Sure, but how would that solve the potential deadlock problem? Also, I am not
> entirely clear on how the deadlock can happen, given that zone write plugs are
> queueing/blocking BIOs, not requests. So even assuming you have a large number
> of BIOs plugged in a zone write plug, the error handler work should still be
> able to issue a request to do a report zones, no? On which resource can the
> deadlock happen? Plugged BIOs do not yet use a tag, right?
> 
> What am I missing here? Or is it maybe something that can happen with your
> modifications, because you changed the zone write plug behavior to allow more
> than one BIO at a time to be unplugged and issued to the device?
> 
> Note that if you do have a test case for this triggering the deadlock, we
> definitely need to solve this and ideally have a blktest case checking it.

Hi Damien,

The call trace mentioned above comes from the kernel log and was
encountered while I was testing this patch series. A reproducer has
already been shared; see also
https://lore.kernel.org/linux-block/e840b66a-79f0-4169-9ab1-c475d9608e4d@acm.org/. 
The lockup happened after the queue was filled up
with requests and hence sd_zbc_report_zones() failed to allocate an
additional request for the zone report.

I'm wondering whether this lockup can also happen with the current
upstream kernel by submitting multiple unaligned writes simultaneously
where each write affects another zone and where the number of writes
matches the queue depth.

Thanks,

Bart.
Damien Le Moal Nov. 25, 2024, 4 a.m. UTC | #5
On 11/22/24 2:51 AM, Bart Van Assche wrote:
> [...]
> 
> Hi Damien,
> 
> The call trace mentioned above comes from the kernel log and was
> encountered while I was testing this patch series. A reproducer has
> already been shared; see also
> https://lore.kernel.org/linux-block/e840b66a-79f0-4169-9ab1-c475d9608e4d@acm.org/.
> The lockup happened after the queue was filled up
> with requests and hence sd_zbc_report_zones() failed to allocate an
> additional request for the zone report.
> 
> I'm wondering whether this lockup can also happen with the current
> upstream kernel by submitting multiple unaligned writes simultaneously
> where each write affects another zone and where the number of writes
> matches the queue depth.

Let me check. This indeed may be a possibility.

Damien Le Moal Nov. 25, 2024, 4:19 a.m. UTC | #6
On 11/25/24 1:00 PM, Damien Le Moal wrote:
> On 11/22/24 2:51 AM, Bart Van Assche wrote:
>> [...]
>>
>> Hi Damien,
>>
>> The call trace mentioned above comes from the kernel log and was
>> encountered while I was testing this patch series. A reproducer has
>> already been shared; see also
>> https://lore.kernel.org/linux-block/e840b66a-79f0-4169-9ab1-c475d9608e4d@acm.org/.
>> The lockup happened after the queue was filled up
>> with requests and hence sd_zbc_report_zones() failed to allocate an
>> additional request for the zone report.
>>
>> I'm wondering whether this lockup can also happen with the current
>> upstream kernel by submitting multiple unaligned writes simultaneously
>> where each write affects another zone and where the number of writes
>> matches the queue depth.
> 
> Let me check. This indeed may be a possibility.

Thinking more about this, the answer is no, this cannot happen. The reason is
that zone write plugs are blocking BIOs, not requests. So the number of BIOs
queued waiting for execution does not matter, as we are not holding tags. A
report zones command issued to recover from a write error in a zone may have
to wait for a tag for some time before executing when many zones are being
read/written at the same time. But as long as requests complete and tags are
freed, error recovery report zones will eventually proceed and execute. No
deadlock. Even if the drive stops responding, we will eventually get a timeout
and a drive reset, which will abort all commands, so even then we will have
forward progress with error recovery.

This is for the current upstream code.

Now, in your case, with several write requests issued per zone and those write
requests potentially waiting in the requeue or dispatch lists, a report zones
command issued for error recovery may indeed block forever... So with your
changes, a solution for this is needed, I think.
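
One plausible reading of the reported call trace under this series' changes (a
sketch of the dependency cycle, not a verified analysis):

	- unaligned zoned writes fail and error handling work is scheduled
	- disk_zone_wplug_handle_error() calls report_zones(), which needs a
	  new request: blk_mq_alloc_request() blocks in blk_queue_enter()
	- the failed/requeued zoned writes are only re-dispatched once error
	  handling finishes, yet they are what keeps the queue saturated
	=> error handling waits on the queue, the queue waits on error handling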

Patch

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index b570b773e65f..284820b29285 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -56,6 +56,8 @@ static const char *const zone_cond_name[] = {
  * @zone_no: The number of the zone the plug is managing.
  * @wp_offset: The zone write pointer location relative to the start of the zone
  *             as a number of 512B sectors.
+ * @wp_offset_compl: End offset for completed zoned writes as a number of 512
+ *		     byte sectors.
  * @bio_list: The list of BIOs that are currently plugged.
  * @bio_work: Work struct to handle issuing of plugged BIOs
  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -69,6 +71,7 @@ struct blk_zone_wplug {
 	unsigned int		flags;
 	unsigned int		zone_no;
 	unsigned int		wp_offset;
+	unsigned int		wp_offset_compl;
 	struct bio_list		bio_list;
 	struct work_struct	bio_work;
 	struct rcu_head		rcu_head;
@@ -531,6 +534,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
 	zwplug->flags = 0;
 	zwplug->zone_no = zno;
 	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+	zwplug->wp_offset_compl = 0;
 	bio_list_init(&zwplug->bio_list);
 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
 	zwplug->disk = disk;
@@ -1226,6 +1230,7 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
 	struct gendisk *disk = bio->bi_bdev->bd_disk;
 	struct blk_zone_wplug *zwplug =
 		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
+	unsigned int end_sector;
 	unsigned long flags;
 
 	if (WARN_ON_ONCE(!zwplug))
@@ -1243,11 +1248,24 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
 	}
 
-	/*
-	 * If the BIO failed, mark the plug as having an error to trigger
-	 * recovery.
-	 */
-	if (bio->bi_status != BLK_STS_OK) {
+	if (bio->bi_status == BLK_STS_OK) {
+		switch (bio_op(bio)) {
+		case REQ_OP_WRITE:
+		case REQ_OP_ZONE_APPEND:
+		case REQ_OP_WRITE_ZEROES:
+			end_sector = bdev_offset_from_zone_start(disk->part0,
+				     bio->bi_iter.bi_sector + bio_sectors(bio));
+			if (end_sector > zwplug->wp_offset_compl)
+				zwplug->wp_offset_compl = end_sector;
+			break;
+		default:
+			break;
+		}
+	} else {
+		/*
+		 * If the BIO failed, mark the plug as having an error to
+		 * trigger recovery.
+		 */
 		spin_lock_irqsave(&zwplug->lock, flags);
 		disk_zone_wplug_set_error(disk, zwplug);
 		spin_unlock_irqrestore(&zwplug->lock, flags);
@@ -1388,30 +1406,10 @@ static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
 	}
 }
 
-static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
-					 unsigned int idx, void *data)
-{
-	struct blk_zone *zonep = data;
-
-	*zonep = *zone;
-	return 0;
-}
-
 static void disk_zone_wplug_handle_error(struct gendisk *disk,
 					 struct blk_zone_wplug *zwplug)
 {
-	sector_t zone_start_sector =
-		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
-	unsigned int noio_flag;
-	struct blk_zone zone;
 	unsigned long flags;
-	int ret;
-
-	/* Get the current zone information from the device. */
-	noio_flag = memalloc_noio_save();
-	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
-				       blk_zone_wplug_report_zone_cb, &zone);
-	memalloc_noio_restore(noio_flag);
 
 	spin_lock_irqsave(&zwplug->lock, flags);
 
@@ -1425,19 +1423,8 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
 
 	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
 
-	if (ret != 1) {
-		/*
-		 * We failed to get the zone information, meaning that something
-		 * is likely really wrong with the device. Abort all remaining
-		 * plugged BIOs as otherwise we could endup waiting forever on
-		 * plugged BIOs to complete if there is a queue freeze on-going.
-		 */
-		disk_zone_wplug_abort(zwplug);
-		goto unplug;
-	}
-
 	/* Update the zone write pointer offset. */
-	zwplug->wp_offset = blk_zone_wp_offset(&zone);
+	zwplug->wp_offset = zwplug->wp_offset_compl;
 	disk_zone_wplug_abort_unaligned(disk, zwplug);
 
 	/* Restart BIO submission if we still have any BIO left. */
@@ -1446,7 +1433,6 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
 		goto unlock;
 	}
 
-unplug:
 	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
 	if (disk_should_remove_zone_wplug(disk, zwplug))
 		disk_remove_zone_wplug(disk, zwplug);
@@ -1978,7 +1964,7 @@ EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 				  struct seq_file *m)
 {
-	unsigned int zwp_wp_offset, zwp_flags;
+	unsigned int zwp_wp_offset, zwp_wp_offset_compl, zwp_flags;
 	unsigned int zwp_zone_no, zwp_ref;
 	unsigned int zwp_bio_list_size;
 	unsigned long flags;
@@ -1988,14 +1974,15 @@ static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
 	zwp_flags = zwplug->flags;
 	zwp_ref = refcount_read(&zwplug->ref);
 	zwp_wp_offset = zwplug->wp_offset;
+	zwp_wp_offset_compl = zwplug->wp_offset_compl;
 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
 	spin_unlock_irqrestore(&zwplug->lock, flags);
 
 	bool all_zwr_inserted = blk_zone_all_zwr_inserted(zwplug);
 
-	seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u bio_list_size %u all_zwr_inserted %d\n",
+	seq_printf(m, "zone_no %u flags 0x%x ref %u wp_offset %u wp_offset_compl %u bio_list_size %u all_zwr_inserted %d\n",
 		   zwp_zone_no, zwp_flags, zwp_ref, zwp_wp_offset,
-		   zwp_bio_list_size, all_zwr_inserted);
+		   zwp_wp_offset_compl, zwp_bio_list_size, all_zwr_inserted);
 }
 
 int queue_zone_wplugs_show(void *data, struct seq_file *m)
diff --git a/block/blk.h b/block/blk.h
index be945db6298d..88a6e258eafe 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -470,8 +470,10 @@ static inline void blk_zone_update_request_bio(struct request *rq,
 	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
 	 * lookup the zone write plug.
 	 */
-	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
+	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) {
 		bio->bi_iter.bi_sector = rq->__sector;
+		bio->bi_iter.bi_size = rq->__data_len;
+	}
 }
 
 void blk_zone_write_plug_requeue_request(struct request *rq);
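
A note on the blk.h hunk: by the time a zoned write BIO completes, its
iterator has normally been advanced to zero size, so
blk_zone_write_plug_bio_endio() could not compute the completed range from it.
Restoring bi_size alongside bi_sector lets the completion path recompute the
write's end offset, as the blk-zoned.c hunk above does:

	end_sector = bdev_offset_from_zone_start(disk->part0,
		     bio->bi_iter.bi_sector + bio_sectors(bio));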