
[blktests] zbd/012: Test requeuing of zoned writes and queue freezing

Message ID 20241125211048.1694246-1-bvanassche@acm.org (mailing list archive)
State New

Commit Message

Bart Van Assche Nov. 25, 2024, 9:10 p.m. UTC
Test concurrent requeuing of zoned writes and request queue freezing. While
the test passes with kernel 6.9, it triggers a hang with kernels 6.10 and
later. This shows that the hang is a regression introduced by the zone
write plugging code.

sysrq: Show Blocked State
task:(udev-worker)   state:D stack:0     pid:75392 tgid:75392 ppid:2178   flags:0x00000006
Call Trace:
 <TASK>
 __schedule+0x3e8/0x1410
 schedule+0x27/0xf0
 blk_mq_freeze_queue_wait+0x6f/0xa0
 queue_attr_store+0x60/0xc0
 kernfs_fop_write_iter+0x13e/0x1f0
 vfs_write+0x25b/0x420
 ksys_write+0x65/0xe0
 do_syscall_64+0x82/0x160
 entry_SYSCALL_64_after_hwframe+0x76/0x7e

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 tests/zbd/012     | 70 +++++++++++++++++++++++++++++++++++++++++++++++
 tests/zbd/012.out |  2 ++
 2 files changed, 72 insertions(+)
 create mode 100644 tests/zbd/012
 create mode 100644 tests/zbd/012.out

Comments

Damien Le Moal Nov. 26, 2024, 8:34 a.m. UTC | #1
On 11/26/24 06:10, Bart Van Assche wrote:
> Test concurrent requeuing of zoned writes and request queue freezing. While
> test test passes with kernel 6.9, it triggers a hang with kernels 6.10 and
> later. This shows that this hang is a regression introduced by the zone
> write plugging code.

Instead of this sentence, it would be nicer to describe the hang...
I could recreate it: I end up with fio blocked on queue enter trying to issue a
read, the sysfs attribute write blocked on the queue freeze, and one write
plugged and waiting, but no thread trying to do anything about it. That is weird
given that the workload is QD=1: the write should be in the submission path and
not plugged at all. So something odd is going on.
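
For reference, the state described above can be inspected with a sketch along
these lines; the device name and the availability of the blk-mq debugfs
attributes are assumptions, not something stated in this thread:

# Dump blocked tasks (the "Show Blocked State" output quoted in the patch):
echo w > /proc/sysrq-trigger
dmesg | tail -n 100
# Zone write plug and queue state, if CONFIG_BLK_DEBUG_FS is enabled and the
# kernel exposes the zone_wplugs attribute (dev is the scsi_debug zoned disk):
dev=sdb
cat /sys/kernel/debug/block/$dev/zone_wplugs
cat /sys/kernel/debug/block/$dev/state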

Of note, is that by simply creating the scsi_debug device in a vm manually, I
get this lockdep splat:

[   51.934109] ======================================================
[   51.935916] WARNING: possible circular locking dependency detected
[   51.937561] 6.12.0+ #2107 Not tainted
[   51.938648] ------------------------------------------------------
[   51.940351] kworker/u16:4/157 is trying to acquire lock:
[   51.941805] ffff9fff0aa0bea8 (&q->limits_lock){+.+.}-{4:4}, at:
disk_update_zone_resources+0x86/0x170
[   51.944314]
[   51.944314] but task is already holding lock:
[   51.945688] ffff9fff0aa0b890 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at:
blk_revalidate_disk_zones+0x15f/0x340
[   51.948527]
[   51.948527] which lock already depends on the new lock.
[   51.948527]
[   51.951296]
[   51.951296] the existing dependency chain (in reverse order) is:
[   51.953708]
[   51.953708] -> #1 (&q->q_usage_counter(queue)#3){++++}-{0:0}:
[   51.956131]        blk_queue_enter+0x1c9/0x1e0
[   51.957290]        blk_mq_alloc_request+0x187/0x2a0
[   51.958365]        scsi_execute_cmd+0x78/0x490 [scsi_mod]
[   51.959514]        read_capacity_16+0x111/0x410 [sd_mod]
[   51.960693]        sd_revalidate_disk.isra.0+0x872/0x3240 [sd_mod]
[   51.962004]        sd_probe+0x2d7/0x520 [sd_mod]
[   51.962993]        really_probe+0xd5/0x330
[   51.963898]        __driver_probe_device+0x78/0x110
[   51.964925]        driver_probe_device+0x1f/0xa0
[   51.965916]        __driver_attach_async_helper+0x60/0xe0
[   51.967017]        async_run_entry_fn+0x2e/0x140
[   51.968004]        process_one_work+0x21f/0x5a0
[   51.968987]        worker_thread+0x1dc/0x3c0
[   51.969868]        kthread+0xe0/0x110
[   51.970377]        ret_from_fork+0x31/0x50
[   51.970983]        ret_from_fork_asm+0x11/0x20
[   51.971587]
[   51.971587] -> #0 (&q->limits_lock){+.+.}-{4:4}:
[   51.972479]        __lock_acquire+0x1337/0x2130
[   51.973133]        lock_acquire+0xc5/0x2d0
[   51.973691]        __mutex_lock+0xda/0xcf0
[   51.974300]        disk_update_zone_resources+0x86/0x170
[   51.975032]        blk_revalidate_disk_zones+0x16c/0x340
[   51.975740]        sd_zbc_revalidate_zones+0x73/0x160 [sd_mod]
[   51.976524]        sd_revalidate_disk.isra.0+0x465/0x3240 [sd_mod]
[   51.977824]        sd_probe+0x2d7/0x520 [sd_mod]
[   51.978917]        really_probe+0xd5/0x330
[   51.979915]        __driver_probe_device+0x78/0x110
[   51.981047]        driver_probe_device+0x1f/0xa0
[   51.982143]        __driver_attach_async_helper+0x60/0xe0
[   51.983282]        async_run_entry_fn+0x2e/0x140
[   51.984319]        process_one_work+0x21f/0x5a0
[   51.985873]        worker_thread+0x1dc/0x3c0
[   51.987289]        kthread+0xe0/0x110
[   51.988546]        ret_from_fork+0x31/0x50
[   51.989926]        ret_from_fork_asm+0x11/0x20
[   51.991376]
[   51.991376] other info that might help us debug this:
[   51.991376]
[   51.994127]  Possible unsafe locking scenario:
[   51.994127]
[   51.995651]        CPU0                    CPU1
[   51.996694]        ----                    ----
[   51.997716]   lock(&q->q_usage_counter(queue)#3);
[   51.998817]                                lock(&q->limits_lock);
[   52.000043]                                lock(&q->q_usage_counter(queue)#3);
[   52.001638]   lock(&q->limits_lock);
[   52.002485]
[   52.002485]  *** DEADLOCK ***
[   52.002485]
[   52.003978] 5 locks held by kworker/u16:4/157:
[   52.004921]  #0: ffff9fff002a8d48 ((wq_completion)async){+.+.}-{0:0}, at:
process_one_work+0x4c4/0x5a0
[   52.006864]  #1: ffffb71780e5fe58
((work_completion)(&entry->work)){+.+.}-{0:0}, at: process_one_work+0x1df/0x5a0
[   52.008202]  #2: ffff9fff2d57f378 (&dev->mutex){....}-{4:4}, at:
__driver_attach_async_helper+0x3f/0xe0
[   52.009459]  #3: ffff9fff0aa0b858 (&q->q_usage_counter(io)#3){+.+.}-{0:0},
at: blk_revalidate_disk_zones+0x15f/0x340
[   52.010903]  #4: ffff9fff0aa0b890 (&q->q_usage_counter(queue)#3){++++}-{0:0},
at: blk_revalidate_disk_zones+0x15f/0x340
[   52.013437]
[   52.013437] stack backtrace:
[   52.014691] CPU: 2 UID: 0 PID: 157 Comm: kworker/u16:4 Not tainted 6.12.0+ #2107
[   52.016195] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
1.16.3-3.fc41 04/01/2014
[   52.017376] Workqueue: async async_run_entry_fn
[   52.018050] Call Trace:
[   52.018442]  <TASK>
[   52.018810]  dump_stack_lvl+0x5d/0x80
[   52.019353]  print_circular_bug.cold+0x178/0x1be
[   52.020316]  check_noncircular+0x144/0x160
[   52.021616]  __lock_acquire+0x1337/0x2130
[   52.022350]  lock_acquire+0xc5/0x2d0
[   52.022877]  ? disk_update_zone_resources+0x86/0x170
[   52.023543]  ? lock_is_held_type+0x85/0xe0
[   52.024170]  __mutex_lock+0xda/0xcf0
[   52.024684]  ? disk_update_zone_resources+0x86/0x170
[   52.025411]  ? disk_update_zone_resources+0x86/0x170
[   52.026131]  ? mark_held_locks+0x40/0x70
[   52.026681]  ? disk_update_zone_resources+0x86/0x170
[   52.027390]  disk_update_zone_resources+0x86/0x170
[   52.028074]  ? mark_held_locks+0x40/0x70
[   52.028629]  ? lockdep_hardirqs_on_prepare+0xd3/0x170
[   52.029344]  ? _raw_spin_unlock_irqrestore+0x40/0x50
[   52.030321]  ? percpu_ref_is_zero+0x3a/0x50
[   52.031565]  ? blk_mq_freeze_queue_wait+0x47/0xe0
[   52.032897]  blk_revalidate_disk_zones+0x16c/0x340
[   52.033610]  sd_zbc_revalidate_zones+0x73/0x160 [sd_mod]
[   52.034377]  sd_revalidate_disk.isra.0+0x465/0x3240 [sd_mod]
[   52.035171]  ? save_trace+0x4e/0x360
[   52.035700]  sd_probe+0x2d7/0x520 [sd_mod]
[   52.036314]  really_probe+0xd5/0x330
[   52.036829]  ? pm_runtime_barrier+0x54/0x90
[   52.037421]  __driver_probe_device+0x78/0x110
[   52.038061]  driver_probe_device+0x1f/0xa0
[   52.038631]  __driver_attach_async_helper+0x60/0xe0
[   52.039329]  async_run_entry_fn+0x2e/0x140
[   52.039908]  process_one_work+0x21f/0x5a0
[   52.040471]  worker_thread+0x1dc/0x3c0
[   52.041073]  ? rescuer_thread+0x480/0x480
[   52.041639]  kthread+0xe0/0x110
[   52.042420]  ? kthread_insert_work_sanity_check+0x60/0x60
[   52.043790]  ret_from_fork+0x31/0x50
[   52.044700]  ? kthread_insert_work_sanity_check+0x60/0x60
[   52.046055]  ret_from_fork_asm+0x11/0x20
[   52.047023]  </TASK>

Checking the code, it is a false positive though because sd_revalidate_disk()
calls queue_limits_commit_update() which releases the limits mutex lock before
calling blk_revalidate_disk_zones().
Will see what I can do about it.

See additional comments on this patch below.

> 
> sysrq: Show Blocked State
> task:(udev-worker)   state:D stack:0     pid:75392 tgid:75392 ppid:2178   flags:0x00000006
> Call Trace:
>  <TASK>
>  __schedule+0x3e8/0x1410
>  schedule+0x27/0xf0
>  blk_mq_freeze_queue_wait+0x6f/0xa0
>  queue_attr_store+0x60/0xc0
>  kernfs_fop_write_iter+0x13e/0x1f0
>  vfs_write+0x25b/0x420
>  ksys_write+0x65/0xe0
>  do_syscall_64+0x82/0x160
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  tests/zbd/012     | 70 +++++++++++++++++++++++++++++++++++++++++++++++
>  tests/zbd/012.out |  2 ++
>  2 files changed, 72 insertions(+)
>  create mode 100644 tests/zbd/012
>  create mode 100644 tests/zbd/012.out
> 
> diff --git a/tests/zbd/012 b/tests/zbd/012
> new file mode 100644
> index 000000000000..0551d01011af
> --- /dev/null
> +++ b/tests/zbd/012
> @@ -0,0 +1,70 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (C) 2024 Google LLC
> +
> +. tests/scsi/rc
> +. common/scsi_debug
> +
> +DESCRIPTION="test requeuing of zoned writes and queue freezing"

There is no requeueing going on here so this description is inaccurate.

> +QUICK=1
> +
> +requires() {
> +	_have_fio_zbd_zonemode
> +}
> +
> +toggle_iosched() {
> +	while true; do
> +		for iosched in none mq-deadline; do
> +			echo "${iosched}" > "/sys/class/block/$(basename "$zdev")/queue/scheduler"
> +			sleep .1
> +		done
> +	done
> +}
> +
> +test() {
> +	echo "Running ${TEST_NAME}"
> +
> +	local qd=1
> +	local scsi_debug_params=(
> +		delay=0
> +		dev_size_mb=1024
> +		every_nth=$((2 * qd))
> +		max_queue="${qd}"
> +		opts=0x8000          # SDEBUG_OPT_HOST_BUSY
> +		sector_size=4096
> +		statistics=1
> +		zbc=host-managed
> +		zone_nr_conv=0
> +		zone_size_mb=4
> +	)
> +	_init_scsi_debug "${scsi_debug_params[@]}" &&
> +	local zdev="/dev/${SCSI_DEBUG_DEVICES[0]}" fail &&
> +	ls -ld "${zdev}" >>"${FULL}" &&
> +	{ toggle_iosched & } &&
> +	toggle_iosched_pid=$! &&
> +	local fio_args=(
> +		--direct=1
> +		--filename="${zdev}"
> +		--iodepth="${qd}"
> +		--ioengine=io_uring
> +		--ioscheduler=none
> +		--name=pipeline-zoned-writes

Nothing is pipelined here since this is a qd=1 run.

> +		--output="${RESULTS_DIR}/fio-output-zbd-092.txt"
> +		--runtime="${TIMEOUT:-30}"
> +		--rw=randwrite
> +		--time_based
> +		--zonemode=zbd
> +	) &&
> +	_run_fio "${fio_args[@]}" >>"${FULL}" 2>&1 ||
> +	fail=true
> +
> +	kill "${toggle_iosched_pid}" 2>&1
> +	_exit_scsi_debug
> +
> +	if [ -z "$fail" ]; then
> +		echo "Test complete"
> +	else
> +		echo "Test failed"
> +		return 1
> +	fi
> +}
> diff --git a/tests/zbd/012.out b/tests/zbd/012.out
> new file mode 100644
> index 000000000000..8ff654950c5f
> --- /dev/null
> +++ b/tests/zbd/012.out
> @@ -0,0 +1,2 @@
> +Running zbd/012
> +Test complete
Damien Le Moal Nov. 26, 2024, 11:26 a.m. UTC | #2
On 11/26/24 06:10, Bart Van Assche wrote:
> Test concurrent requeuing of zoned writes and request queue freezing. While
> test test passes with kernel 6.9, it triggers a hang with kernels 6.10 and
> later. This shows that this hang is a regression introduced by the zone
> write plugging code.
> 
> sysrq: Show Blocked State
> task:(udev-worker)   state:D stack:0     pid:75392 tgid:75392 ppid:2178   flags:0x00000006
> Call Trace:
>  <TASK>
>  __schedule+0x3e8/0x1410
>  schedule+0x27/0xf0
>  blk_mq_freeze_queue_wait+0x6f/0xa0
>  queue_attr_store+0x60/0xc0
>  kernfs_fop_write_iter+0x13e/0x1f0
>  vfs_write+0x25b/0x420
>  ksys_write+0x65/0xe0
>  do_syscall_64+0x82/0x160
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  tests/zbd/012     | 70 +++++++++++++++++++++++++++++++++++++++++++++++
>  tests/zbd/012.out |  2 ++
>  2 files changed, 72 insertions(+)
>  create mode 100644 tests/zbd/012
>  create mode 100644 tests/zbd/012.out
> 
> diff --git a/tests/zbd/012 b/tests/zbd/012
> new file mode 100644
> index 000000000000..0551d01011af
> --- /dev/null
> +++ b/tests/zbd/012
> @@ -0,0 +1,70 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (C) 2024 Google LLC
> +
> +. tests/scsi/rc
> +. common/scsi_debug
> +
> +DESCRIPTION="test requeuing of zoned writes and queue freezing"
> +QUICK=1
> +
> +requires() {
> +	_have_fio_zbd_zonemode
> +}
> +
> +toggle_iosched() {
> +	while true; do
> +		for iosched in none mq-deadline; do
> +			echo "${iosched}" > "/sys/class/block/$(basename "$zdev")/queue/scheduler"
> +			sleep .1
> +		done
> +	done
> +}
> +
> +test() {
> +	echo "Running ${TEST_NAME}"
> +
> +	local qd=1
> +	local scsi_debug_params=(
> +		delay=0
> +		dev_size_mb=1024
> +		every_nth=$((2 * qd))
> +		max_queue="${qd}"
> +		opts=0x8000          # SDEBUG_OPT_HOST_BUSY
> +		sector_size=4096
> +		statistics=1
> +		zbc=host-managed
> +		zone_nr_conv=0
> +		zone_size_mb=4
> +	)
> +	_init_scsi_debug "${scsi_debug_params[@]}" &&
> +	local zdev="/dev/${SCSI_DEBUG_DEVICES[0]}" fail &&
> +	ls -ld "${zdev}" >>"${FULL}" &&
> +	{ toggle_iosched & } &&
> +	toggle_iosched_pid=$! &&
> +	local fio_args=(
> +		--direct=1
> +		--filename="${zdev}"
> +		--iodepth="${qd}"
> +		--ioengine=io_uring

Something very odd is going on here: this fio run is supposed to be qd=1 but
when I get the hang, I see plugged BIOs (at least always 1, and very often more
than 1 in different zones). But with a QD=1 workload, this should *NEVER*
happen. No write command should ever enter a zone write plug (as long as the
write BIO does not get split). So it looks to me like fio, or io_uring
submission context, is sending weird writes...

If I change --ioengine=io_uring to --ioengine=libaio, then the test passes, always.

I am not sure what is going on.

But I do think there is a potential deadlock anyway: if a write command fails,
zone write plug error recovery will be triggered. If that zone write plug has
BIOs plugged, and the queue freeze triggered by changing the scheduler happens
before the report zones command for the zone write plug error recovery is
issued, then we will deadlock on the queue freeze.
So this is a definitive bug, even though this test does not create this
situation. As already discussed, given the need to properly recover errors for
emulated zone append commands, I am not sure how to fix this yet...

If you can also look into why the io_uring IO engine does not respect the iodepth=1
argument, that would be great.

Note that I tried qd > 1 with libaio and everything works just fine: the test
passes. Things are strange only with io_uring.
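
A cross-check along those lines could look like the sketch below; only
--ioengine and --iodepth matter here, the remaining options mirror the test and
the device path is illustrative:

zdev=/dev/sdb     # the scsi_debug zoned disk created by the test
fio --name=zbd-libaio-check --filename="$zdev" --direct=1 \
    --ioengine=libaio --iodepth=4 --rw=randwrite --zonemode=zbd \
    --runtime=30 --time_based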

> +		--ioscheduler=none
> +		--name=pipeline-zoned-writes
> +		--output="${RESULTS_DIR}/fio-output-zbd-092.txt"
> +		--runtime="${TIMEOUT:-30}"
> +		--rw=randwrite
> +		--time_based
> +		--zonemode=zbd
> +	) &&
> +	_run_fio "${fio_args[@]}" >>"${FULL}" 2>&1 ||
> +	fail=true
> +
> +	kill "${toggle_iosched_pid}" 2>&1
> +	_exit_scsi_debug
> +
> +	if [ -z "$fail" ]; then
> +		echo "Test complete"
> +	else
> +		echo "Test failed"
> +		return 1
> +	fi
> +}
> diff --git a/tests/zbd/012.out b/tests/zbd/012.out
> new file mode 100644
> index 000000000000..8ff654950c5f
> --- /dev/null
> +++ b/tests/zbd/012.out
> @@ -0,0 +1,2 @@
> +Running zbd/012
> +Test complete
Christoph Hellwig Nov. 26, 2024, 12:49 p.m. UTC | #3
On Tue, Nov 26, 2024 at 08:26:13PM +0900, Damien Le Moal wrote:
> Note that I tried qd > 1 with libaio and everything works just fine: the test
> passes. Things are strange only with io_uring.

It might be worth checking if it has something to do with REQ_NOWAIT
writes and error handling for their EAGAIN case.
Bart Van Assche Nov. 26, 2024, 1:44 p.m. UTC | #4
On 11/26/24 12:34 AM, Damien Le Moal wrote:
> Of note, is that by simply creating the scsi_debug device in a vm manually, I
> get this lockdep splat:
> 
> [   51.934109] ======================================================
> [   51.935916] WARNING: possible circular locking dependency detected
> [   51.937561] 6.12.0+ #2107 Not tainted
> [   51.938648] ------------------------------------------------------
> [   51.940351] kworker/u16:4/157 is trying to acquire lock:
> [   51.941805] ffff9fff0aa0bea8 (&q->limits_lock){+.+.}-{4:4}, at:
> disk_update_zone_resources+0x86/0x170
> [   51.944314]
> [   51.944314] but task is already holding lock:
> [   51.945688] ffff9fff0aa0b890 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at:
> blk_revalidate_disk_zones+0x15f/0x340
> [   51.948527]
> [   51.948527] which lock already depends on the new lock.
> [   51.948527]
> [   51.951296]
> [   51.951296] the existing dependency chain (in reverse order) is:
> [   51.953708]
> [   51.953708] -> #1 (&q->q_usage_counter(queue)#3){++++}-{0:0}:
> [   51.956131]        blk_queue_enter+0x1c9/0x1e0

I have disabled the blk_queue_enter() lockdep annotations locally
because these annotations cause too many false positive reports.

>> +DESCRIPTION="test requeuing of zoned writes and queue freezing"
> 
> There is no requeueing going on here so this description is inaccurate.

The combination of the scsi_debug options every_nth=$((2 * qd)) and 
opts=0x8000 should cause requeuing, shouldn't it?
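
One way to confirm that outside of blktests is a sketch like the one below,
which loads scsi_debug with the same parameters and watches the
block:block_rq_requeue tracepoint; lsscsi and the dd parameters are only for
illustration:

modprobe scsi_debug delay=0 dev_size_mb=1024 every_nth=2 max_queue=1 \
        opts=0x8000 sector_size=4096 statistics=1 zbc=host-managed \
        zone_nr_conv=0 zone_size_mb=4
dev=$(lsscsi | awk '/scsi_debug/ { print $NF }')    # e.g. /dev/sdb
echo 1 > /sys/kernel/debug/tracing/events/block/block_rq_requeue/enable
dd if=/dev/zero of="$dev" bs=4096 count=64 oflag=direct
grep block_rq_requeue /sys/kernel/debug/tracing/trace
modprobe -r scsi_debug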

>> +		--name=pipeline-zoned-writes
> 
> Nothing is pipelined here since this is a qd=1 run.

Agreed. I will change the fio job name.

Thanks,

Bart.
Damien Le Moal Nov. 27, 2024, 2:28 a.m. UTC | #5
On 11/26/24 9:49 PM, Christoph Hellwig wrote:
> On Tue, Nov 26, 2024 at 08:26:13PM +0900, Damien Le Moal wrote:
>> Note that I tried qd > 1 with libaio and everything works just fine: the test
>> passes. Things are strange only with io_uring.
> 
> It might be worth checking if it has something to do with REQ_NOWAIT
> writes and error handling for their EAGAIN case.

Yes, the issue is related to this. If I clear REQ_NOWAIT from all BIOs that go
through the write plug, the problem disappears. Now trying to figure out the best
way to address this, as I do not think that always clearing REQ_NOWAIT is the
best solution (plugging a BIO in a zone write plug does not block the caller...).
Damien Le Moal Nov. 27, 2024, 5:18 a.m. UTC | #6
On 11/26/24 10:44 PM, Bart Van Assche wrote:
> On 11/26/24 12:34 AM, Damien Le Moal wrote:
>> Of note, is that by simply creating the scsi_debug device in a vm manually, I
>> get this lockdep splat:
>>
>> [   51.934109] ======================================================
>> [   51.935916] WARNING: possible circular locking dependency detected
>> [   51.937561] 6.12.0+ #2107 Not tainted
>> [   51.938648] ------------------------------------------------------
>> [   51.940351] kworker/u16:4/157 is trying to acquire lock:
>> [   51.941805] ffff9fff0aa0bea8 (&q->limits_lock){+.+.}-{4:4}, at:
>> disk_update_zone_resources+0x86/0x170
>> [   51.944314]
>> [   51.944314] but task is already holding lock:
>> [   51.945688] ffff9fff0aa0b890 (&q->q_usage_counter(queue)#3){++++}-{0:0}, at:
>> blk_revalidate_disk_zones+0x15f/0x340
>> [   51.948527]
>> [   51.948527] which lock already depends on the new lock.
>> [   51.948527]
>> [   51.951296]
>> [   51.951296] the existing dependency chain (in reverse order) is:
>> [   51.953708]
>> [   51.953708] -> #1 (&q->q_usage_counter(queue)#3){++++}-{0:0}:
>> [   51.956131]        blk_queue_enter+0x1c9/0x1e0
> 
> I have disabled the blk_queue_enter() lockdep annotations locally
> because these annotations cause too many false positive reports.
> 
>>> +DESCRIPTION="test requeuing of zoned writes and queue freezing"
>>
>> There is no requeueing going on here so this description is inaccurate.
> 
> The combination of the scsi_debug options every_nth=$((2 * qd)) and opts=0x8000
> should cause requeuing, shouldn't it?

Indeed. I did not know these options..

>>> +        --name=pipeline-zoned-writes
>>
>> Nothing is pipelined here since this is a qd=1 run.
> 
> Agreed. I will change the fio job name.

After some debugging, I understood the issue. It is not a problem with the
queue usage counter but rather an issue with REQ_NOWAIT BIOs that may be failed
with bio_wouldblock_error() *after* having been processed by
blk_zone_plug_bio(). E.g. blk_mq_get_new_requests() may fail to get a request
due to REQ_NOWAIT being set and fail the BIO using bio_wouldblock_error(). This
in turn will lead to a call to disk_zone_wplug_set_error() which will mark the
zone write plug with the ERROR flag. However, since the BIO failure is not from
a failed request, we are not calling disk_zone_wplug_unplug_bio(), which if
called would run the error recovery for the zone. That is, the zone write plug
of the BLK_STS_AGAIN failed BIO is left "busy", which results in the BIO being
added to the plug BIO list when it is resubmitted. But since we do not have any
write BIO ongoing for this zone, the BIO ends up stuck in the zone write plug
list holding a queue reference count, which causes the freeze to never terminate.

One "simple" fix is something like this:

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 263e28b72053..d6334ed5b9be 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1158,10 +1158,18 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
         * If the BIO failed, mark the plug as having an error to trigger
         * recovery.
         */
-       if (bio->bi_status != BLK_STS_OK) {
+       switch (bio->bi_status) {
+       case BLK_STS_OK:
+               break;
+       case BLK_STS_AGAIN:
+               if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
+                       disk_zone_wplug_unplug_bio(disk, zwplug);
+               break;
+       default:
                spin_lock_irqsave(&zwplug->lock, flags);
                disk_zone_wplug_set_error(disk, zwplug);
                spin_unlock_irqrestore(&zwplug->lock, flags);
+               break;
        }

        /* Drop the reference we took when the BIO was issued. */

BUT! I really do not like this fix because it causes a failure when the BIO is
retried, because the zone write pointer was advanced for the BIO the first time
it was submitted (and failed with EAGAIN). This results in going through the
error recovery procedure for a zone every time a BIO that failed with EAGAIN is
resubmitted. Not nice.

On the other hand, if we take a "big hammer" approach and simply ignore the
REQ_NOWAIT flag, things work fine. This patch does that:

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 263e28b72053..492bcbb04a74 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -491,8 +491,7 @@ static void blk_zone_wplug_bio_work(struct work_struct *work);
  * Return a pointer to the zone write plug with the plug spinlock held.
  */
 static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
-                                       sector_t sector, gfp_t gfp_mask,
-                                       unsigned long *flags)
+                                       sector_t sector, unsigned long *flags)
 {
        unsigned int zno = disk_zone_no(disk, sector);
        struct blk_zone_wplug *zwplug;
@@ -520,7 +519,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
         * so that it is not freed when the zone write plug becomes idle without
         * the zone being full.
         */
-       zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+       zwplug = mempool_alloc(disk->zone_wplugs_pool, GFP_NOIO);
        if (!zwplug)
                return NULL;

@@ -939,7 +938,6 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        sector_t sector = bio->bi_iter.bi_sector;
        struct blk_zone_wplug *zwplug;
-       gfp_t gfp_mask = GFP_NOIO;
        unsigned long flags;

        /*
@@ -965,10 +963,20 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
                return false;
        }

-       if (bio->bi_opf & REQ_NOWAIT)
-               gfp_mask = GFP_NOWAIT;
+       /*
+        * A no-wait BIO may be failed after blk_zone_plug_bio() returns,
+        * e.g. blk_mq_get_new_requests() may fail the BIO with
+        * bio_wouldblock_error() in blk_mq_submit_bio(), after
+        * blk_zone_plug_bio() was called. We cannot easily handle such failure
+        * as we may be handling emulated zone append operations that rely on
+        * the write pointer tracking done in blk_zone_wplug_prepare_bio().
+        * Failing the BIO with bio_wouldblock_error() will break this tracking
+        * and result in failure for the following BIOs submitted.
+        * Avoid such issue by ignoring the REQ_NOWAIT flag.
+        */
+       bio->bi_opf &= ~REQ_NOWAIT;

-       zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+       zwplug = disk_get_and_lock_zone_wplug(disk, sector, &flags);
        if (!zwplug) {
                bio_io_error(bio);
                return true;
@@ -1673,7 +1681,7 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
        if (!wp_offset || wp_offset >= zone->capacity)
                return 0;

-       zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
+       zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, &flags);
        if (!zwplug)
                return -ENOMEM;
        spin_unlock_irqrestore(&zwplug->lock, flags);

I really prefer this second solution as it is more efficient and safer.

On another note: changing the iodepth from 1 to some higher value results in the
test failing because of unaligned write pointer errors. I think that io_uring
is unsafe for writes to zoned block devices at high queue depth because of the
asynchronous request submission that can happen (as that potentially reorders
SQEs that are otherwise correctly queued and sequential for a zone).
I would like to fix that as well, but that is another issue.
Christoph Hellwig Nov. 27, 2024, 6:16 a.m. UTC | #7
On Wed, Nov 27, 2024 at 02:18:34PM +0900, Damien Le Moal wrote:
> After some debugging, I understood the issue. It is not a problem with the
> queue usage counter but rather an issue with REQ_NOWAIT BIOs that may be failed
> with bio_wouldblock_error() *after* having been processed by
> blk_zone_plug_bio(). E.g. blk_mq_get_new_requests() may fail to get a request
> due to REQ_NOWAIT being set and fail the BIO using bio_wouldblock_error(). This
> in turn will lead to a call to disk_zone_wplug_set_error() which will mark the
> zone write plug with the ERROR flag. However, since the BIO failure is not from
> a failed request, we are not calling disk_zone_wplug_unplug_bio(), which if
> called would run the error recovery for the zone. That is, the zone write plug
> of the BLK_STS_AGAIN failed BIO is left "busy", which results in the BIO being
> added to the plug BIO list when it is resubmitted. But since we do not have any
> write BIO ongoing for this zone, the BIO ends up stuck in the zone write plug
> list holding a queue reference count, which causes the freeze to never terminate.

Did you trace where the bio_wouldblock_error is coming from?  Probably
a failing request allocation?  Can we call the guts of blk_zone_plug_bio
after allocating the request to avoid this?
Christoph Hellwig Nov. 27, 2024, 6:21 a.m. UTC | #8
On Tue, Nov 26, 2024 at 10:16:18PM -0800, Christoph Hellwig wrote:
> Did you trace where the bio_wouldblock_error is coming from?  Probably
> a failing request allocation?  Can we call the guts of blk_zone_plug_bio
> after allocating the request to avoid this?

The easier option might be to simply "unprepare" the bio
(i.e. undo the append op rewrite and sector adjustment), decrement
wp_offset and return.  Given that no one else could issue I/O
while we were trying to allocate the bio this should work just fine.
Damien Le Moal Nov. 27, 2024, 6:32 a.m. UTC | #9
On 11/27/24 3:21 PM, Christoph Hellwig wrote:
> On Tue, Nov 26, 2024 at 10:16:18PM -0800, Christoph Hellwig wrote:
>> Did you trace where the bio_wouldblock_error is coming from?  Probably
>> a failing request allocation?  Can we call the guts of blk_zone_plug_bio
>> after allocating the request to avoid this?
> 
> The easier option might be to simply "unprepare" the bio
> (i.e. undo the append op rewrite and sector adjustment), decrement
> wp_offset and return.  Given that no one else could issue I/O
> while we were trying to allocate the bio this should work just fine.

Might be good enough. But will also need to clear the REQ_NOWAIT flag for the
BIO if it is issued from the zone write plug BIO work. Because that one should
not have to deal with all this.

Let me try.
Christoph Hellwig Nov. 27, 2024, 6:33 a.m. UTC | #10
On Wed, Nov 27, 2024 at 03:32:04PM +0900, Damien Le Moal wrote:
> Might be good enough. But will also need to clear the REQ_NOWAIT flag for the
> BIO if it is issued from the zone write plug BIO work. Because that one should
> not have to deal with all this.

Yes, by that time there is no point in the nowait.

Another simpler option would be to always use the zone write plug BIO
work to issue NOWAIT bios.
Damien Le Moal Nov. 27, 2024, 6:43 a.m. UTC | #11
On 11/27/24 3:33 PM, Christoph Hellwig wrote:
> On Wed, Nov 27, 2024 at 03:32:04PM +0900, Damien Le Moal wrote:
>> Might be good enough. But will also need to clear the REQ_NOWAIT flag for the
>> BIO if it is issued from the zone write plug BIO work. Because that one should
>> not have to deal with all this.
> 
> Yes, by that time there is no point in the nowait.
> 
> Another simpler option would be to always use the zone write plug BIO
> work to issue NOWAIT bios.

I thought about that one. The problem with it is the significant performance
penalty that the context switch to the zone write plug BIO work causes. But
that is for qd=1 writes only... If that is acceptable, that solution is
actually the easiest. The overhead is not an issue for HDDs and for ZNS SSDs,
zone append to the rescue ! So maybe for now, it is best to just do that.
Christoph Hellwig Nov. 27, 2024, 6:45 a.m. UTC | #12
On Wed, Nov 27, 2024 at 03:43:11PM +0900, Damien Le Moal wrote:
> I thought about that one. The problem with it is the significant performance
> penalty that the context switch to the zone write plug BIO work causes. But
> that is for qd=1 writes only... If that is acceptable, that solution is
> actually the easiest. The overhead is not an issue for HDDs and for ZNS SSDs,
> zone append to the rescue ! So maybe for now, it is best to just do that.

The NOWAIT flag doesn't really make much sense for synchronous QD=1
writes, the rationale for it is to not block the submission thread when
doing batch submissions with asynchronous completions.

So I think this should be fine (famous last words..)
Damien Le Moal Nov. 27, 2024, 7:02 a.m. UTC | #13
On 11/27/24 3:45 PM, Christoph Hellwig wrote:
> On Wed, Nov 27, 2024 at 03:43:11PM +0900, Damien Le Moal wrote:
>> I thought about that one. The problem with it is the significant performance
>> penalty that the context switch to the zone write plug BIO work causes. But
>> that is for qd=1 writes only... If that is acceptable, that solution is
>> actually the easiest. The overhead is not an issue for HDDs and for ZNS SSDs,
>> zone append to the rescue ! So maybe for now, it is best to just do that.
> 
> The NOWAIT flag doesn't really make much sense for synchronous QD=1
> writes, the rationale for it is to not block the submission thread when
> doing batch submissions with asynchronous completions.
> 
> So I think this should be fine (famous last words..)

Got something simple and working.
Will run it through our test suite to make sure it does not regress anything.

The change is:

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 263e28b72053..648b8d2534a4 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -746,9 +746,25 @@ static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
        return false;
 }
 
-static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
-                                         struct bio *bio, unsigned int nr_segs)
+static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
+                                             struct blk_zone_wplug *zwplug)
 {
+       /*
+        * Take a reference on the zone write plug and schedule the submission
+        * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
+        * reference we take here.
+        */
+       WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+       refcount_inc(&zwplug->ref);
+       queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
+}
+
+static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
+                               struct blk_zone_wplug *zwplug,
+                               struct bio *bio, unsigned int nr_segs)
+{
+       bool schedule_bio_work = false;
+
        /*
         * Grab an extra reference on the BIO request queue usage counter.
         * This reference will be reused to submit a request for the BIO for
@@ -764,6 +780,16 @@ static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
         */
        bio_clear_polled(bio);
 
+       /*
+        * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
+        * work, which can block. So clear the REQ_NOWAIT flag and schedule the
+        * work if this is the first BIO we are plugging.
+        */
+       if (bio->bi_opf & REQ_NOWAIT) {
+               schedule_bio_work = bio_list_empty(&zwplug->bio_list);
+               bio->bi_opf &= ~REQ_NOWAIT;
+       }
+
        /*
         * Reuse the poll cookie field to store the number of segments when
         * split to the hardware limits.
@@ -777,6 +803,9 @@ static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
         * at the tail of the list to preserve the sequential write order.
         */
        bio_list_add(&zwplug->bio_list, bio);
+
+       if (schedule_bio_work)
+               disk_zone_wplug_schedule_bio_work(disk, zwplug);
 }
 
 /*
@@ -970,7 +999,10 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
 
        zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
        if (!zwplug) {
-               bio_io_error(bio);
+               if (bio->bi_opf & REQ_NOWAIT)
+                       bio_wouldblock_error(bio);
+               else
+                       bio_io_error(bio);
                return true;
        }
 
@@ -979,9 +1011,11 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
 
        /*
         * If the zone is already plugged or has a pending error, add the BIO
-        * to the plug BIO list. Otherwise, plug and let the BIO execute.
+        * to the plug BIO list. Do the same for REQ_NOWAIT BIOs to ensure that
+        * we will not see a BLK_STS_AGAIN failure if we let the BIO execute.
+        * Otherwise, plug and let the BIO execute.
         */
-       if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+       if (zwplug->flags & BLK_ZONE_WPLUG_BUSY || (bio->bi_opf & REQ_NOWAIT))
                goto plug;
 
        /*
@@ -999,7 +1033,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
 
 plug:
        zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
-       blk_zone_wplug_add_bio(zwplug, bio, nr_segs);
+       disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
 
        spin_unlock_irqrestore(&zwplug->lock, flags);
 
@@ -1083,19 +1117,6 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
 }
 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
 
-static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
-                                             struct blk_zone_wplug *zwplug)
-{
-       /*
-        * Take a reference on the zone write plug and schedule the submission
-        * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
-        * reference we take here.
-        */
-       WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
-       refcount_inc(&zwplug->ref);
-       queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
-}
-
 static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
                                       struct blk_zone_wplug *zwplug)
 {
Christoph Hellwig Nov. 27, 2024, 7:19 a.m. UTC | #14
On Wed, Nov 27, 2024 at 04:02:42PM +0900, Damien Le Moal wrote:
> Got something simple and working.
> Will run it through our test suite to make sure it does not regress anything.
> 
> The change is:

This looks reasonable.  Do we also need to do something about
zone reset/finish commands with the NOWAIT flag set?  Right now no
caller sets NOWAIT, but maybe we should warn about that and clear
the flag?
Damien Le Moal Nov. 27, 2024, 8:17 a.m. UTC | #15
On 11/27/24 4:19 PM, Christoph Hellwig wrote:
> On Wed, Nov 27, 2024 at 04:02:42PM +0900, Damien Le Moal wrote:
>> Got something simple and working.
>> Will run it through our test suite to make sure it does not regress anything.
>>
>> The change is:
> 
> This looks reasonable.  Do we also need to do something about
> zone reset/finish commands with the NOWAIT flag set?  Right now no
> caller sets NOWAIT, but maybe we should warn about that and clear
> the flag?

Good point. I will add that as well.

After all these fixes, the last remaining problem is the zone write plug error
recovery issuing a report zone which can block if a queue freeze was initiated.
That can prevent forward progress and hang the freeze caller. I do not see any
way to avoid that report zones. I think this could be fixed with a magic
BLK_MQ_REQ_INTERNAL flag passed to blk_mq_alloc_request() and propagated to
blk_queue_enter() to forcefully take a queue usage counter reference even if a
queue freeze was started. That would ensure forward progress (i.e.
scsi_execute_cmd() or the NVMe equivalent would not block forever). Need to
think more about that.
Christoph Hellwig Nov. 27, 2024, 8:58 a.m. UTC | #16
On Wed, Nov 27, 2024 at 05:17:08PM +0900, Damien Le Moal wrote:
> After all these fixes, the last remaining problem is the zone write
> plug error recovery issuing a report zone which can block if a queue 
> freeze was initiated.
>
> That can prevent forward progress and hang the freeze caller. I do not
> see any way to avoid that report zones. I think this could be fixed with
> a magic BLK_MQ_REQ_INTERNAL flag passed to blk_mq_alloc_request() and
> propagated to blk_queue_enter() to forcefully take a queue usage counter
> reference even if a queue freeze was started. That would ensure forward
> progress (i.e. scsi_execute_cmd() or the NVMe equivalent would not block
> forever). Need to think more about that.

You are talking about disk_zone_wplug_handle_error here, right?

We should not issue a report zones to a frozen queue, as that would
bypass the freezing protection.  I suspect the right thing is to
simply defer the error recovery action until after the queue is
unfrozen.

I wonder if the separate error work handler should go away, instead
blk_zone_wplug_bio_work should always check for an error first
and in that case do the report zones.  And blk_zone_wplug_handle_write
would always defer to the work queue if there was an error.
Damien Le Moal Nov. 27, 2024, 11:31 a.m. UTC | #17
On 11/27/24 17:58, Christoph Hellwig wrote:
> On Wed, Nov 27, 2024 at 05:17:08PM +0900, Damien Le Moal wrote:
>> After all these fixes, the last remaining problem is the zone write
>> plug error recovery issuing a report zone which can block if a queue 
>> freeze was initiated.
>>
>> That can prevent forward progress and hang the freeze caller. I do not
>> see any way to avoid that report zones. I think this could be fixed with
>> a magic BLK_MQ_REQ_INTERNAL flag passed to blk_mq_alloc_request() and
>> propagated to blk_queue_enter() to forcefully take a queue usage counter
>> reference even if a queue freeze was started. That would ensure forward
>> progress (i.e. scsi_execute_cmd() or the NVMe equivalent would not block
>> forever). Need to think more about that.
> 
> You are talking about disk_zone_wplug_handle_error here, right?

Yes.

> We should not issue a report zones to a frozen queue, as that would
> bypass the freezing protection.  I suspect the right thing is to
> simply defer the error recovery action until after the queue is
> unfrozen.

But that is the issue: if we defer the report zones, we cannot make progress
with BIOs still plugged in the zone write plug BIO list. These hold a queue
usage reference that the queue freeze wait is waiting for. We have to somehow
allow that report zones to execute to make progress and empty the zone write
plugs of all plugged BIOs.

Note that if we were talking about regular writes only, we would not need to
care about error recovery as we would simply need to abort all these plugged
BIOs (as we know they will fail anyway). But for a correct zone append
emulation, we need to recover the zone write pointer to resume the execution of
the plugged BIOs. Otherwise, the user would see failed zone append commands that
are not supposed to fail unless the drive (or the zone) is dead...

> I wonder if the separate error work handler should go away, instead
> blk_zone_wplug_bio_work should always check for an error first
> and in that case do the report zones.  And blk_zone_wplug_handle_write
> would always defer to the work queue if there was an error.

That would not change a thing. The issue is that if a queue freeze has started,
executing a report zone can block on a request allocation (on the
blk_queue_enter() it implies if there are no cached requests). So the same
problem remains.

Though I agree that the error recovery could be moved to the zone BIO work and
we could get rid of the error recovery work.

But we still need to somehow allow that report zone to execute even if a queue
freeze has started... Hence the idea of the BLK_MQ_REQ_INTERNAL flag to allow
that, for special cases like this one where completing BIOs depends on first
executing another internal command. Or maybe we could try to pre-allocate a
request for such a case, but managing that request to not have it freed to be able
to reuse it until all errors are processed may need many block layer changes...
Christoph Hellwig Nov. 27, 2024, 4:58 p.m. UTC | #18
On Wed, Nov 27, 2024 at 08:31:43PM +0900, Damien Le Moal wrote:
> > We should not issue a report zones to a frozen queue, as that would
> > bypass the freezing protection.  I suspect the right thing is to
> > simply defer the error recovery action until after the queue is
> > unfrozen.
> 
> But that is the issue: if we defer the report zones, we cannot make progress
> with BIOs still plugged in the zone write plug BIO list. These hold a queue
> usage reference that the queue freeze wait is waiting for. We have to somehow
> allow that report zones to execute to make progress and empty the zone write
> plugs of all plugged BIOs.

Or just fail all the bios.

> Note that if we were talking about regular writes only, we would not need to
> care about error recovery as we would simply need to abort all these plugged
> BIOs (as we know they will fail anyway). But for a correct zone append
> emulation, we need to recover the zone write pointer to resume the
> execution of the plugged BIOs. Otherwise, the user would see failed zone
> append commands that are not supposed to fail unless the drive (or the
> zone) is dead...

Maybe I'm thick, but what error could we recover from here?
Bart Van Assche Nov. 27, 2024, 5:10 p.m. UTC | #19
On 11/26/24 10:21 PM, Christoph Hellwig wrote:
> On Tue, Nov 26, 2024 at 10:16:18PM -0800, Christoph Hellwig wrote:
>> Did you trace where the bio_wouldblock_error is coming from?  Probably
>> a failing request allocation?  Can we call the guts of blk_zone_plug_bio
>> after allocating the request to avoid this?
> 
> The easier option might be to simply "unprepare" the bio
> (i.e. undo the append op rewrite and sector adjustment), decrement
> wp_offset and return.  Given that no one else could issue I/O
> while we were trying to allocate the bio this should work just fine.

Yet another possibility is to move the code that updates
zwplug->wp_offset from blk_zone_plug_bio() into 
blk_zone_write_plug_init_request(). With this change the 
zwplug->wp_offset update won't have to be undone in any case since it
happens after all the code in blk_mq_submit_bio() that can potentially
fail.

Thanks,

Bart.
Damien Le Moal Nov. 27, 2024, 11:11 p.m. UTC | #20
On 11/28/24 02:10, Bart Van Assche wrote:
> On 11/26/24 10:21 PM, Christoph Hellwig wrote:
>> On Tue, Nov 26, 2024 at 10:16:18PM -0800, Christoph Hellwig wrote:
>>> Did you trace where the bio_wouldblock_error is coming from?  Probably
>>> a failing request allocation?  Can we call the guts of blk_zone_plug_bio
>>> after allocating the request to avoid this?
>>
>> The easier option might be to simply "unprepare" the bio
>> (i.e. undo the append op rewrite and sector adjustment), decrement
>> wp_offset and return.  Given that no one else could issue I/O
>> while we were trying to allocate the bio this should work just fine.
> 
> Yet another possibility is to move the code that updates
> zwplug->wp_offset from blk_zone_plug_bio() into 
> blk_zone_write_plug_init_request(). With this change the 
> zwplug->wp_offset update won't have to be undone in any case since it
> happens after all the code in blk_mq_submit_bio() that can potentially
> fail.

We could for mq devices, but given that blk_zone_write_plug_init_request() is
not used for BIO devices (dm), we would still need to update the wp early. And
DM may have different failure patterns for REQ_NOWAIT BIOs, so it seems simpler
and more solid to punt the BIO to the BIO work and ignore the REQ_NOWAIT flag there.
Damien Le Moal Nov. 27, 2024, 11:18 p.m. UTC | #21
On 11/28/24 01:58, Christoph Hellwig wrote:
> On Wed, Nov 27, 2024 at 08:31:43PM +0900, Damien Le Moal wrote:
>>> We should not issue a report zones to a frozen queue, as that would
>>> bypass the freezing protection.  I suspect the right thing is to
>>> simply defer the error recovery action until after the queue is
>>> unfrozen.
>>
>> But that is the issue: if we defer the report zones, we cannot make progress
>> with BIOs still plugged in the zone write plug BIO list. These hold a queue
>> usage reference that the queue freeze wait is waiting for. We have to somehow
>> allow that report zones to execute to make progress and empty the zone write
>> plugs of all plugged BIOs.
> 
> Or just fail all the bios.
> 
>> Note that if we were talking about regular writes only, we would not need to
>> care about error recovery as we would simply need to abort all these plugged
>> BIOs (as we know they will fail anyway). But for a correct zone append
>> emulation, we need to recover the zone write pointer to resume the
>> execution of the plugged BIOs. Otherwise, the user would see failed zone
>> append commands that are not suppose to fail unless the drive (or the
>> zone) is dead...
> 
> Maybe I'm thick, but what error could we recover from here?

The BIO that failed is not recovered. The user will see the failure. The error
recovery report zones is all about avoiding more failures of plugged zone append
BIOs behind that failed BIO. These can succeed with the error recovery.

So sure, we can fail all BIOs. The user will see more failures. If that is OK,
that's easy to do. But in the end, that is not a solution because we still need
to get an updated zone write pointer to be able to restart zone append
emulation. Otherwise, we are in the dark and will not know where to send the
regular writes emulating zone append. That means that we still need to issue a
zone report and that is racing with queue freeze and reception of a new BIO. We
cannot have new BIOs "wait" for the zone report as that would create a hang
situation again if a queue freeze is started between reception of the new BIO
and the zone report. Do we fail these new BIOs too ? That seems extreme.
Bart Van Assche Nov. 27, 2024, 11:36 p.m. UTC | #22
On 11/27/24 3:18 PM, Damien Le Moal wrote:
> The BIO that failed is not recovered. The user will see the failure. The error
> recovery report zones is all about avoiding more failures of plugged zone append
> BIOs behind that failed BIO. These can succeed with the error recovery.
> 
> So sure, we can fail all BIOs. The user will see more failures. If that is OK,
> that's easy to do. But in the end, that is not a solution because we still need
> to get an updated zone write pointer to be able to restart zone append
> emulation. Otherwise, we are in the dark and will not know where to send the
> regular writes emulating zone append. That means that we still need to issue a
> zone report and that is racing with queue freeze and reception of a new BIO. We
> cannot have new BIOs "wait" for the zone report as that would create a hang
> situation again if a queue freeze is started between reception of the new BIO
> and the zone report. Do we fail these new BIOs too ? That seems extreme.

This patch removes the disk->fops->report_zones() call from the
blk-zoned.c code:
https://lore.kernel.org/linux-block/20241119002815.600608-6-bvanassche@acm.org/. 
Is it really not possible to get that
approach working for SAS SMR drives? If a torn write happens, is the
residual information in the response from the drive reliable?

Thanks,

Bart.
Damien Le Moal Nov. 27, 2024, 11:43 p.m. UTC | #23
On 11/28/24 08:36, Bart Van Assche wrote:
> 
> On 11/27/24 3:18 PM, Damien Le Moal wrote:
>> The BIO that failed is not recovered. The user will see the failure. The error
>> recovery report zones is all about avoiding more failures of plugged zone append
>> BIOs behind that failed BIO. These can succeed with the error recovery.
>>
>> So sure, we can fail all BIOs. The user will see more failures. If that is OK,
>> that's easy to do. But in the end, that is not a solution because we still need
>> to get an updated zone write pointer to be able to restart zone append
>> emulation. Otherwise, we are in the dark and will not know where to send the
>> regular writes emulating zone append. That means that we still need to issue a
>> zone report and that is racing with queue freeze and reception of a new BIO. We
>> cannot have new BIOs "wait" for the zone report as that would create a hang
>> situation again if a queue freeze is started between reception of the new BIO
>> and the zone report. Do we fail these new BIOs too ? That seems extreme.
> 
> This patch removes the disk->fops->report_zones() call from the
> blk-zoned.c code:
> https://lore.kernel.org/linux-block/20241119002815.600608-6-bvanassche@acm.org/. 
> Is it really not possible to get that
> approach working for SAS SMR drives? If a torn write happens, is the
> residual information in the response from the drive reliable?

Let me dig into this again and see if that can work.
Christoph Hellwig Nov. 28, 2024, 3:20 a.m. UTC | #24
On Thu, Nov 28, 2024 at 08:18:16AM +0900, Damien Le Moal wrote:
> The BIO that failed is not recovered. The user will see the failure. The error
> recovery report zones is all about avoiding more failures of plugged zone append
> BIOs behind that failed BIO. These can succeed with the error recovery.
> 
> So sure, we can fail all BIOs. The user will see more failures. If that is OK,
> that's easy to do. But in the end, that is not a solution because we still need

What is the scenario where only one I/O will fail?  The days of dust on
a sector failing writes to just that sector are long gone.

So unless we can come up with a scenario where:

 - one I/O will fail, but others won't
 - this matters to the writer

optimizing for being able to just fail a single I/O seems like a
wasted effort.

> to get an updated zone write pointer to be able to restart zone append
> emulation. Otherwise, we are in the dark and will not know where to send the
> regular writes emulating zone append. That means that we still need to issue a
> zone report and that is racing with queue freeze and reception of a new BIO. We
> cannot have new BIOs "wait" for the zone report as that would create a hang
> situation again if a queue freeze is started between reception of the new BIO
> and the zone report. Do we fail these new BIOs too ? That seems extreme.

Just add a "need resync" flag and do the report zones before issuing the
next write?
Shinichiro Kawasaki Nov. 28, 2024, 4:35 a.m. UTC | #25
Bart, thanks for the patch. Please find my comments in line.

On Nov 25, 2024 / 13:10, Bart Van Assche wrote:
> Test concurrent requeuing of zoned writes and request queue freezing. While
> test test passes with kernel 6.9, it triggers a hang with kernels 6.10 and

I guess you mean, s/test test passes/the test passes/

> later. This shows that this hang is a regression introduced by the zone
> write plugging code.
> 
> sysrq: Show Blocked State
> task:(udev-worker)   state:D stack:0     pid:75392 tgid:75392 ppid:2178   flags:0x00000006
> Call Trace:
>  <TASK>
>  __schedule+0x3e8/0x1410
>  schedule+0x27/0xf0
>  blk_mq_freeze_queue_wait+0x6f/0xa0
>  queue_attr_store+0x60/0xc0
>  kernfs_fop_write_iter+0x13e/0x1f0
>  vfs_write+0x25b/0x420
>  ksys_write+0x65/0xe0
>  do_syscall_64+0x82/0x160
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> 
> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> ---
>  tests/zbd/012     | 70 +++++++++++++++++++++++++++++++++++++++++++++++
>  tests/zbd/012.out |  2 ++
>  2 files changed, 72 insertions(+)
>  create mode 100644 tests/zbd/012
>  create mode 100644 tests/zbd/012.out
> 
> diff --git a/tests/zbd/012 b/tests/zbd/012
> new file mode 100644

For consistency, the test script should be executable. I expect the file
mode 100755 here.

> index 000000000000..0551d01011af
> --- /dev/null
> +++ b/tests/zbd/012
> @@ -0,0 +1,70 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0

The other blktests files have GPL-2.0+ or GPL-3.0+. Unless you have a specific
reason, I suggest one of the two instead of GPL-2.0.

> +# Copyright (C) 2024 Google LLC

I would like to have a short description here in the same manner as the commit message.
How about this?

# Test concurrent requeuing of zoned writes and request queue freezing. It
# triggers a hang caused by the regression introduced by the zone write
# plugging.

> +
> +. tests/scsi/rc

s/scsi/zbd/

> +. common/scsi_debug
> +
> +DESCRIPTION="test requeuing of zoned writes and queue freezing"
> +QUICK=1

The test case uses the TIMEOUT value to control its runtime. So the QUICK=1
should not be set, and TIMED=1 should be set.

> +
> +requires() {
> +	_have_fio_zbd_zonemode
> +}
> +
> +toggle_iosched() {

Nit: I suggest to add "local iosched" here.

> +	while true; do
> +		for iosched in none mq-deadline; do
> +			echo "${iosched}" > "/sys/class/block/$(basename "$zdev")/queue/scheduler"
> +			sleep .1
> +		done
> +	done
> +}
> +
> +test() {
> +	echo "Running ${TEST_NAME}"
> +
> +	local qd=1
> +	local scsi_debug_params=(
> +		delay=0
> +		dev_size_mb=1024
> +		every_nth=$((2 * qd))
> +		max_queue="${qd}"
> +		opts=0x8000          # SDEBUG_OPT_HOST_BUSY
> +		sector_size=4096
> +		statistics=1
> +		zbc=host-managed
> +		zone_nr_conv=0
> +		zone_size_mb=4
> +	)
> +	_init_scsi_debug "${scsi_debug_params[@]}" &&
> +	local zdev="/dev/${SCSI_DEBUG_DEVICES[0]}" fail &&
> +	ls -ld "${zdev}" >>"${FULL}" &&
> +	{ toggle_iosched & } &&
> +	toggle_iosched_pid=$! &&
> +	local fio_args=(
> +		--direct=1
> +		--filename="${zdev}"
> +		--iodepth="${qd}"
> +		--ioengine=io_uring
> +		--ioscheduler=none
> +		--name=pipeline-zoned-writes
> +		--output="${RESULTS_DIR}/fio-output-zbd-092.txt"

I think the separate fio output file is rather difficult to find. Also,
_run_fio sets --output-format=terse, which is rather difficult to understand.
I suggest not setting the --output option here and using "fio" instead of
"_run_fio". This way, the default fio output will be recorded in $FULL.

> +		--runtime="${TIMEOUT:-30}"
> +		--rw=randwrite
> +		--time_based
> +		--zonemode=zbd
> +	) &&
> +	_run_fio "${fio_args[@]}" >>"${FULL}" 2>&1 ||
> +	fail=true
> +
> +	kill "${toggle_iosched_pid}" 2>&1
> +	_exit_scsi_debug
> +
> +	if [ -z "$fail" ]; then
> +		echo "Test complete"
> +	else
> +		echo "Test failed"
> +		return 1
> +	fi
> +}
> diff --git a/tests/zbd/012.out b/tests/zbd/012.out
> new file mode 100644
> index 000000000000..8ff654950c5f
> --- /dev/null
> +++ b/tests/zbd/012.out
> @@ -0,0 +1,2 @@
> +Running zbd/012
> +Test complete
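
Taken together, the review points above could translate into something like the
following fragments; this is only an illustration of the suggested changes, not
an actual v2 of the test:

#!/bin/bash
# SPDX-License-Identifier: GPL-3.0+
# Copyright (C) 2024 Google LLC
#
# Test concurrent requeuing of zoned writes and request queue freezing. It
# triggers a hang caused by the regression introduced by the zone write
# plugging.

. tests/zbd/rc
. common/scsi_debug

DESCRIPTION="test requeuing of zoned writes and queue freezing"
TIMED=1

toggle_iosched() {
	local iosched

	while true; do
		for iosched in none mq-deadline; do
			echo "${iosched}" > "/sys/class/block/$(basename "$zdev")/queue/scheduler"
			sleep .1
		done
	done
}

# ... and in test(), drop the --output fio option and call fio directly so that
# the default fio output is recorded in $FULL:
#	fio "${fio_args[@]}" >>"${FULL}" 2>&1 ||
#	fail=true

(The file would also be created with mode 100755.)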
Damien Le Moal Nov. 28, 2024, 4:37 a.m. UTC | #26
On 11/28/24 12:20, Christoph Hellwig wrote:
> On Thu, Nov 28, 2024 at 08:18:16AM +0900, Damien Le Moal wrote:
>> The BIO that failed is not recovered. The user will see the failure. The error
>> recovery report zones is all about avoiding more failures of plugged zone append
>> BIOs behind that failed BIO. These can succeed with the error recovery.
>>
>> So sure, we can fail all BIOs. The user will see more failures. If that is OK,
>> that's easy to do. But in the end, that is not a solution because we still need
> 
> What is the scenario where only one I/O will fail?  The days of dust on
> a sector failing writes to just that sector are long gone.
> 
> So unless we can come up with a scenario where:
> 
>  - one I/O will fail, but others won't
>  - this matters to the writer
> 
> optimizing for being able to just fail a single I/O seems like a
> wasted effort.
> 
>> to get an updated zone write pointer to be able to restart zone append
>> emulation. Otherwise, we are in the dark and will not know where to send the
>> regular writes emulating zone append. That means that we still need to issue a
>> zone report and that is racing with queue freeze and reception of a new BIO. We
>> cannot have new BIOs "wait" for the zone report as that would create a hang
>> situation again if a queue freeze is started between reception of the new BIO
>> and the zone report. Do we fail these new BIOs too? That seems extreme.
> 
> Just add a "need resync" flag and do the report zones before issuing the
> next write?

The problem here would be that "before issuing the next write" needs to be
really before we do a blk_queue_enter() for that write, so that would need to
happen on entry to blk_mq_submit_bio() or earlier in the stack. Otherwise, we
end up with the completion of the write bio depending on the report zones
again, and the potential hang is back.

But I have something now that completely removes report zones. Same idea as you
suggested: a BLK_ZONE_WPLUG_NEED_WP_UPDATE flag that is set on error and an
automatic update of the zone write plug to the start sector of a bio when we
start seeing writes again for that zone. The idea is that well-behaved users
will do a report zone after a failed write and restart writing at the correct
position.
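
For illustration, the userspace side of that contract is just "report zones,
then resume at the write pointer", e.g. with util-linux blkzone (a rough
sketch; the device path and zone offset below are made up):

	# A write into the zone starting at sector 524288 of /dev/sdX failed.
	# Re-read that zone's state to learn the current write pointer.
	blkzone report --offset 524288 --count 1 /dev/sdX
	# ...then restart writing at the reported write pointer rather than at
	# the old failed offset; the first new write also resyncs the zone
	# write plug as described above.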

And for good measure, I modified report zones to also automatically update the
wp of zones that have BLK_ZONE_WPLUG_NEED_WP_UPDATE. So the user doing a report
zones clears everything up.

Overall, that removes *a lot* of code and makes things a lot simpler. Starting
test runs with that now.
Christoph Hellwig Nov. 28, 2024, 4:39 a.m. UTC | #27
On Thu, Nov 28, 2024 at 01:37:46PM +0900, Damien Le Moal wrote:
> suggested: a BLK_ZONE_WPLUG_NEED_WP_UPDATE flag that is set on error and an
> automatic update of the zone write plug to the start sector of a bio when we
> start seeing writes again for that zone. The idea is that well-behaved users
> will do a report zone after a failed write and restart writing at the correct
> position.
> 
> And for good measure, I modified report zones to also automatically update the
> wp of zones that have BLK_ZONE_WPLUG_NEED_WP_UPDATE. So the user doing a report
> zones clears everything up.
> 
> Overall, that removes *a lot* of code and makes things a lot simpler. Starting
> test runs with that now.

How does that work for zone append emulation?
Damien Le Moal Nov. 28, 2024, 4:52 a.m. UTC | #28
On 11/28/24 13:39, Christoph Hellwig wrote:
> On Thu, Nov 28, 2024 at 01:37:46PM +0900, Damien Le Moal wrote:
>> suggested: a BLK_ZONE_WPLUG_NEED_WP_UPDATE flag that is set on error and an
>> automatic update of the zone write plug to the start sector of a bio when we
>> start seeing writes again for that zone. The idea is that well-behaved users
>> will do a report zone after a failed write and restart writing at the correct
>> position.
>>
>> And for good measure, I modified report zones to also automatically update the
>> wp of zones that have BLK_ZONE_WPLUG_NEED_WP_UPDATE. So the user doing a report
>> zones clears everything up.
>>
>> Overall, that removes *a lot* of code and makes things a lot simpler. Starting
>> test runs with that now.
> 
> How does that work for zone append emulation?

Same: the user must do a report zones to update the zone wp.
Otherwise, it is hard to guarantee that we can get a valid wp offset value after
an error. I looked into trying to get a valid value even in the case of a
partial request failure, but that is easily doable.

I could also keep the error work and trigger it to do a report zones to refresh
the zone wp on entry to submit_bio(), instead of relying on the user doing a
report zones.
Christoph Hellwig Nov. 28, 2024, 5 a.m. UTC | #29
On Thu, Nov 28, 2024 at 01:52:55PM +0900, Damien Le Moal wrote:
> Same: the user must do a report zones to update the zone wp.
> Otherwise, it is hard to guarantee that we can get a valid wp offset value after
> an error. I looked into trying to get a valid value even in the case of a
> partial request failure, but that is easily doable.

Well, we'd better document that.  Then again, what kinds of error types
do we have where a write fails, but another write to the same zone later
on will work again?
Damien Le Moal Nov. 28, 2024, 5:07 a.m. UTC | #30
On 11/28/24 14:00, Christoph Hellwig wrote:
> On Thu, Nov 28, 2024 at 01:52:55PM +0900, Damien Le Moal wrote:
>> Same: the user must do a report zones to update the zone wp.
>> Otherwise, it is hard to guarantee that we can get a valid wp offset value after
>> an error. I looked into trying to get a valid value even in the case of a
>> partial request failure, but that is easily doable.

s/that is easily doable/that is NOT easily doable

> Well, we'd better document that.  Then again, what kinds of error types
> do we have where a write fails, but another write to the same zone later
> on will work again?

A bad sector that gets remapped when overwritten is probably the most common,
and maybe the only one. I need to check again, but I think that for this case,
the scsi stack retries the remainder of a torn write, so we probably do not even
see it in practice, unless the sector/zone is really dead and cannot be
recovered. But in that case, no matter what we do, that zone would not be
writable anymore.

Still trying to see if I can have some sort of synchronization between incoming
writes and zone wp update to avoid relying on the user doing a report zones.
That would ensure that emulated zone append always works like the real command.
Christoph Hellwig Nov. 28, 2024, 5:16 a.m. UTC | #31
On Thu, Nov 28, 2024 at 02:07:58PM +0900, Damien Le Moal wrote:
> A bad sector that gets remapped when overwritten is probably the most common,
> and maybe the only one. I need to check again, but I think that for this case,
> the scsi stack retries the remainder of a torn write, so we probably do not even
> see it in practice, unless the sector/zone is really dead and cannot be
> recovered. But in that case, no matter what we do, that zone would not be
> writable anymore.

Yes, all retryable errors should be handled by the drivers.  NVMe makes
this very clear with the DNR bit, while SCSI deals with this on a more
ad-hoc basis by looking at the sense codes.  So by the time a write error
bubbles up to the file systems I do not expect the device to ever
recover from it.  Maybe with some kind of dynamic depop in the future
where we drop just that zone, but otherwise we're very much done.

> Still trying to see if I can have some sort of synchronization between incoming
> writes and zone wp update to avoid relying on the user doing a report zones.
> That would ensure that emulated zone append always works like the real command.

I think we're much better off leaving that to the submitter, because
it better have a really good reason to resubmit a write to the zone.
We'll just need to properly document the assumptions.
Damien Le Moal Nov. 28, 2024, 5:19 a.m. UTC | #32
On 11/28/24 14:16, Christoph Hellwig wrote:
> On Thu, Nov 28, 2024 at 02:07:58PM +0900, Damien Le Moal wrote:
>> A bad sector that gets remapped when overwritten is probably the most common,
>> and maybe the only one. I need to check again, but I think that for this case,
>> the scsi stack retries the remainder of a torn write, so we probably do not even
>> see it in practice, unless the sector/zone is really dead and cannot be
>> recovered. But in that case, no matter what we do, that zone would not be
>> writable anymore.
> 
> Yes, all retryable errors should be handled by the drivers.  NVMe makes
> this very clear with the DNR bit, while SCSI deals with this on a more
> ad-hoc basis by looking at the sense codes.  So by the time a write error
> bubbles up to the file systems I do not expect the device to ever
> recover from it.  Maybe with some kind of dynamic depop in the future
> where we drop just that zone, but otherwise we're very much done.
> 
>> Still trying to see if I can have some sort of synchronization between incoming
>> writes and zone wp update to avoid relying on the user doing a report zones.
>> That would ensure that emulated zone append always works like the real command.
> 
> I think we're much better off leaving that to the submitter, because
> it better have a really good reason to resubmit a write to the zone.
> We'll just need to properly document the assumptions.

Sounds good. What do you think of adding the opportunistic "update zone wp"
whenever we execute a user report zones? It is very easy to do and should not
significantly slow down report zones itself, because we usually have very few
zone write plugs and the hash search is fast.
Christoph Hellwig Nov. 28, 2024, 5:21 a.m. UTC | #33
On Thu, Nov 28, 2024 at 02:19:23PM +0900, Damien Le Moal wrote:
> Sounds good. What do you think of adding the opportunistic "update zone wp"
> whenever we execute a user report zones ? It is very easy to do and should not
> slow down significantly report zones itself because we usually have very zone
> write plugs and the hash search is fast.

Just user report zones, or any report zones?  But I think updating it is
probably a good thing.  When the zone state is requested, there is no excuse
not to have the write plug information in sync.
Damien Le Moal Nov. 28, 2024, 5:27 a.m. UTC | #34
On 11/28/24 14:21, Christoph Hellwig wrote:
> On Thu, Nov 28, 2024 at 02:19:23PM +0900, Damien Le Moal wrote:
>> Sounds good. What do you think of adding the opportunistic "update zone wp"
>> whenever we execute a user report zones ? It is very easy to do and should not
>> slow down significantly report zones itself because we usually have very zone
>> write plugs and the hash search is fast.
> 
> Just user or any report zones?  But I think updating it is probably a
> good thing.  When the zone state is requested there is no excuse to
> not have the write plug information in sync

All report zones. That would cover things like an FS mkfs report zones putting
everything in sync, and an FS or dm-zoned doing a report zones after a write
error syncing individual zones.

To do that, all that is needed is a default report zones callback that syncs
write plugs that have the "need update" flag and then calls the user-specified
report zones callback.

Patch

diff --git a/tests/zbd/012 b/tests/zbd/012
new file mode 100644
index 000000000000..0551d01011af
--- /dev/null
+++ b/tests/zbd/012
@@ -0,0 +1,70 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2024 Google LLC
+
+. tests/scsi/rc
+. common/scsi_debug
+
+DESCRIPTION="test requeuing of zoned writes and queue freezing"
+QUICK=1
+
+requires() {
+	_have_fio_zbd_zonemode
+}
+
+toggle_iosched() {
+	while true; do
+		for iosched in none mq-deadline; do
+			echo "${iosched}" > "/sys/class/block/$(basename "$zdev")/queue/scheduler"
+			sleep .1
+		done
+	done
+}
+
+test() {
+	echo "Running ${TEST_NAME}"
+
+	local qd=1
+	local scsi_debug_params=(
+		delay=0
+		dev_size_mb=1024
+		every_nth=$((2 * qd))
+		max_queue="${qd}"
+		opts=0x8000          # SDEBUG_OPT_HOST_BUSY
+		sector_size=4096
+		statistics=1
+		zbc=host-managed
+		zone_nr_conv=0
+		zone_size_mb=4
+	)
+	_init_scsi_debug "${scsi_debug_params[@]}" &&
+	local zdev="/dev/${SCSI_DEBUG_DEVICES[0]}" fail &&
+	ls -ld "${zdev}" >>"${FULL}" &&
+	{ toggle_iosched & } &&
+	toggle_iosched_pid=$! &&
+	local fio_args=(
+		--direct=1
+		--filename="${zdev}"
+		--iodepth="${qd}"
+		--ioengine=io_uring
+		--ioscheduler=none
+		--name=pipeline-zoned-writes
+		--output="${RESULTS_DIR}/fio-output-zbd-092.txt"
+		--runtime="${TIMEOUT:-30}"
+		--rw=randwrite
+		--time_based
+		--zonemode=zbd
+	) &&
+	_run_fio "${fio_args[@]}" >>"${FULL}" 2>&1 ||
+	fail=true
+
+	kill "${toggle_iosched_pid}" 2>&1
+	_exit_scsi_debug
+
+	if [ -z "$fail" ]; then
+		echo "Test complete"
+	else
+		echo "Test failed"
+		return 1
+	fi
+}
diff --git a/tests/zbd/012.out b/tests/zbd/012.out
new file mode 100644
index 000000000000..8ff654950c5f
--- /dev/null
+++ b/tests/zbd/012.out
@@ -0,0 +1,2 @@ 
+Running zbd/012
+Test complete
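
Once applied, the new test can be run on its own from the top of a blktests
checkout, for example:

	./check zbd/012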