diff mbox series

btrfs: fix deadlock between quota disable and qgroup rescan worker

Message ID 20220113104029.902200-1-shinichiro.kawasaki@wdc.com (mailing list archive)
State New, archived
Headers show
Series btrfs: fix deadlock between quota disable and qgroup rescan worker | expand

Commit Message

Shinichiro Kawasaki Jan. 13, 2022, 10:40 a.m. UTC
Quota disable ioctl starts a transaction before waiting for the qgroup
rescan worker completes. However, this wait can be infinite and results
in deadlock because of circular dependency among the quota disable
ioctl, the qgroup rescan worker and the other task with transaction such
as block group relocation task.

The deadlock happens with the steps following:

1) Task A calls ioctl to disable quota. It starts a transaction and
   waits for qgroup rescan worker completes.
2) Task B such as block group relocation task starts a transaction and
   joins to the transaction that task A started. Then task B commits to
   the transaction. In this commit, task B waits for a commit by task A.
3) Task C as the qgroup rescan worker starts its job and starts a
   transaction. In this transaction start, task C waits for completion
   of the transaction that task A started and task B committed.

This deadlock was found with fstests test case block/115 and a zoned
null_blk device. The test case enables and disables quota, and the
block group reclaim was triggered during the quota disable by chance.
The deadlock was also observed by running quota enable and disable in
parallel with 'btrfs balance' command on regular null_blk devices.

An example report of the deadlock:

[  372.469894] INFO: task kworker/u16:6:103 blocked for more than 122 seconds.
[  372.479944]       Not tainted 5.16.0-rc8 #7
[  372.485067] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  372.493898] task:kworker/u16:6   state:D stack:    0 pid:  103 ppid:     2 flags:0x00004000
[  372.503285] Workqueue: btrfs-qgroup-rescan btrfs_work_helper [btrfs]
[  372.510782] Call Trace:
[  372.514092]  <TASK>
[  372.521684]  __schedule+0xb56/0x4850
[  372.530104]  ? io_schedule_timeout+0x190/0x190
[  372.538842]  ? lockdep_hardirqs_on+0x7e/0x100
[  372.547092]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
[  372.555591]  schedule+0xe0/0x270
[  372.561894]  btrfs_commit_transaction+0x18bb/0x2610 [btrfs]
[  372.570506]  ? btrfs_apply_pending_changes+0x50/0x50 [btrfs]
[  372.578875]  ? free_unref_page+0x3f2/0x650
[  372.585484]  ? finish_wait+0x270/0x270
[  372.591594]  ? release_extent_buffer+0x224/0x420 [btrfs]
[  372.599264]  btrfs_qgroup_rescan_worker+0xc13/0x10c0 [btrfs]
[  372.607157]  ? lock_release+0x3a9/0x6d0
[  372.613054]  ? btrfs_qgroup_account_extent+0xda0/0xda0 [btrfs]
[  372.620960]  ? do_raw_spin_lock+0x11e/0x250
[  372.627137]  ? rwlock_bug.part.0+0x90/0x90
[  372.633215]  ? lock_is_held_type+0xe4/0x140
[  372.639404]  btrfs_work_helper+0x1ae/0xa90 [btrfs]
[  372.646268]  process_one_work+0x7e9/0x1320
[  372.652321]  ? lock_release+0x6d0/0x6d0
[  372.658081]  ? pwq_dec_nr_in_flight+0x230/0x230
[  372.664513]  ? rwlock_bug.part.0+0x90/0x90
[  372.670529]  worker_thread+0x59e/0xf90
[  372.676172]  ? process_one_work+0x1320/0x1320
[  372.682440]  kthread+0x3b9/0x490
[  372.687550]  ? _raw_spin_unlock_irq+0x24/0x50
[  372.693811]  ? set_kthread_struct+0x100/0x100
[  372.700052]  ret_from_fork+0x22/0x30
[  372.705517]  </TASK>
[  372.709747] INFO: task btrfs-transacti:2347 blocked for more than 123 seconds.
[  372.729827]       Not tainted 5.16.0-rc8 #7
[  372.745907] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  372.767106] task:btrfs-transacti state:D stack:    0 pid: 2347 ppid:     2 flags:0x00004000
[  372.787776] Call Trace:
[  372.801652]  <TASK>
[  372.812961]  __schedule+0xb56/0x4850
[  372.830011]  ? io_schedule_timeout+0x190/0x190
[  372.852547]  ? lockdep_hardirqs_on+0x7e/0x100
[  372.871761]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
[  372.886792]  schedule+0xe0/0x270
[  372.901685]  wait_current_trans+0x22c/0x310 [btrfs]
[  372.919743]  ? btrfs_put_transaction+0x3d0/0x3d0 [btrfs]
[  372.938923]  ? finish_wait+0x270/0x270
[  372.959085]  ? join_transaction+0xc75/0xe30 [btrfs]
[  372.977706]  start_transaction+0x938/0x10a0 [btrfs]
[  372.997168]  transaction_kthread+0x19d/0x3c0 [btrfs]
[  373.013021]  ? btrfs_cleanup_transaction.isra.0+0xfc0/0xfc0 [btrfs]
[  373.031678]  kthread+0x3b9/0x490
[  373.047420]  ? _raw_spin_unlock_irq+0x24/0x50
[  373.064645]  ? set_kthread_struct+0x100/0x100
[  373.078571]  ret_from_fork+0x22/0x30
[  373.091197]  </TASK>
[  373.105611] INFO: task btrfs:3145 blocked for more than 123 seconds.
[  373.114147]       Not tainted 5.16.0-rc8 #7
[  373.120401] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  373.130393] task:btrfs           state:D stack:    0 pid: 3145 ppid:  3141 flags:0x00004000
[  373.140998] Call Trace:
[  373.145501]  <TASK>
[  373.149654]  __schedule+0xb56/0x4850
[  373.155306]  ? io_schedule_timeout+0x190/0x190
[  373.161965]  ? lockdep_hardirqs_on+0x7e/0x100
[  373.168469]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
[  373.175468]  schedule+0xe0/0x270
[  373.180814]  wait_for_commit+0x104/0x150 [btrfs]
[  373.187643]  ? test_and_set_bit+0x20/0x20 [btrfs]
[  373.194772]  ? kmem_cache_free+0x124/0x550
[  373.201191]  ? btrfs_put_transaction+0x69/0x3d0 [btrfs]
[  373.208738]  ? finish_wait+0x270/0x270
[  373.214704]  ? __btrfs_end_transaction+0x347/0x7b0 [btrfs]
[  373.222342]  btrfs_commit_transaction+0x44d/0x2610 [btrfs]
[  373.230233]  ? join_transaction+0x255/0xe30 [btrfs]
[  373.237334]  ? btrfs_record_root_in_trans+0x4d/0x170 [btrfs]
[  373.245251]  ? btrfs_apply_pending_changes+0x50/0x50 [btrfs]
[  373.253296]  relocate_block_group+0x105/0xc20 [btrfs]
[  373.260533]  ? mutex_lock_io_nested+0x1270/0x1270
[  373.267516]  ? btrfs_wait_nocow_writers+0x85/0x180 [btrfs]
[  373.275155]  ? merge_reloc_roots+0x710/0x710 [btrfs]
[  373.283602]  ? btrfs_wait_ordered_extents+0xd30/0xd30 [btrfs]
[  373.291934]  ? kmem_cache_free+0x124/0x550
[  373.298180]  btrfs_relocate_block_group+0x35c/0x930 [btrfs]
[  373.306047]  btrfs_relocate_chunk+0x85/0x210 [btrfs]
[  373.313229]  btrfs_balance+0x12f4/0x2d20 [btrfs]
[  373.320227]  ? lock_release+0x3a9/0x6d0
[  373.326206]  ? btrfs_relocate_chunk+0x210/0x210 [btrfs]
[  373.333591]  ? lock_is_held_type+0xe4/0x140
[  373.340031]  ? rcu_read_lock_sched_held+0x3f/0x70
[  373.346910]  btrfs_ioctl_balance+0x548/0x700 [btrfs]
[  373.354207]  btrfs_ioctl+0x7f2/0x71b0 [btrfs]
[  373.360774]  ? lockdep_hardirqs_on_prepare+0x410/0x410
[  373.367957]  ? lockdep_hardirqs_on_prepare+0x410/0x410
[  373.375327]  ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs]
[  373.383841]  ? find_held_lock+0x2c/0x110
[  373.389993]  ? lock_release+0x3a9/0x6d0
[  373.395828]  ? mntput_no_expire+0xf7/0xad0
[  373.402083]  ? lock_is_held_type+0xe4/0x140
[  373.408249]  ? vfs_fileattr_set+0x9f0/0x9f0
[  373.414486]  ? selinux_file_ioctl+0x349/0x4e0
[  373.420938]  ? trace_raw_output_lock+0xb4/0xe0
[  373.427442]  ? selinux_inode_getsecctx+0x80/0x80
[  373.434224]  ? lockdep_hardirqs_on+0x7e/0x100
[  373.440660]  ? force_qs_rnp+0x2a0/0x6b0
[  373.446534]  ? lock_is_held_type+0x9b/0x140
[  373.452763]  ? __blkcg_punt_bio_submit+0x1b0/0x1b0
[  373.459732]  ? security_file_ioctl+0x50/0x90
[  373.466089]  __x64_sys_ioctl+0x127/0x190
[  373.472022]  do_syscall_64+0x3b/0x90
[  373.477513]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  373.484823] RIP: 0033:0x7f8f4af7e2bb
[  373.490493] RSP: 002b:00007ffcbf936178 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[  373.500197] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f8f4af7e2bb
[  373.509451] RDX: 00007ffcbf936220 RSI: 00000000c4009420 RDI: 0000000000000003
[  373.518659] RBP: 00007ffcbf93774a R08: 0000000000000013 R09: 00007f8f4b02d4e0
[  373.527872] R10: 00007f8f4ae87740 R11: 0000000000000246 R12: 0000000000000001
[  373.537222] R13: 00007ffcbf936220 R14: 0000000000000000 R15: 0000000000000002
[  373.546506]  </TASK>
[  373.550878] INFO: task btrfs:3146 blocked for more than 123 seconds.
[  373.559383]       Not tainted 5.16.0-rc8 #7
[  373.565748] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  373.575748] task:btrfs           state:D stack:    0 pid: 3146 ppid:  2168 flags:0x00000000
[  373.586314] Call Trace:
[  373.590846]  <TASK>
[  373.595121]  __schedule+0xb56/0x4850
[  373.600901]  ? __lock_acquire+0x23db/0x5030
[  373.607176]  ? io_schedule_timeout+0x190/0x190
[  373.613954]  schedule+0xe0/0x270
[  373.619157]  schedule_timeout+0x168/0x220
[  373.625170]  ? usleep_range_state+0x150/0x150
[  373.631653]  ? mark_held_locks+0x9e/0xe0
[  373.637767]  ? do_raw_spin_lock+0x11e/0x250
[  373.643993]  ? lockdep_hardirqs_on_prepare+0x17b/0x410
[  373.651267]  ? _raw_spin_unlock_irq+0x24/0x50
[  373.657677]  ? lockdep_hardirqs_on+0x7e/0x100
[  373.664103]  wait_for_completion+0x163/0x250
[  373.670437]  ? bit_wait_timeout+0x160/0x160
[  373.676585]  btrfs_quota_disable+0x176/0x9a0 [btrfs]
[  373.683979]  ? btrfs_quota_enable+0x12f0/0x12f0 [btrfs]
[  373.691340]  ? down_write+0xd0/0x130
[  373.696880]  ? down_write_killable+0x150/0x150
[  373.703352]  btrfs_ioctl+0x3945/0x71b0 [btrfs]
[  373.710061]  ? find_held_lock+0x2c/0x110
[  373.716192]  ? lock_release+0x3a9/0x6d0
[  373.722047]  ? __handle_mm_fault+0x23cd/0x3050
[  373.728486]  ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs]
[  373.737032]  ? set_pte+0x6a/0x90
[  373.742271]  ? do_raw_spin_unlock+0x55/0x1f0
[  373.748506]  ? lock_is_held_type+0xe4/0x140
[  373.754792]  ? vfs_fileattr_set+0x9f0/0x9f0
[  373.761083]  ? selinux_file_ioctl+0x349/0x4e0
[  373.767521]  ? selinux_inode_getsecctx+0x80/0x80
[  373.774247]  ? __up_read+0x182/0x6e0
[  373.780026]  ? count_memcg_events.constprop.0+0x46/0x60
[  373.787281]  ? up_write+0x460/0x460
[  373.792932]  ? security_file_ioctl+0x50/0x90
[  373.799232]  __x64_sys_ioctl+0x127/0x190
[  373.805237]  do_syscall_64+0x3b/0x90
[  373.810947]  entry_SYSCALL_64_after_hwframe+0x44/0xae
[  373.818102] RIP: 0033:0x7f1383ea02bb
[  373.823847] RSP: 002b:00007fffeb4d71f8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[  373.833641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1383ea02bb
[  373.842961] RDX: 00007fffeb4d7210 RSI: 00000000c0109428 RDI: 0000000000000003
[  373.852179] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078
[  373.861408] R10: 00007f1383daec78 R11: 0000000000000202 R12: 00007fffeb4d874a
[  373.870647] R13: 0000000000493099 R14: 0000000000000001 R15: 0000000000000000
[  373.879838]  </TASK>
[  373.884018]
               Showing all locks held in the system:
[  373.894250] 3 locks held by kworker/4:1/58:
[  373.900356] 1 lock held by khungtaskd/63:
[  373.906333]  #0: ffffffff8945ff60 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260
[  373.917307] 3 locks held by kworker/u16:6/103:
[  373.923938]  #0: ffff888127b4f138 ((wq_completion)btrfs-qgroup-rescan){+.+.}-{0:0}, at: process_one_work+0x712/0x1320
[  373.936555]  #1: ffff88810b817dd8 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x73f/0x1320
[  373.951109]  #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_qgroup_rescan_worker+0x1f6/0x10c0 [btrfs]
[  373.964027] 2 locks held by less/1803:
[  373.969982]  #0: ffff88813ed56098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x24/0x80
[  373.981295]  #1: ffffc90000b3b2e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x9e2/0x1060
[  373.992969] 1 lock held by btrfs-transacti/2347:
[  373.999893]  #0: ffff88813d4887a8 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0xe3/0x3c0 [btrfs]
[  374.015872] 3 locks held by btrfs/3145:
[  374.022298]  #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl_balance+0xc3/0x700 [btrfs]
[  374.034456]  #1: ffff88813d48a0a0 (&fs_info->reclaim_bgs_lock){+.+.}-{3:3}, at: btrfs_balance+0xfe5/0x2d20 [btrfs]
[  374.047646]  #2: ffff88813d488838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x354/0x930 [btrfs]
[  374.063295] 4 locks held by btrfs/3146:
[  374.069647]  #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl+0x38b1/0x71b0 [btrfs]
[  374.081601]  #1: ffff88813d488bb8 (&fs_info->subvol_sem){+.+.}-{3:3}, at: btrfs_ioctl+0x38fd/0x71b0 [btrfs]
[  374.094283]  #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_disable+0xc8/0x9a0 [btrfs]
[  374.106885]  #3: ffff88813d489800 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_disable+0xd5/0x9a0 [btrfs]

[  374.126780] =============================================

To avoid the deadlock, wait for the qgroup rescan worker to complete
before starting the transaction for the quota disable ioctl. With
current implementation, BTRFS_FS_QUOTA_ENABLE flag is cleared to request
the qgroup rescan worker to complete. However, this flag can not be
cleared before the transaction for quota disable. Then add and use a new
flag BTRFS_FS_STATE_QUOTA_DISABLING instead. To avoid another new qgroup
rescan worker to start after the previous qgroup worker completed, check
the BTRFS_FS_STATE_QUOTA_DISABLING flag in qgroup_rescan_init.

Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
---
 fs/btrfs/ctree.h  |  2 ++
 fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

Comments

David Sterba Jan. 13, 2022, 11:41 a.m. UTC | #1
On Thu, Jan 13, 2022 at 07:40:29PM +0900, Shin'ichiro Kawasaki wrote:
> Quota disable ioctl starts a transaction before waiting for the qgroup
> rescan worker completes. However, this wait can be infinite and results
> in deadlock because of circular dependency among the quota disable
> ioctl, the qgroup rescan worker and the other task with transaction such
> as block group relocation task.
> 
> The deadlock happens with the steps following:
> 
> 1) Task A calls ioctl to disable quota. It starts a transaction and
>    waits for qgroup rescan worker completes.
> 2) Task B such as block group relocation task starts a transaction and
>    joins to the transaction that task A started. Then task B commits to
>    the transaction. In this commit, task B waits for a commit by task A.
> 3) Task C as the qgroup rescan worker starts its job and starts a
>    transaction. In this transaction start, task C waits for completion
>    of the transaction that task A started and task B committed.
> 
> This deadlock was found with fstests test case block/115 and a zoned
> null_blk device. The test case enables and disables quota, and the
> block group reclaim was triggered during the quota disable by chance.
> The deadlock was also observed by running quota enable and disable in
> parallel with 'btrfs balance' command on regular null_blk devices.
> 
> An example report of the deadlock:
> 
> [  372.469894] INFO: task kworker/u16:6:103 blocked for more than 122 seconds.
> [  372.479944]       Not tainted 5.16.0-rc8 #7
> [  372.485067] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  372.493898] task:kworker/u16:6   state:D stack:    0 pid:  103 ppid:     2 flags:0x00004000
> [  372.503285] Workqueue: btrfs-qgroup-rescan btrfs_work_helper [btrfs]
> [  372.510782] Call Trace:
> [  372.514092]  <TASK>
> [  372.521684]  __schedule+0xb56/0x4850
> [  372.530104]  ? io_schedule_timeout+0x190/0x190
> [  372.538842]  ? lockdep_hardirqs_on+0x7e/0x100
> [  372.547092]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
> [  372.555591]  schedule+0xe0/0x270
> [  372.561894]  btrfs_commit_transaction+0x18bb/0x2610 [btrfs]
> [  372.570506]  ? btrfs_apply_pending_changes+0x50/0x50 [btrfs]
> [  372.578875]  ? free_unref_page+0x3f2/0x650
> [  372.585484]  ? finish_wait+0x270/0x270
> [  372.591594]  ? release_extent_buffer+0x224/0x420 [btrfs]
> [  372.599264]  btrfs_qgroup_rescan_worker+0xc13/0x10c0 [btrfs]
> [  372.607157]  ? lock_release+0x3a9/0x6d0
> [  372.613054]  ? btrfs_qgroup_account_extent+0xda0/0xda0 [btrfs]
> [  372.620960]  ? do_raw_spin_lock+0x11e/0x250
> [  372.627137]  ? rwlock_bug.part.0+0x90/0x90
> [  372.633215]  ? lock_is_held_type+0xe4/0x140
> [  372.639404]  btrfs_work_helper+0x1ae/0xa90 [btrfs]
> [  372.646268]  process_one_work+0x7e9/0x1320
> [  372.652321]  ? lock_release+0x6d0/0x6d0
> [  372.658081]  ? pwq_dec_nr_in_flight+0x230/0x230
> [  372.664513]  ? rwlock_bug.part.0+0x90/0x90
> [  372.670529]  worker_thread+0x59e/0xf90
> [  372.676172]  ? process_one_work+0x1320/0x1320
> [  372.682440]  kthread+0x3b9/0x490
> [  372.687550]  ? _raw_spin_unlock_irq+0x24/0x50
> [  372.693811]  ? set_kthread_struct+0x100/0x100
> [  372.700052]  ret_from_fork+0x22/0x30
> [  372.705517]  </TASK>
> [  372.709747] INFO: task btrfs-transacti:2347 blocked for more than 123 seconds.
> [  372.729827]       Not tainted 5.16.0-rc8 #7
> [  372.745907] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  372.767106] task:btrfs-transacti state:D stack:    0 pid: 2347 ppid:     2 flags:0x00004000
> [  372.787776] Call Trace:
> [  372.801652]  <TASK>
> [  372.812961]  __schedule+0xb56/0x4850
> [  372.830011]  ? io_schedule_timeout+0x190/0x190
> [  372.852547]  ? lockdep_hardirqs_on+0x7e/0x100
> [  372.871761]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
> [  372.886792]  schedule+0xe0/0x270
> [  372.901685]  wait_current_trans+0x22c/0x310 [btrfs]
> [  372.919743]  ? btrfs_put_transaction+0x3d0/0x3d0 [btrfs]
> [  372.938923]  ? finish_wait+0x270/0x270
> [  372.959085]  ? join_transaction+0xc75/0xe30 [btrfs]
> [  372.977706]  start_transaction+0x938/0x10a0 [btrfs]
> [  372.997168]  transaction_kthread+0x19d/0x3c0 [btrfs]
> [  373.013021]  ? btrfs_cleanup_transaction.isra.0+0xfc0/0xfc0 [btrfs]
> [  373.031678]  kthread+0x3b9/0x490
> [  373.047420]  ? _raw_spin_unlock_irq+0x24/0x50
> [  373.064645]  ? set_kthread_struct+0x100/0x100
> [  373.078571]  ret_from_fork+0x22/0x30
> [  373.091197]  </TASK>
> [  373.105611] INFO: task btrfs:3145 blocked for more than 123 seconds.
> [  373.114147]       Not tainted 5.16.0-rc8 #7
> [  373.120401] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  373.130393] task:btrfs           state:D stack:    0 pid: 3145 ppid:  3141 flags:0x00004000
> [  373.140998] Call Trace:
> [  373.145501]  <TASK>
> [  373.149654]  __schedule+0xb56/0x4850
> [  373.155306]  ? io_schedule_timeout+0x190/0x190
> [  373.161965]  ? lockdep_hardirqs_on+0x7e/0x100
> [  373.168469]  ? _raw_spin_unlock_irqrestore+0x3e/0x60
> [  373.175468]  schedule+0xe0/0x270
> [  373.180814]  wait_for_commit+0x104/0x150 [btrfs]
> [  373.187643]  ? test_and_set_bit+0x20/0x20 [btrfs]
> [  373.194772]  ? kmem_cache_free+0x124/0x550
> [  373.201191]  ? btrfs_put_transaction+0x69/0x3d0 [btrfs]
> [  373.208738]  ? finish_wait+0x270/0x270
> [  373.214704]  ? __btrfs_end_transaction+0x347/0x7b0 [btrfs]
> [  373.222342]  btrfs_commit_transaction+0x44d/0x2610 [btrfs]
> [  373.230233]  ? join_transaction+0x255/0xe30 [btrfs]
> [  373.237334]  ? btrfs_record_root_in_trans+0x4d/0x170 [btrfs]
> [  373.245251]  ? btrfs_apply_pending_changes+0x50/0x50 [btrfs]
> [  373.253296]  relocate_block_group+0x105/0xc20 [btrfs]
> [  373.260533]  ? mutex_lock_io_nested+0x1270/0x1270
> [  373.267516]  ? btrfs_wait_nocow_writers+0x85/0x180 [btrfs]
> [  373.275155]  ? merge_reloc_roots+0x710/0x710 [btrfs]
> [  373.283602]  ? btrfs_wait_ordered_extents+0xd30/0xd30 [btrfs]
> [  373.291934]  ? kmem_cache_free+0x124/0x550
> [  373.298180]  btrfs_relocate_block_group+0x35c/0x930 [btrfs]
> [  373.306047]  btrfs_relocate_chunk+0x85/0x210 [btrfs]
> [  373.313229]  btrfs_balance+0x12f4/0x2d20 [btrfs]
> [  373.320227]  ? lock_release+0x3a9/0x6d0
> [  373.326206]  ? btrfs_relocate_chunk+0x210/0x210 [btrfs]
> [  373.333591]  ? lock_is_held_type+0xe4/0x140
> [  373.340031]  ? rcu_read_lock_sched_held+0x3f/0x70
> [  373.346910]  btrfs_ioctl_balance+0x548/0x700 [btrfs]
> [  373.354207]  btrfs_ioctl+0x7f2/0x71b0 [btrfs]
> [  373.360774]  ? lockdep_hardirqs_on_prepare+0x410/0x410
> [  373.367957]  ? lockdep_hardirqs_on_prepare+0x410/0x410
> [  373.375327]  ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs]
> [  373.383841]  ? find_held_lock+0x2c/0x110
> [  373.389993]  ? lock_release+0x3a9/0x6d0
> [  373.395828]  ? mntput_no_expire+0xf7/0xad0
> [  373.402083]  ? lock_is_held_type+0xe4/0x140
> [  373.408249]  ? vfs_fileattr_set+0x9f0/0x9f0
> [  373.414486]  ? selinux_file_ioctl+0x349/0x4e0
> [  373.420938]  ? trace_raw_output_lock+0xb4/0xe0
> [  373.427442]  ? selinux_inode_getsecctx+0x80/0x80
> [  373.434224]  ? lockdep_hardirqs_on+0x7e/0x100
> [  373.440660]  ? force_qs_rnp+0x2a0/0x6b0
> [  373.446534]  ? lock_is_held_type+0x9b/0x140
> [  373.452763]  ? __blkcg_punt_bio_submit+0x1b0/0x1b0
> [  373.459732]  ? security_file_ioctl+0x50/0x90
> [  373.466089]  __x64_sys_ioctl+0x127/0x190
> [  373.472022]  do_syscall_64+0x3b/0x90
> [  373.477513]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  373.484823] RIP: 0033:0x7f8f4af7e2bb
> [  373.490493] RSP: 002b:00007ffcbf936178 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
> [  373.500197] RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f8f4af7e2bb
> [  373.509451] RDX: 00007ffcbf936220 RSI: 00000000c4009420 RDI: 0000000000000003
> [  373.518659] RBP: 00007ffcbf93774a R08: 0000000000000013 R09: 00007f8f4b02d4e0
> [  373.527872] R10: 00007f8f4ae87740 R11: 0000000000000246 R12: 0000000000000001
> [  373.537222] R13: 00007ffcbf936220 R14: 0000000000000000 R15: 0000000000000002
> [  373.546506]  </TASK>
> [  373.550878] INFO: task btrfs:3146 blocked for more than 123 seconds.
> [  373.559383]       Not tainted 5.16.0-rc8 #7
> [  373.565748] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  373.575748] task:btrfs           state:D stack:    0 pid: 3146 ppid:  2168 flags:0x00000000
> [  373.586314] Call Trace:
> [  373.590846]  <TASK>
> [  373.595121]  __schedule+0xb56/0x4850
> [  373.600901]  ? __lock_acquire+0x23db/0x5030
> [  373.607176]  ? io_schedule_timeout+0x190/0x190
> [  373.613954]  schedule+0xe0/0x270
> [  373.619157]  schedule_timeout+0x168/0x220
> [  373.625170]  ? usleep_range_state+0x150/0x150
> [  373.631653]  ? mark_held_locks+0x9e/0xe0
> [  373.637767]  ? do_raw_spin_lock+0x11e/0x250
> [  373.643993]  ? lockdep_hardirqs_on_prepare+0x17b/0x410
> [  373.651267]  ? _raw_spin_unlock_irq+0x24/0x50
> [  373.657677]  ? lockdep_hardirqs_on+0x7e/0x100
> [  373.664103]  wait_for_completion+0x163/0x250
> [  373.670437]  ? bit_wait_timeout+0x160/0x160
> [  373.676585]  btrfs_quota_disable+0x176/0x9a0 [btrfs]
> [  373.683979]  ? btrfs_quota_enable+0x12f0/0x12f0 [btrfs]
> [  373.691340]  ? down_write+0xd0/0x130
> [  373.696880]  ? down_write_killable+0x150/0x150
> [  373.703352]  btrfs_ioctl+0x3945/0x71b0 [btrfs]
> [  373.710061]  ? find_held_lock+0x2c/0x110
> [  373.716192]  ? lock_release+0x3a9/0x6d0
> [  373.722047]  ? __handle_mm_fault+0x23cd/0x3050
> [  373.728486]  ? btrfs_ioctl_get_supported_features+0x20/0x20 [btrfs]
> [  373.737032]  ? set_pte+0x6a/0x90
> [  373.742271]  ? do_raw_spin_unlock+0x55/0x1f0
> [  373.748506]  ? lock_is_held_type+0xe4/0x140
> [  373.754792]  ? vfs_fileattr_set+0x9f0/0x9f0
> [  373.761083]  ? selinux_file_ioctl+0x349/0x4e0
> [  373.767521]  ? selinux_inode_getsecctx+0x80/0x80
> [  373.774247]  ? __up_read+0x182/0x6e0
> [  373.780026]  ? count_memcg_events.constprop.0+0x46/0x60
> [  373.787281]  ? up_write+0x460/0x460
> [  373.792932]  ? security_file_ioctl+0x50/0x90
> [  373.799232]  __x64_sys_ioctl+0x127/0x190
> [  373.805237]  do_syscall_64+0x3b/0x90
> [  373.810947]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> [  373.818102] RIP: 0033:0x7f1383ea02bb
> [  373.823847] RSP: 002b:00007fffeb4d71f8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
> [  373.833641] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1383ea02bb
> [  373.842961] RDX: 00007fffeb4d7210 RSI: 00000000c0109428 RDI: 0000000000000003
> [  373.852179] RBP: 0000000000000003 R08: 0000000000000003 R09: 0000000000000078
> [  373.861408] R10: 00007f1383daec78 R11: 0000000000000202 R12: 00007fffeb4d874a
> [  373.870647] R13: 0000000000493099 R14: 0000000000000001 R15: 0000000000000000
> [  373.879838]  </TASK>
> [  373.884018]
>                Showing all locks held in the system:
> [  373.894250] 3 locks held by kworker/4:1/58:
> [  373.900356] 1 lock held by khungtaskd/63:
> [  373.906333]  #0: ffffffff8945ff60 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x53/0x260
> [  373.917307] 3 locks held by kworker/u16:6/103:
> [  373.923938]  #0: ffff888127b4f138 ((wq_completion)btrfs-qgroup-rescan){+.+.}-{0:0}, at: process_one_work+0x712/0x1320
> [  373.936555]  #1: ffff88810b817dd8 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x73f/0x1320
> [  373.951109]  #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_qgroup_rescan_worker+0x1f6/0x10c0 [btrfs]
> [  373.964027] 2 locks held by less/1803:
> [  373.969982]  #0: ffff88813ed56098 (&tty->ldisc_sem){++++}-{0:0}, at: tty_ldisc_ref_wait+0x24/0x80
> [  373.981295]  #1: ffffc90000b3b2e8 (&ldata->atomic_read_lock){+.+.}-{3:3}, at: n_tty_read+0x9e2/0x1060
> [  373.992969] 1 lock held by btrfs-transacti/2347:
> [  373.999893]  #0: ffff88813d4887a8 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0xe3/0x3c0 [btrfs]
> [  374.015872] 3 locks held by btrfs/3145:
> [  374.022298]  #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl_balance+0xc3/0x700 [btrfs]
> [  374.034456]  #1: ffff88813d48a0a0 (&fs_info->reclaim_bgs_lock){+.+.}-{3:3}, at: btrfs_balance+0xfe5/0x2d20 [btrfs]
> [  374.047646]  #2: ffff88813d488838 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: btrfs_relocate_block_group+0x354/0x930 [btrfs]
> [  374.063295] 4 locks held by btrfs/3146:
> [  374.069647]  #0: ffff888102dd4460 (sb_writers#18){.+.+}-{0:0}, at: btrfs_ioctl+0x38b1/0x71b0 [btrfs]
> [  374.081601]  #1: ffff88813d488bb8 (&fs_info->subvol_sem){+.+.}-{3:3}, at: btrfs_ioctl+0x38fd/0x71b0 [btrfs]
> [  374.094283]  #2: ffff888102dd4650 (sb_internal#2){.+.+}-{0:0}, at: btrfs_quota_disable+0xc8/0x9a0 [btrfs]
> [  374.106885]  #3: ffff88813d489800 (&fs_info->qgroup_ioctl_lock){+.+.}-{3:3}, at: btrfs_quota_disable+0xd5/0x9a0 [btrfs]
> 
> [  374.126780] =============================================
> 
> To avoid the deadlock, wait for the qgroup rescan worker to complete
> before starting the transaction for the quota disable ioctl. With
> current implementation, BTRFS_FS_QUOTA_ENABLE flag is cleared to request
> the qgroup rescan worker to complete. However, this flag can not be
> cleared before the transaction for quota disable. Then add and use a new
> flag BTRFS_FS_STATE_QUOTA_DISABLING instead. To avoid another new qgroup
> rescan worker to start after the previous qgroup worker completed, check
> the BTRFS_FS_STATE_QUOTA_DISABLING flag in qgroup_rescan_init.
> 
> Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> ---
>  fs/btrfs/ctree.h  |  2 ++
>  fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
>  2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b4a9b1c58d22..fe275697e3eb 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -145,6 +145,8 @@ enum {
>  	BTRFS_FS_STATE_DUMMY_FS_INFO,
>  
>  	BTRFS_FS_STATE_NO_CSUMS,
> +	/* Quota is in disabling process */
> +	BTRFS_FS_STATE_QUOTA_DISABLING,

We had that already in the past, two bits handling quota state,
c2faff790ccd11ea5be8e "btrfs: remove BTRFS_FS_QUOTA_DISABLING flag".
It's not exactly the same change but reintroducing similar logic makes
me wonder what's wrong with the thread synchronization.
Nikolay Borisov Jan. 13, 2022, 12:44 p.m. UTC | #2
On 13.01.22 г. 12:40, Shin'ichiro Kawasaki wrote:
> Quota disable ioctl starts a transaction before waiting for the qgroup
> rescan worker completes. However, this wait can be infinite and results
> in deadlock because of circular dependency among the quota disable
> ioctl, the qgroup rescan worker and the other task with transaction such
> as block group relocation task.
> 
> The deadlock happens with the steps following:
> 
> 1) Task A calls ioctl to disable quota. It starts a transaction and
>    waits for qgroup rescan worker completes.
> 2) Task B such as block group relocation task starts a transaction and
>    joins to the transaction that task A started. Then task B commits to
>    the transaction. In this commit, task B waits for a commit by task A.
> 3) Task C as the qgroup rescan worker starts its job and starts a
>    transaction. In this transaction start, task C waits for completion
>    of the transaction that task A started and task B committed.
> 
> This deadlock was found with fstests test case block/115 and a zoned
> null_blk device. The test case enables and disables quota, and the
> block group reclaim was triggered during the quota disable by chance.
> The deadlock was also observed by running quota enable and disable in
> parallel with 'btrfs balance' command on regular null_blk devices.
> 


In btrfs_qgroup_rescan_worker can't we make the check for
BTRFS_FS_QUOTA_ENABLED be done _before_ starting the transaction by
refactoring the body of the loop? Even better will be to move the
FS_QUOTA_ENABLED check in rescan_should_stop. That way transaction
commit/quota disable will be delayed until the end of the current
qgroup_rescan_leaf invocation, no?
Nikolay Borisov Jan. 13, 2022, 12:53 p.m. UTC | #3
On 13.01.22 г. 12:40, Shin'ichiro Kawasaki wrote:
> Quota disable ioctl starts a transaction before waiting for the qgroup
> rescan worker completes. However, this wait can be infinite and results
> in deadlock because of circular dependency among the quota disable
> ioctl, the qgroup rescan worker and the other task with transaction such
> as block group relocation task.

<snip>

> Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> ---
>  fs/btrfs/ctree.h  |  2 ++
>  fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
>  2 files changed, 20 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index b4a9b1c58d22..fe275697e3eb 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -145,6 +145,8 @@ enum {
>  	BTRFS_FS_STATE_DUMMY_FS_INFO,
>  
>  	BTRFS_FS_STATE_NO_CSUMS,
> +	/* Quota is in disabling process */
> +	BTRFS_FS_STATE_QUOTA_DISABLING,
>  };
>  
>  #define BTRFS_BACKREF_REV_MAX		256
> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> index 8928275823a1..6f94da4896bc 100644
> --- a/fs/btrfs/qgroup.c
> +++ b/fs/btrfs/qgroup.c
> @@ -1188,6 +1188,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
>  	mutex_lock(&fs_info->qgroup_ioctl_lock);
>  	if (!fs_info->quota_root)
>  		goto out;
> +	/*
> +	 * Request qgroup rescan worker to complete and wait for it. This wait
> +	 * must be done before transaction start for quota disable since it may
> +	 * deadlock with transaction by the qgroup rescan worker.
> +	 */
> +	if (test_and_set_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> +			     &fs_info->fs_state)) {
> +		ret = -EBUSY;
> +		goto busy;
> +	}
> +	btrfs_qgroup_wait_for_completion(fs_info, false);

Actually you don't need to introduce a separate flag to have this logic,
simply clear QUOTA_ENABLED, do the wait, start the transaction - in case
of failure i.e not able to get a trans handle just set QUOTA_ENABLED
again. Still, moving QUOTA_ENABLED check in rescan_should_stop would be
useful as well.

>  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
>  
>  	/*
> @@ -1212,7 +1223,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
>  		goto out;
>  
>  	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
> -	btrfs_qgroup_wait_for_completion(fs_info, false);
>  	spin_lock(&fs_info->qgroup_lock);
>  	quota_root = fs_info->quota_root;
>  	fs_info->quota_root = NULL;
> @@ -1244,6 +1254,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
>  	btrfs_put_root(quota_root);
>  
>  out:
> +	clear_bit(BTRFS_FS_STATE_QUOTA_DISABLING, &fs_info->fs_state);
> +busy:
>  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
>  	if (ret && trans)
>  		btrfs_end_transaction(trans);
> @@ -3277,7 +3289,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
>  			err = PTR_ERR(trans);
>  			break;
>  		}
> -		if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> +		if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> +			     &fs_info->fs_state)) {
>  			err = -EINTR;
>  		} else {
>  			err = qgroup_rescan_leaf(trans, path);
> @@ -3378,6 +3391,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
>  			btrfs_warn(fs_info,
>  				   "qgroup rescan is already in progress");
>  			ret = -EINPROGRESS;
> +		} else if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> +				    &fs_info->fs_state)) {
> +			ret = -EBUSY;
>  		} else if (!(fs_info->qgroup_flags &
>  			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
>  			btrfs_warn(fs_info,
>
Shinichiro Kawasaki Jan. 14, 2022, 12:52 a.m. UTC | #4
On Jan 13, 2022 / 14:53, Nikolay Borisov wrote:
> 
> 
> On 13.01.22 г. 12:40, Shin'ichiro Kawasaki wrote:
> > Quota disable ioctl starts a transaction before waiting for the qgroup
> > rescan worker completes. However, this wait can be infinite and results
> > in deadlock because of circular dependency among the quota disable
> > ioctl, the qgroup rescan worker and the other task with transaction such
> > as block group relocation task.
> 
> <snip>
> 
> > Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
> > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > ---
> >  fs/btrfs/ctree.h  |  2 ++
> >  fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
> >  2 files changed, 20 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index b4a9b1c58d22..fe275697e3eb 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -145,6 +145,8 @@ enum {
> >  	BTRFS_FS_STATE_DUMMY_FS_INFO,
> >  
> >  	BTRFS_FS_STATE_NO_CSUMS,
> > +	/* Quota is in disabling process */
> > +	BTRFS_FS_STATE_QUOTA_DISABLING,
> >  };
> >  
> >  #define BTRFS_BACKREF_REV_MAX		256
> > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > index 8928275823a1..6f94da4896bc 100644
> > --- a/fs/btrfs/qgroup.c
> > +++ b/fs/btrfs/qgroup.c
> > @@ -1188,6 +1188,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
> >  	mutex_lock(&fs_info->qgroup_ioctl_lock);
> >  	if (!fs_info->quota_root)
> >  		goto out;
> > +	/*
> > +	 * Request qgroup rescan worker to complete and wait for it. This wait
> > +	 * must be done before transaction start for quota disable since it may
> > +	 * deadlock with transaction by the qgroup rescan worker.
> > +	 */
> > +	if (test_and_set_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> > +			     &fs_info->fs_state)) {
> > +		ret = -EBUSY;
> > +		goto busy;
> > +	}
> > +	btrfs_qgroup_wait_for_completion(fs_info, false);
> 
> Actually you don't need to introduce a separate flag to have this logic,
> simply clear QUOTA_ENABLED, do the wait, start the transaction - in case
> of failure i.e not able to get a trans handle just set QUOTA_ENABLED
> again. Still, moving QUOTA_ENABLED check in rescan_should_stop would be
> useful as well.

Nikolay, thank you for the comment.

The reason to introduce the new flag is that qgroup_ioctl_lock, which guards
QUOTA_ENABLED set by quota enable ioctl, is unlocked during the quota disable
ioctl process to start the transaction. Quote from btrfs_quota_disable():

	...
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	...

Then even if we would clear QUOTA_ENABLED and wait for qgroup worker completion
before the transaction start, QUOTA_ENABLED can be set again by quota enable
ioctl and another qgroup worker can start at the point of the transaction start.
Even though this scenario is very rare and unlikely, it can happen. So IMO, we
can not rely on QUOTA_ENABLED to wait for qgroup worker completion.

> 
> >  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
> >  
> >  	/*
> > @@ -1212,7 +1223,6 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
> >  		goto out;
> >  
> >  	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
> > -	btrfs_qgroup_wait_for_completion(fs_info, false);
> >  	spin_lock(&fs_info->qgroup_lock);
> >  	quota_root = fs_info->quota_root;
> >  	fs_info->quota_root = NULL;
> > @@ -1244,6 +1254,8 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
> >  	btrfs_put_root(quota_root);
> >  
> >  out:
> > +	clear_bit(BTRFS_FS_STATE_QUOTA_DISABLING, &fs_info->fs_state);
> > +busy:
> >  	mutex_unlock(&fs_info->qgroup_ioctl_lock);
> >  	if (ret && trans)
> >  		btrfs_end_transaction(trans);
> > @@ -3277,7 +3289,8 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
> >  			err = PTR_ERR(trans);
> >  			break;
> >  		}
> > -		if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> > +		if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> > +			     &fs_info->fs_state)) {
> >  			err = -EINTR;
> >  		} else {
> >  			err = qgroup_rescan_leaf(trans, path);
> > @@ -3378,6 +3391,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
> >  			btrfs_warn(fs_info,
> >  				   "qgroup rescan is already in progress");
> >  			ret = -EINPROGRESS;
> > +		} else if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> > +				    &fs_info->fs_state)) {
> > +			ret = -EBUSY;
> >  		} else if (!(fs_info->qgroup_flags &
> >  			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
> >  			btrfs_warn(fs_info,
> >
Shinichiro Kawasaki Jan. 14, 2022, 4:03 a.m. UTC | #5
On Jan 14, 2022 / 00:52, Shinichiro Kawasaki wrote:
> On Jan 13, 2022 / 14:53, Nikolay Borisov wrote:
> > 
> > 
> > On 13.01.22 г. 12:40, Shin'ichiro Kawasaki wrote:
> > > Quota disable ioctl starts a transaction before waiting for the qgroup
> > > rescan worker completes. However, this wait can be infinite and results
> > > in deadlock because of circular dependency among the quota disable
> > > ioctl, the qgroup rescan worker and the other task with transaction such
> > > as block group relocation task.
> > 
> > <snip>
> > 
> > > Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
> > > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > > ---
> > >  fs/btrfs/ctree.h  |  2 ++
> > >  fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
> > >  2 files changed, 20 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > > index b4a9b1c58d22..fe275697e3eb 100644
> > > --- a/fs/btrfs/ctree.h
> > > +++ b/fs/btrfs/ctree.h
> > > @@ -145,6 +145,8 @@ enum {
> > >  	BTRFS_FS_STATE_DUMMY_FS_INFO,
> > >  
> > >  	BTRFS_FS_STATE_NO_CSUMS,
> > > +	/* Quota is in disabling process */
> > > +	BTRFS_FS_STATE_QUOTA_DISABLING,
> > >  };
> > >  
> > >  #define BTRFS_BACKREF_REV_MAX		256
> > > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > > index 8928275823a1..6f94da4896bc 100644
> > > --- a/fs/btrfs/qgroup.c
> > > +++ b/fs/btrfs/qgroup.c
> > > @@ -1188,6 +1188,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
> > >  	mutex_lock(&fs_info->qgroup_ioctl_lock);
> > >  	if (!fs_info->quota_root)
> > >  		goto out;
> > > +	/*
> > > +	 * Request qgroup rescan worker to complete and wait for it. This wait
> > > +	 * must be done before transaction start for quota disable since it may
> > > +	 * deadlock with transaction by the qgroup rescan worker.
> > > +	 */
> > > +	if (test_and_set_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> > > +			     &fs_info->fs_state)) {
> > > +		ret = -EBUSY;
> > > +		goto busy;
> > > +	}
> > > +	btrfs_qgroup_wait_for_completion(fs_info, false);
> > 
> > Actually you don't need to introduce a separate flag to have this logic,
> > simply clear QUOTA_ENABLED, do the wait, start the transaction - in case
> > of failure i.e not able to get a trans handle just set QUOTA_ENABLED
> > again. Still, moving QUOTA_ENABLED check in rescan_should_stop would be
> > useful as well.
> 
> Nikolay, thank you for the comment.
> 
> The reason to introduce the new flag is that qgroup_ioctl_lock, which guards
> QUOTA_ENABLED set by quota enable ioctl, is unlocked during the quota disable
> ioctl process to start the transaction. Quote from btrfs_quota_disable():
> 
> 	...
> 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
> 
> 	/*
> 	 * 1 For the root item
> 	 *
> 	 * We should also reserve enough items for the quota tree deletion in
> 	 * btrfs_clean_quota_tree but this is not done.
> 	 *
> 	 * Also, we must always start a transaction without holding the mutex
> 	 * qgroup_ioctl_lock, see btrfs_quota_enable().
> 	 */
> 	trans = btrfs_start_transaction(fs_info->tree_root, 1);
> 
> 	mutex_lock(&fs_info->qgroup_ioctl_lock);
> 	...
> 
> Then even if we would clear QUOTA_ENABLED and wait for qgroup worker completion
> before the transaction start, QUOTA_ENABLED can be set again by quota enable
> ioctl and another qgroup worker can start at the point of the transaction start.
> Even though this scenario is very rare and unlikely, it can happen. So IMO, we
> can not rely on QUOTA_ENABLED to wait for qgroup worker completion.

I relooked at btrfs_quota_enable() and btrfs_quota_disable() and found that my
response above is wrong. btrfs_quota_enable() checks fs_info->quota_root to see
if quota is enabled or not. So while QUOTA_ENABLED is cleared in quota disabling
process, quota enable does not run, regardless of the qgroup_ioctl_lock unlock
in btrfs_quota_disable().

With this, Nikolay's suggestion looks the good idea. Will rework the patch.
Shinichiro Kawasaki Jan. 15, 2022, 5:36 a.m. UTC | #6
On Jan 14, 2022 / 04:03, Shinichiro Kawasaki wrote:
> On Jan 14, 2022 / 00:52, Shinichiro Kawasaki wrote:
> > On Jan 13, 2022 / 14:53, Nikolay Borisov wrote:
> > > 
> > > 
> > > On 13.01.22 г. 12:40, Shin'ichiro Kawasaki wrote:
> > > > Quota disable ioctl starts a transaction before waiting for the qgroup
> > > > rescan worker completes. However, this wait can be infinite and results
> > > > in deadlock because of circular dependency among the quota disable
> > > > ioctl, the qgroup rescan worker and the other task with transaction such
> > > > as block group relocation task.
> > > 
> > > <snip>
> > > 
> > > > Suggested-by: Naohiro Aota <naohiro.aota@wdc.com>
> > > > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > > > ---
> > > >  fs/btrfs/ctree.h  |  2 ++
> > > >  fs/btrfs/qgroup.c | 20 ++++++++++++++++++--
> > > >  2 files changed, 20 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > > > index b4a9b1c58d22..fe275697e3eb 100644
> > > > --- a/fs/btrfs/ctree.h
> > > > +++ b/fs/btrfs/ctree.h
> > > > @@ -145,6 +145,8 @@ enum {
> > > >  	BTRFS_FS_STATE_DUMMY_FS_INFO,
> > > >  
> > > >  	BTRFS_FS_STATE_NO_CSUMS,
> > > > +	/* Quota is in disabling process */
> > > > +	BTRFS_FS_STATE_QUOTA_DISABLING,
> > > >  };
> > > >  
> > > >  #define BTRFS_BACKREF_REV_MAX		256
> > > > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > > > index 8928275823a1..6f94da4896bc 100644
> > > > --- a/fs/btrfs/qgroup.c
> > > > +++ b/fs/btrfs/qgroup.c
> > > > @@ -1188,6 +1188,17 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
> > > >  	mutex_lock(&fs_info->qgroup_ioctl_lock);
> > > >  	if (!fs_info->quota_root)
> > > >  		goto out;
> > > > +	/*
> > > > +	 * Request qgroup rescan worker to complete and wait for it. This wait
> > > > +	 * must be done before transaction start for quota disable since it may
> > > > +	 * deadlock with transaction by the qgroup rescan worker.
> > > > +	 */
> > > > +	if (test_and_set_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
> > > > +			     &fs_info->fs_state)) {
> > > > +		ret = -EBUSY;
> > > > +		goto busy;
> > > > +	}
> > > > +	btrfs_qgroup_wait_for_completion(fs_info, false);
> > > 
> > > Actually you don't need to introduce a separate flag to have this logic,
> > > simply clear QUOTA_ENABLED, do the wait, start the transaction - in case
> > > of failure i.e not able to get a trans handle just set QUOTA_ENABLED

I have just posted v2 patch based on this approach. Review comments will be
appreciated.

> > > again. Still, moving QUOTA_ENABLED check in rescan_should_stop would be
> > > useful as well.

I saw Nikolay posted this part as a separated patch, so it is not included in
the v2 patch.

https://lore.kernel.org/linux-btrfs/20220113151618.2149736-1-nborisov@suse.com/T/#u
diff mbox series

Patch

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b4a9b1c58d22..fe275697e3eb 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -145,6 +145,8 @@  enum {
 	BTRFS_FS_STATE_DUMMY_FS_INFO,
 
 	BTRFS_FS_STATE_NO_CSUMS,
+	/* Quota is in disabling process */
+	BTRFS_FS_STATE_QUOTA_DISABLING,
 };
 
 #define BTRFS_BACKREF_REV_MAX		256
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 8928275823a1..6f94da4896bc 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1188,6 +1188,17 @@  int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	mutex_lock(&fs_info->qgroup_ioctl_lock);
 	if (!fs_info->quota_root)
 		goto out;
+	/*
+	 * Request qgroup rescan worker to complete and wait for it. This wait
+	 * must be done before transaction start for quota disable since it may
+	 * deadlock with transaction by the qgroup rescan worker.
+	 */
+	if (test_and_set_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
+			     &fs_info->fs_state)) {
+		ret = -EBUSY;
+		goto busy;
+	}
+	btrfs_qgroup_wait_for_completion(fs_info, false);
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 
 	/*
@@ -1212,7 +1223,6 @@  int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 		goto out;
 
 	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
-	btrfs_qgroup_wait_for_completion(fs_info, false);
 	spin_lock(&fs_info->qgroup_lock);
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
@@ -1244,6 +1254,8 @@  int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
 	btrfs_put_root(quota_root);
 
 out:
+	clear_bit(BTRFS_FS_STATE_QUOTA_DISABLING, &fs_info->fs_state);
+busy:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
 	if (ret && trans)
 		btrfs_end_transaction(trans);
@@ -3277,7 +3289,8 @@  static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
 			err = PTR_ERR(trans);
 			break;
 		}
-		if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+		if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
+			     &fs_info->fs_state)) {
 			err = -EINTR;
 		} else {
 			err = qgroup_rescan_leaf(trans, path);
@@ -3378,6 +3391,9 @@  qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
 			btrfs_warn(fs_info,
 				   "qgroup rescan is already in progress");
 			ret = -EINPROGRESS;
+		} else if (test_bit(BTRFS_FS_STATE_QUOTA_DISABLING,
+				    &fs_info->fs_state)) {
+			ret = -EBUSY;
 		} else if (!(fs_info->qgroup_flags &
 			     BTRFS_QGROUP_STATUS_FLAG_ON)) {
 			btrfs_warn(fs_info,