Message ID | 20220829131502.165356-3-yukuai1@huaweicloud.com (mailing list archive)
---|---
State | New, archived
Delegated to | Song Liu
Series | md/raid10: reduce lock contention for io
Hi,

On 2022-08-29 07:15, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
>
> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
> and io can't be dispatched until 'barrier' is dropped.
>
> Since holding the 'barrier' is not common, convert 'resync_lock' to use
> seqlock so that holding lock can be avoided in fast path.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

I've found some lockdep issues starting with this patch in md-next while
running mdadm tests (specifically 00raid10 when run about 10 times in a
row).

I've seen a couple of different lockdep errors. The first seems to be
reproducible on this patch; it then possibly changes to the second on
subsequent patches. Not sure exactly. I haven't dug into it too deeply,
but hopefully it can be fixed easily.

Logan

--

================================
WARNING: inconsistent lock state
6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604 Not tainted
--------------------------------
inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
fsck.ext3/1695 [HC0[0]:SC0[0]:HE0:SE1] takes:
ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at: raid10_read_request+0x21f/0x760
    (raid10.c:1134)

{IN-SOFTIRQ-W} state was registered at:
  lock_acquire+0x183/0x440
  lower_barrier+0x5e/0xd0
  end_sync_request+0x178/0x180
  end_sync_write+0x193/0x380
  bio_endio+0x346/0x3a0
  blk_update_request+0x1eb/0x7c0
  blk_mq_end_request+0x30/0x50
  lo_complete_rq+0xb7/0x100
  blk_complete_reqs+0x77/0x90
  blk_done_softirq+0x38/0x40
  __do_softirq+0x10c/0x650
  run_ksoftirqd+0x48/0x80
  smpboot_thread_fn+0x302/0x400
  kthread+0x18c/0x1c0
  ret_from_fork+0x1f/0x30

irq event stamp: 8930
hardirqs last  enabled at (8929): [<ffffffff96df8351>] _raw_spin_unlock_irqrestore+0x31/0x60
hardirqs last disabled at (8930): [<ffffffff96df7fc5>] _raw_spin_lock_irq+0x75/0x90
softirqs last  enabled at (6768): [<ffffffff9554970e>] __irq_exit_rcu+0xfe/0x150
softirqs last disabled at (6757): [<ffffffff9554970e>] __irq_exit_rcu+0xfe/0x150

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(&____s->seqcount#10);
  <Interrupt>
    lock(&____s->seqcount#10);

 *** DEADLOCK ***

2 locks held by fsck.ext3/1695:
 #0: ffff8881007d0930 (mapping.invalidate_lock#2){++++}-{3:3}, at: page_cache_ra_unbounded+0xaf/0x250
 #1: ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at: raid10_read_request+0x21f/0x760

stack backtrace:
CPU: 0 PID: 1695 Comm: fsck.ext3 Not tainted 6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x5a/0x74
 dump_stack+0x10/0x12
 print_usage_bug.part.0+0x233/0x246
 mark_lock.part.0.cold+0x73/0x14f
 mark_held_locks+0x71/0xa0
 lockdep_hardirqs_on_prepare+0x158/0x230
 trace_hardirqs_on+0x34/0x100
 _raw_spin_unlock_irq+0x28/0x60
 wait_barrier+0x4a6/0x720            (raid10.c:1004)
 raid10_read_request+0x21f/0x760
 raid10_make_request+0x2d6/0x2160
 md_handle_request+0x3f3/0x5b0
 md_submit_bio+0xd9/0x120
 __submit_bio+0x9d/0x100
 submit_bio_noacct_nocheck+0x1fd/0x470
 submit_bio_noacct+0x4c2/0xbb0
 submit_bio+0x3f/0xf0
 mpage_readahead+0x323/0x3b0
 blkdev_readahead+0x15/0x20
 read_pages+0x136/0x7a0
 page_cache_ra_unbounded+0x18d/0x250
 page_cache_ra_order+0x2c9/0x400
 ondemand_readahead+0x320/0x730
 page_cache_sync_ra+0xa6/0xb0
 filemap_get_pages+0x1eb/0xc00
 filemap_read+0x1f1/0x770
 blkdev_read_iter+0x164/0x310
 vfs_read+0x467/0x5a0
 __x64_sys_pread64+0x122/0x160
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0

--

======================================================
WARNING: possible circular locking dependency detected
6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600 Not tainted
------------------------------------------------------
systemd-udevd/292 is trying to acquire lock:
ffff88817b644170 (&(&conf->resync_lock)->lock){....}-{2:2}, at: wait_barrier+0x4fe/0x770

but task is already holding lock:
ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at: raid10_read_request+0x21f/0x760
    (raid10.c:1140 wait_barrier(); raid10.c:1204 regular_request_wait())

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&____s->seqcount#11){+.+.}-{0:0}:
   raise_barrier+0xe0/0x300           (raid10.c:940 write_seqlock_irq())
   raid10_sync_request+0x629/0x4750   (raid10.c:3689 raise_barrier())
   md_do_sync.cold+0x8ec/0x1491
   md_thread+0x19d/0x2d0
   kthread+0x18c/0x1c0
   ret_from_fork+0x1f/0x30

-> #0 (&(&conf->resync_lock)->lock){....}-{2:2}:
   __lock_acquire+0x1cb4/0x3170
   lock_acquire+0x183/0x440
   _raw_spin_lock_irq+0x4d/0x90
   wait_barrier+0x4fe/0x770
   raid10_read_request+0x21f/0x760    (raid10.c:1140 wait_barrier(); raid10.c:1204 regular_request_wait())
   raid10_make_request+0x2d6/0x2190
   md_handle_request+0x3f3/0x5b0
   md_submit_bio+0xd9/0x120
   __submit_bio+0x9d/0x100
   submit_bio_noacct_nocheck+0x1fd/0x470
   submit_bio_noacct+0x4c2/0xbb0
   submit_bio+0x3f/0xf0
   submit_bh_wbc+0x270/0x2a0
   block_read_full_folio+0x37c/0x580
   blkdev_read_folio+0x18/0x20
   filemap_read_folio+0x3f/0x110
   do_read_cache_folio+0x13b/0x2c0
   read_cache_folio+0x42/0x50
   read_part_sector+0x74/0x1c0
   read_lba+0x176/0x2a0
   efi_partition+0x1ce/0xdd0
   bdev_disk_changed+0x2e7/0x6a0
   blkdev_get_whole+0xd2/0x140
   blkdev_get_by_dev.part.0+0x37f/0x570
   blkdev_get_by_dev+0x51/0x60
   disk_scan_partitions+0xad/0xf0
   blkdev_common_ioctl+0x3f3/0xdf0
   blkdev_ioctl+0x1e1/0x450
   __x64_sys_ioctl+0xc0/0x100
   do_syscall_64+0x35/0x80
   entry_SYSCALL_64_after_hwframe+0x46/0xb0

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&____s->seqcount#11);
                               lock(&(&conf->resync_lock)->lock);
                               lock(&____s->seqcount#11);
  lock(&(&conf->resync_lock)->lock);

 *** DEADLOCK ***

2 locks held by systemd-udevd/292:
 #0: ffff88817a532528 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_get_by_dev.part.0+0x180/0x570
 #1: ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at: raid10_read_request+0x21f/0x760

stack backtrace:
CPU: 3 PID: 292 Comm: systemd-udevd Not tainted 6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0x5a/0x74
 dump_stack+0x10/0x12
 print_circular_bug.cold+0x146/0x14b
 check_noncircular+0x1ff/0x250
 __lock_acquire+0x1cb4/0x3170
 lock_acquire+0x183/0x440
 _raw_spin_lock_irq+0x4d/0x90
 wait_barrier+0x4fe/0x770
 raid10_read_request+0x21f/0x760
 raid10_make_request+0x2d6/0x2190
 md_handle_request+0x3f3/0x5b0
 md_submit_bio+0xd9/0x120
 __submit_bio+0x9d/0x100
 submit_bio_noacct_nocheck+0x1fd/0x470
 submit_bio_noacct+0x4c2/0xbb0
 submit_bio+0x3f/0xf0
 submit_bh_wbc+0x270/0x2a0
 block_read_full_folio+0x37c/0x580
 blkdev_read_folio+0x18/0x20
 filemap_read_folio+0x3f/0x110
 do_read_cache_folio+0x13b/0x2c0
 read_cache_folio+0x42/0x50
 read_part_sector+0x74/0x1c0
 read_lba+0x176/0x2a0
 efi_partition+0x1ce/0xdd0
 bdev_disk_changed+0x2e7/0x6a0
 blkdev_get_whole+0xd2/0x140
 blkdev_get_by_dev.part.0+0x37f/0x570
 blkdev_get_by_dev+0x51/0x60
 disk_scan_partitions+0xad/0xf0
 blkdev_common_ioctl+0x3f3/0xdf0
 blkdev_ioctl+0x1e1/0x450
 __x64_sys_ioctl+0xc0/0x100
 do_syscall_64+0x35/0x80
On 9/2/22 2:41 AM, Logan Gunthorpe wrote:
> Hi,
>
> On 2022-08-29 07:15, Yu Kuai wrote:
>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>> seqlock so that holding lock can be avoided in fast path.
>
> I've found some lockdep issues starting with this patch in md-next while
> running mdadm tests (specifically 00raid10 when run about 10 times in a
> row).
>
> I've seen a couple of different lockdep errors. The first seems to be
> reproducible on this patch; it then possibly changes to the second on
> subsequent patches. Not sure exactly.

That's why I said "try mdadm test suites too to avoid regression." ...

Guoqing
On 2022-09-01 18:49, Guoqing Jiang wrote:
> On 9/2/22 2:41 AM, Logan Gunthorpe wrote:
>> I've seen a couple of different lockdep errors. The first seems to be
>> reproducible on this patch; it then possibly changes to the second on
>> subsequent patches. Not sure exactly.
>
> That's why I said "try mdadm test suites too to avoid regression." ...

You may have to run it multiple times; a single run tends not to catch
all errors. I had to loop the noted test 10 times to be sure I hit this
every time when I did the simple bisect.

And ensure that all the debug options are on when you run it (take a
look at the Kernel Hacking section in menuconfig). You won't hit this
bug without at least CONFIG_PROVE_LOCKING=y.

Logan
On 9/2/22 8:56 AM, Logan Gunthorpe wrote:
> On 2022-09-01 18:49, Guoqing Jiang wrote:
>> That's why I said "try mdadm test suites too to avoid regression." ...
>
> You may have to run it multiple times; a single run tends not to catch
> all errors. I had to loop the noted test 10 times to be sure I hit this
> every time when I did the simple bisect.
>
> And ensure that all the debug options are on when you run it (take a
> look at the Kernel Hacking section in menuconfig). You won't hit this
> bug without at least CONFIG_PROVE_LOCKING=y.

Yes, we definitely need to enable that option to test changes to the
locking code.

Thanks,
Guoqing
Hi,

On 2022/09/02 2:41, Logan Gunthorpe wrote:
> On 2022-08-29 07:15, Yu Kuai wrote:
>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>> seqlock so that holding lock can be avoided in fast path.
>
> I've found some lockdep issues starting with this patch in md-next while
> running mdadm tests (specifically 00raid10 when run about 10 times in a
> row).
>
> I've seen a couple of different lockdep errors. The first seems to be
> reproducible on this patch; it then possibly changes to the second on
> subsequent patches. Not sure exactly.

Thanks for the test.

I think this is a false positive because of the special usage here.
For example, in raise_barrier():

write_seqlock_irq
  spin_lock_irq()
    lock_acquire
  do_write_seqcount_begin
    seqcount_acquire

wait_event_lock_irq_cmd
  spin_unlock_irq  -> the lock is released while the seqcount is still
                      held; if another context takes the lock now,
                      lockdep will trigger the warning.
  ...
  spin_lock_irq

write_sequnlock_irq

Functionality should be OK; I'll try to find a way to prevent such
warnings.

Thanks,
Kuai

> I haven't dug into it too deeply, but hopefully it can be fixed easily.
>
> Logan
>
> [quoted lockdep reports snipped]
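To make the sequence above concrete, here is a minimal userspace sketch of
the state Kuai describes. This is toy code, not the kernel implementation:
a pthread mutex and a C11 atomic counter stand in for the spinlock and
seqcount_t that make up a seqlock_t. The point it demonstrates is that the
wait path drops only the lock while the sequence count stays odd, so the
seqcount write section is still "open" -- the inconsistent state lockdep
flags.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct toy_seqlock {
	pthread_mutex_t lock;	/* stands in for spinlock_t */
	atomic_uint seq;	/* stands in for seqcount_t; odd => writer active */
};

static void toy_write_seqlock(struct toy_seqlock *s)
{
	pthread_mutex_lock(&s->lock);
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* now odd */
}

static void toy_write_sequnlock(struct toy_seqlock *s)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct toy_seqlock s = { PTHREAD_MUTEX_INITIALIZER, 0 };

	toy_write_seqlock(&s);
	/*
	 * This is what wait_event_lock_irq() does inside raise_barrier():
	 * drop only the lock and sleep. The sequence count is still odd,
	 * so the write section stays open while the lock is free for
	 * other contexts to take -- the state lockdep complains about.
	 */
	pthread_mutex_unlock(&s.lock);
	printf("lock dropped, seq=%u (odd: write section still open)\n",
	       atomic_load(&s.seq));
	pthread_mutex_lock(&s.lock);	/* reacquired after "wakeup" */
	toy_write_sequnlock(&s);
	return 0;
}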
Hi, Logan

On 2022/09/02 9:21, Yu Kuai wrote:
> I think this is a false positive because of the special usage here.
> [...]
> Functionality should be OK; I'll try to find a way to prevent such
> warnings.

Can you try the following patch? I'm running mdadm tests myself and I
didn't see any problems yet.

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2f7c8bef6dc2..317bd862f40a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -940,16 +940,16 @@ static void raise_barrier(struct r10conf *conf, int force)
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock.lock);
+	wait_event_seqlock_irq(conf->wait_barrier, force || !conf->nr_waiting,
+			       conf->resync_lock);
 
 	/* block any new IO from starting */
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock.lock);
+	wait_event_seqlock_irq(conf->wait_barrier,
+			       !atomic_read(&conf->nr_pending) &&
+			       conf->barrier < RESYNC_DEPTH, conf->resync_lock);
 
 	write_sequnlock_irq(&conf->resync_lock);
 }
@@ -1007,7 +1007,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 			ret = false;
 		} else {
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
+			wait_event_seqlock_irq(conf->wait_barrier,
 					    !conf->barrier ||
 					    (atomic_read(&conf->nr_pending) &&
 					     bio_list &&
@@ -1020,7 +1020,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 					     test_bit(MD_RECOVERY_RUNNING,
 						      &conf->mddev->recovery) &&
 					     conf->nr_queued > 0),
-					    conf->resync_lock.lock);
+					    conf->resync_lock);
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1058,10 +1058,9 @@ static void freeze_array(struct r10conf *conf, int extra)
 	conf->array_freeze_pending++;
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-			atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-			conf->resync_lock.lock,
-			flush_pending_writes(conf));
+	wait_event_seqlock_irq_cmd(conf->wait_barrier,
+			atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+			conf->resync_lock, flush_pending_writes(conf));
 	conf->array_freeze_pending--;
 
 	write_sequnlock_irq(&conf->resync_lock);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 58cfbf81447c..97d6b378e40c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -977,6 +977,13 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 			    schedule();					\
 			    spin_lock_irq(&lock))
 
+#define __wait_event_seqlock_irq(wq_head, condition, lock, cmd)	\
+	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+			    write_sequnlock_irq(&lock);			\
+			    cmd;					\
+			    schedule();					\
+			    write_seqlock_irq(&lock))
+
 /**
  * wait_event_lock_irq_cmd - sleep until a condition gets true. The
  *			     condition is checked under the lock. This
@@ -1007,6 +1014,13 @@ do {									\
 		__wait_event_lock_irq(wq_head, condition, lock, cmd);	\
 } while (0)
 
+#define wait_event_seqlock_irq_cmd(wq_head, condition, lock, cmd)	\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_event_seqlock_irq(wq_head, condition, lock, cmd);	\
+} while (0)
+
 /**
  * wait_event_lock_irq - sleep until a condition gets true. The
  *			 condition is checked under the lock. This
@@ -1034,6 +1048,12 @@ do {									\
 	__wait_event_lock_irq(wq_head, condition, lock, );		\
 } while (0)
 
+#define wait_event_seqlock_irq(wq_head, condition, lock)		\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_event_seqlock_irq(wq_head, condition, lock, );		\
+} while (0)
 
 #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd) \
 	___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,	\

> Thanks,
> Kuai
>
> [quoted lockdep reports snipped]
Hi,

On 8/29/22 9:15 PM, Yu Kuai wrote:
> +static bool wait_barrier_nolock(struct r10conf *conf)
> +{
> +	unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
> +
> +	if (seq & 1)
> +		return false;
> +
> +	if (READ_ONCE(conf->barrier))
> +		return false;
> +
> +	atomic_inc(&conf->nr_pending);
> +	if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))

I think 'seq' is usually obtained from read_seqcount_begin.

> +		return true;
> +
> +	atomic_dec(&conf->nr_pending);
> +	return false;

Thanks,
Guoqing
Hi,

On 2022/09/02 17:42, Guoqing Jiang wrote:
> On 8/29/22 9:15 PM, Yu Kuai wrote:
>> +	unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>> +
>> +	if (seq & 1)
>> +		return false;
>
> I think 'seq' is usually obtained from read_seqcount_begin.

read_seqcount_begin will loop until "seq & 1" fails; I'm afraid this
will cause high cpu usage in some cases.

What I try to do here is just try once, and fall back to holding the
lock and waiting if that fails.

What do you think?

Thanks,
Kuai
On 9/2/22 6:02 PM, Yu Kuai wrote:
> read_seqcount_begin will loop until "seq & 1" fails; I'm afraid this
> will cause high cpu usage in some cases.
>
> What I try to do here is just try once, and fall back to holding the
> lock and waiting if that fails.

Thanks for the explanation.

I'd suggest trying the read_seqcount_begin/read_seqcount_retry pattern,
because it is a common usage in the kernel I think, then checking
whether the performance drops or not. Maybe it is related to the
lockdep issue, but I am not sure.

Thanks,
Guoqing
Hi,

On 2022/09/02 18:16, Guoqing Jiang wrote:
> I'd suggest trying the read_seqcount_begin/read_seqcount_retry pattern,
> because it is a common usage in the kernel I think, then checking
> whether the performance drops or not. Maybe it is related to the
> lockdep issue, but I am not sure.

I can try read_seqcount_begin/read_seqcount_retry.

Please take a look at the other thread: the lockdep issue is due to
inconsistent usage of the lock and seqcount inside the seqlock --
wait_event() only releases the lock; the seqcount is not released.

Thanks,
Kuai
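For readers comparing the two read-side options discussed above, here is a
userspace sketch of the difference. Again, this is toy code: a C11 atomic
stands in for seqcount_t, and the real kernel helpers live in
include/linux/seqlock.h. read_seqcount_begin() spins until the count is
even, while the raw_read_seqcount() approach in wait_barrier_nolock()
samples the count once and lets the caller fall back to the locked slow
path instead of busy-waiting.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint seq;	/* stands in for conf->resync_lock.seqcount */

/* raw_read_seqcount() style: sample once; caller falls back on failure */
static bool read_try_once(unsigned int *start)
{
	*start = atomic_load_explicit(&seq, memory_order_acquire);
	return !(*start & 1);	/* odd => writer active => give up */
}

/* read_seqcount_begin() style: spin until no writer is active */
static unsigned int read_begin(void)
{
	unsigned int s;

	while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1)
		;	/* busy-wait: the cpu cost Kuai wants to avoid */
	return s;
}

/* shared retry check: did a writer run while we were in the section? */
static bool read_retry(unsigned int start)
{
	/* order the data reads before re-reading the counter */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&seq, memory_order_relaxed) != start;
}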
On 2022-09-02 02:14, Yu Kuai wrote:
> Can you try the following patch? I'm running mdadm tests myself and I
> didn't see any problems yet.

Yes, that patch seems to fix the issue.

However, may I suggest we do this without trying to introduce new
helpers into wait.h? I suspect that could result in a fair amount of
bike shedding and delay. wait_event_cmd() is often used in situations
where a specific lock type doesn't have a helper.

My stab at it is in a diff below, which also fixes the bug.

I'd also recommend somebody clean up that nasty condition in
wait_barrier(): put it into an appropriately named function with some
comments. As is, it is pretty much unreadable.

Logan

--

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0e3229ee1ebc..ae297bc870bd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -934,22 +934,26 @@ static void flush_pending_writes(struct r10conf *conf)
  *    lower_barrier when the particular background IO completes.
  */
 
+#define wait_event_barrier_cmd(conf, cond, cmd)				\
+	wait_event_cmd((conf)->wait_barrier, cond,			\
+		       write_sequnlock_irq(&(conf)->resync_lock); cmd,	\
+		       write_seqlock_irq(&(conf)->resync_lock))
+#define wait_event_barrier(conf, cond) wait_event_barrier_cmd(conf, cond, )
+
 static void raise_barrier(struct r10conf *conf, int force)
 {
 	write_seqlock_irq(&conf->resync_lock);
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock.lock);
+	wait_event_barrier(conf, force || !conf->nr_waiting);
 
 	/* block any new IO from starting */
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock.lock);
+	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
+			   conf->barrier < RESYNC_DEPTH);
 
 	write_sequnlock_irq(&conf->resync_lock);
 }
@@ -1007,20 +1011,19 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 			ret = false;
 		} else {
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
-					    !conf->barrier ||
-					    (atomic_read(&conf->nr_pending) &&
-					     bio_list &&
-					     (!bio_list_empty(&bio_list[0]) ||
-					      !bio_list_empty(&bio_list[1]))) ||
+			wait_event_barrier(conf,
+					   !conf->barrier ||
+					   (atomic_read(&conf->nr_pending) &&
+					    bio_list &&
+					    (!bio_list_empty(&bio_list[0]) ||
+					     !bio_list_empty(&bio_list[1]))) ||
 					    /* move on if recovery thread is
 					     * blocked by us
 					     */
-					    (conf->mddev->thread->tsk == current &&
-					     test_bit(MD_RECOVERY_RUNNING,
-						      &conf->mddev->recovery) &&
-					     conf->nr_queued > 0),
-					    conf->resync_lock.lock);
+					   (conf->mddev->thread->tsk == current &&
+					    test_bit(MD_RECOVERY_RUNNING,
+						     &conf->mddev->recovery) &&
+					    conf->nr_queued > 0));
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1058,10 +1061,9 @@ static void freeze_array(struct r10conf *conf, int extra)
 	conf->array_freeze_pending++;
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-			atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-			conf->resync_lock.lock,
-			flush_pending_writes(conf));
+	wait_event_barrier_cmd(conf,
+			atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+			flush_pending_writes(conf));
 
 	conf->array_freeze_pending--;
 	write_sequnlock_irq(&conf->resync_lock);
Hi,

On 2022/09/03 1:03, Logan Gunthorpe wrote:
> On 2022-09-02 02:14, Yu Kuai wrote:
>> Can you try the following patch? I'm running mdadm tests myself and I
>> didn't see any problems yet.
>
> Yes, that patch seems to fix the issue.
>
> However, may I suggest we do this without trying to introduce new
> helpers into wait.h? I suspect that could result in a fair amount of
> bike shedding and delay. wait_event_cmd() is often used in situations
> where a specific lock type doesn't have a helper.

Yes, that sounds good.

> My stab at it is in a diff below, which also fixes the bug.
>
> I'd also recommend somebody clean up that nasty condition in
> wait_barrier(): put it into an appropriately named function with some
> comments. As is, it is pretty much unreadable.

While we're at it, I can take a look.

Thanks,
Kuai

> [quoted diff snipped]
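For context on why Logan's approach works without touching wait.h:
wait_event_cmd() is an existing generic helper that runs one statement
before each sleep and another after each wakeup. Its shape is roughly the
following (paraphrased from include/linux/wait.h from memory; check the
actual tree for the exact definition). wait_event_barrier_cmd() above
plugs write_sequnlock_irq()/write_seqlock_irq() in as the two commands, so
the whole seqlock -- seqcount included -- is released around the sleep,
which is what avoids the lockdep report.

/*
 * Approximate shape of wait_event_cmd() (paraphrased, not verbatim):
 * check the condition, and if it does not hold, run cmd1, sleep,
 * run cmd2, and re-check, repeating until the condition is true.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)			\
do {									\
	if (condition)							\
		break;							\
	___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
		      cmd1; schedule(); cmd2);				\
} while (0)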
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b70c207f7932..086216b051f5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -930,38 +930,60 @@ static void flush_pending_writes(struct r10conf *conf)
 
 static void raise_barrier(struct r10conf *conf, int force)
 {
-	spin_lock_irq(&conf->resync_lock);
+	write_seqlock_irq(&conf->resync_lock);
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock);
+			    conf->resync_lock.lock);
 
 	/* block any new IO from starting */
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock);
+			    conf->resync_lock.lock);
 
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+
+	write_seqlock_irqsave(&conf->resync_lock, flags);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
+	write_sequnlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
+static bool wait_barrier_nolock(struct r10conf *conf)
+{
+	unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
+
+	if (seq & 1)
+		return false;
+
+	if (READ_ONCE(conf->barrier))
+		return false;
+
+	atomic_inc(&conf->nr_pending);
+	if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
+		return true;
+
+	atomic_dec(&conf->nr_pending);
+	return false;
+}
+
 static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
 	bool ret = true;
 
-	spin_lock_irq(&conf->resync_lock);
+	if (wait_barrier_nolock(conf))
+		return true;
+
+	write_seqlock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
@@ -992,7 +1014,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 				     test_bit(MD_RECOVERY_RUNNING,
 					      &conf->mddev->recovery) &&
 				     conf->nr_queued > 0),
-				    conf->resync_lock);
+				    conf->resync_lock.lock);
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1001,7 +1023,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 	/* Only increment nr_pending when we wait */
 	if (ret)
 		atomic_inc(&conf->nr_pending);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 	return ret;
 }
 
@@ -1026,27 +1048,27 @@ static void freeze_array(struct r10conf *conf, int extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
-	spin_lock_irq(&conf->resync_lock);
+	write_seqlock_irq(&conf->resync_lock);
 	conf->array_freeze_pending++;
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-				conf->resync_lock,
+				conf->resync_lock.lock,
 				flush_pending_writes(conf));
 
 	conf->array_freeze_pending--;
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
+	write_seqlock_irq(&conf->resync_lock);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
 	conf->nr_waiting--;
 	wake_up(&conf->wait_barrier);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static sector_t choose_data_offset(struct r10bio *r10_bio,
@@ -4033,7 +4055,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
-	spin_lock_init(&conf->resync_lock);
+	seqlock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
 	atomic_set(&conf->nr_pending, 0);
 
@@ -4352,7 +4374,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 			rdev->new_raid_disk = rdev->raid_disk * 2;
 			rdev->sectors = size;
 		}
-		conf->barrier = 1;
+		WRITE_ONCE(conf->barrier, 1);
 	}
 
 	return conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5c0804d8bb1f..8c072ce0bc54 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -76,7 +76,7 @@ struct r10conf {
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
 
-	spinlock_t		resync_lock;
+	seqlock_t		resync_lock;
 	atomic_t		nr_pending;
 	int			nr_waiting;
 	int			nr_queued;