diff mbox series

[-next,2/3] md/raid10: convert resync_lock to use seqlock

Message ID 20220829131502.165356-3-yukuai1@huaweicloud.com (mailing list archive)
State New, archived
Delegated to: Song Liu
Headers show
Series md/raid10: reduce lock contention for io | expand

Commit Message

Yu Kuai Aug. 29, 2022, 1:15 p.m. UTC
From: Yu Kuai <yukuai3@huawei.com>

Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
and I/O can't be dispatched until 'barrier' is dropped.

Since holding the 'barrier' is not common, convert 'resync_lock' to a
seqlock so that taking the lock can be avoided in the fast path.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 drivers/md/raid10.c | 62 ++++++++++++++++++++++++++++++---------------
 drivers/md/raid10.h |  2 +-
 2 files changed, 43 insertions(+), 21 deletions(-)
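
For reference, the fast path added here boils down to the following
lockless check (a commented sketch of wait_barrier_nolock() from the
patch at the bottom of this page, simplified for illustration):

	static bool wait_barrier_nolock(struct r10conf *conf)
	{
		/* An odd sequence count means a raise_barrier() writer is active. */
		unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);

		if (seq & 1)
			return false;

		if (READ_ONCE(conf->barrier))
			return false;

		/* Optimistically count this IO as pending ... */
		atomic_inc(&conf->nr_pending);
		/* ... and keep it only if no writer raced with the checks above. */
		if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
			return true;

		atomic_dec(&conf->nr_pending);
		return false;
	}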

Comments

Logan Gunthorpe Sept. 1, 2022, 6:41 p.m. UTC | #1
Hi,

On 2022-08-29 07:15, Yu Kuai wrote:
> From: Yu Kuai <yukuai3@huawei.com>
> 
> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
> and io can't be dispatched until 'barrier' is dropped.
> 
> Since holding the 'barrier' is not common, convert 'resync_lock' to use
> seqlock so that holding lock can be avoided in fast path.
> 
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>

I've found some lockdep issues starting with this patch in md-next while
running mdadm tests (specifically 00raid10 when run about 10 times in a
row).

I've seen a couple of different lockdep errors. The first seems to be
reproducible on this patch, then it possibly changes to the second on
subsequent patches. Not sure exactly.

I haven't dug into it too deeply, but hopefully it can be fixed easily.

Logan

--


    ================================
    WARNING: inconsistent lock state
    6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604 Not tainted
    --------------------------------
    inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
    fsck.ext3/1695 [HC0[0]:SC0[0]:HE0:SE1] takes:
    ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
raid10_read_request+0x21f/0x760
		(raid10.c:1134)

    {IN-SOFTIRQ-W} state was registered at:
      lock_acquire+0x183/0x440
      lower_barrier+0x5e/0xd0
      end_sync_request+0x178/0x180
      end_sync_write+0x193/0x380
      bio_endio+0x346/0x3a0
      blk_update_request+0x1eb/0x7c0
      blk_mq_end_request+0x30/0x50
      lo_complete_rq+0xb7/0x100
      blk_complete_reqs+0x77/0x90
      blk_done_softirq+0x38/0x40
      __do_softirq+0x10c/0x650
      run_ksoftirqd+0x48/0x80
      smpboot_thread_fn+0x302/0x400
      kthread+0x18c/0x1c0
      ret_from_fork+0x1f/0x30

    irq event stamp: 8930
    hardirqs last  enabled at (8929): [<ffffffff96df8351>]
_raw_spin_unlock_irqrestore+0x31/0x60
    hardirqs last disabled at (8930): [<ffffffff96df7fc5>]
_raw_spin_lock_irq+0x75/0x90
    softirqs last  enabled at (6768): [<ffffffff9554970e>]
__irq_exit_rcu+0xfe/0x150
    softirqs last disabled at (6757): [<ffffffff9554970e>]
__irq_exit_rcu+0xfe/0x150

    other info that might help us debug this:
     Possible unsafe locking scenario:

           CPU0
           ----
      lock(&____s->seqcount#10);
      <Interrupt>
        lock(&____s->seqcount#10);

     *** DEADLOCK ***

    2 locks held by fsck.ext3/1695:
     #0: ffff8881007d0930 (mapping.invalidate_lock#2){++++}-{3:3}, at:
page_cache_ra_unbounded+0xaf/0x250
     #1: ffff8881049b0120 (&____s->seqcount#10){+.?.}-{0:0}, at:
raid10_read_request+0x21f/0x760

    stack backtrace:
    CPU: 0 PID: 1695 Comm: fsck.ext3 Not tainted
6.0.0-rc2-eid-vmlocalyes-dbg-00023-gfd68041d2fd2 #2604
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
04/01/2014
    Call Trace:
     <TASK>
     dump_stack_lvl+0x5a/0x74
     dump_stack+0x10/0x12
     print_usage_bug.part.0+0x233/0x246
     mark_lock.part.0.cold+0x73/0x14f
     mark_held_locks+0x71/0xa0
     lockdep_hardirqs_on_prepare+0x158/0x230
     trace_hardirqs_on+0x34/0x100
     _raw_spin_unlock_irq+0x28/0x60
     wait_barrier+0x4a6/0x720
         raid10.c:1004
     raid10_read_request+0x21f/0x760
     raid10_make_request+0x2d6/0x2160
     md_handle_request+0x3f3/0x5b0
     md_submit_bio+0xd9/0x120
     __submit_bio+0x9d/0x100
     submit_bio_noacct_nocheck+0x1fd/0x470
     submit_bio_noacct+0x4c2/0xbb0
     submit_bio+0x3f/0xf0
     mpage_readahead+0x323/0x3b0
     blkdev_readahead+0x15/0x20
     read_pages+0x136/0x7a0
     page_cache_ra_unbounded+0x18d/0x250
     page_cache_ra_order+0x2c9/0x400
     ondemand_readahead+0x320/0x730
     page_cache_sync_ra+0xa6/0xb0
     filemap_get_pages+0x1eb/0xc00
     filemap_read+0x1f1/0x770
     blkdev_read_iter+0x164/0x310
     vfs_read+0x467/0x5a0
     __x64_sys_pread64+0x122/0x160
     do_syscall_64+0x35/0x80
     entry_SYSCALL_64_after_hwframe+0x46/0xb0

--

    ======================================================
    WARNING: possible circular locking dependency detected
    6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600 Not tainted
    ------------------------------------------------------
    systemd-udevd/292 is trying to acquire lock:
    ffff88817b644170 (&(&conf->resync_lock)->lock){....}-{2:2}, at:
wait_barrier+0x4fe/0x770

    but task is already holding lock:
    ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
raid10_read_request+0x21f/0x760
			raid10.c:1140  wait_barrier()
			raid10.c:1204  regular_request_wait()



    which lock already depends on the new lock.


    the existing dependency chain (in reverse order) is:

    -> #1 (&____s->seqcount#11){+.+.}-{0:0}:
           raise_barrier+0xe0/0x300
		raid10.c:940 write_seqlock_irq()
           raid10_sync_request+0x629/0x4750
		raid10.c:3689 raise_barrier()
           md_do_sync.cold+0x8ec/0x1491
           md_thread+0x19d/0x2d0
           kthread+0x18c/0x1c0
           ret_from_fork+0x1f/0x30

    -> #0 (&(&conf->resync_lock)->lock){....}-{2:2}:
           __lock_acquire+0x1cb4/0x3170
           lock_acquire+0x183/0x440
           _raw_spin_lock_irq+0x4d/0x90
           wait_barrier+0x4fe/0x770
           raid10_read_request+0x21f/0x760
		raid10.c:1140  wait_barrier()
		raid10.c:1204  regular_request_wait()
           raid10_make_request+0x2d6/0x2190
           md_handle_request+0x3f3/0x5b0
           md_submit_bio+0xd9/0x120
           __submit_bio+0x9d/0x100
           submit_bio_noacct_nocheck+0x1fd/0x470
           submit_bio_noacct+0x4c2/0xbb0
           submit_bio+0x3f/0xf0
           submit_bh_wbc+0x270/0x2a0
           block_read_full_folio+0x37c/0x580
           blkdev_read_folio+0x18/0x20
           filemap_read_folio+0x3f/0x110
           do_read_cache_folio+0x13b/0x2c0
           read_cache_folio+0x42/0x50
           read_part_sector+0x74/0x1c0
           read_lba+0x176/0x2a0
           efi_partition+0x1ce/0xdd0
           bdev_disk_changed+0x2e7/0x6a0
           blkdev_get_whole+0xd2/0x140
           blkdev_get_by_dev.part.0+0x37f/0x570
           blkdev_get_by_dev+0x51/0x60
           disk_scan_partitions+0xad/0xf0
           blkdev_common_ioctl+0x3f3/0xdf0
           blkdev_ioctl+0x1e1/0x450
           __x64_sys_ioctl+0xc0/0x100
           do_syscall_64+0x35/0x80
           entry_SYSCALL_64_after_hwframe+0x46/0xb0

    other info that might help us debug this:

     Possible unsafe locking scenario:

           CPU0                    CPU1
           ----                    ----
      lock(&____s->seqcount#11);
                                   lock(&(&conf->resync_lock)->lock);
                                   lock(&____s->seqcount#11);
      lock(&(&conf->resync_lock)->lock);

     *** DEADLOCK ***

    2 locks held by systemd-udevd/292:
     #0: ffff88817a532528 (&disk->open_mutex){+.+.}-{3:3}, at:
blkdev_get_by_dev.part.0+0x180/0x570
     #1: ffff88817b644120 (&____s->seqcount#11){+.+.}-{0:0}, at:
raid10_read_request+0x21f/0x760

    stack backtrace:
    CPU: 3 PID: 292 Comm: systemd-udevd Not tainted
6.0.0-rc2-eid-vmlocalyes-dbg-00027-gcd6aa5181bbb #2600
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2
04/01/2014
    Call Trace:
     <TASK>
     dump_stack_lvl+0x5a/0x74
     dump_stack+0x10/0x12
     print_circular_bug.cold+0x146/0x14b
     check_noncircular+0x1ff/0x250
     __lock_acquire+0x1cb4/0x3170
     lock_acquire+0x183/0x440
     _raw_spin_lock_irq+0x4d/0x90
     wait_barrier+0x4fe/0x770
     raid10_read_request+0x21f/0x760
     raid10_make_request+0x2d6/0x2190
     md_handle_request+0x3f3/0x5b0
     md_submit_bio+0xd9/0x120
     __submit_bio+0x9d/0x100
     submit_bio_noacct_nocheck+0x1fd/0x470
     submit_bio_noacct+0x4c2/0xbb0
     submit_bio+0x3f/0xf0
     submit_bh_wbc+0x270/0x2a0
     block_read_full_folio+0x37c/0x580
     blkdev_read_folio+0x18/0x20
     filemap_read_folio+0x3f/0x110
     do_read_cache_folio+0x13b/0x2c0
     read_cache_folio+0x42/0x50
     read_part_sector+0x74/0x1c0
     read_lba+0x176/0x2a0
     efi_partition+0x1ce/0xdd0
     bdev_disk_changed+0x2e7/0x6a0
     blkdev_get_whole+0xd2/0x140
     blkdev_get_by_dev.part.0+0x37f/0x570
     blkdev_get_by_dev+0x51/0x60
     disk_scan_partitions+0xad/0xf0
     blkdev_common_ioctl+0x3f3/0xdf0
     blkdev_ioctl+0x1e1/0x450
     __x64_sys_ioctl+0xc0/0x100
     do_syscall_64+0x35/0x80
Guoqing Jiang Sept. 2, 2022, 12:49 a.m. UTC | #2
On 9/2/22 2:41 AM, Logan Gunthorpe wrote:
> Hi,
>
> On 2022-08-29 07:15, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
>> and io can't be dispatched until 'barrier' is dropped.
>>
>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>> seqlock so that holding lock can be avoided in fast path.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> I've found some lockdep issues starting with this patch in md-next while
> running mdadm tests (specifically 00raid10 when run about 10 times in a
> row).
>
> I've seen a couple different lock dep errors. The first seems to be
> reproducible on this patch, then it possibly changes to the second on
> subsequent patches. Not sure exactly.

That's why I said "try mdadm test suites too to avoid regression." ...

Guoqing
Logan Gunthorpe Sept. 2, 2022, 12:56 a.m. UTC | #3
On 2022-09-01 18:49, Guoqing Jiang wrote:
> 
> 
> On 9/2/22 2:41 AM, Logan Gunthorpe wrote:
>> Hi,
>>
>> On 2022-08-29 07:15, Yu Kuai wrote:
>>> From: Yu Kuai <yukuai3@huawei.com>
>>>
>>> Currently, wait_barrier() will hold 'resync_lock' to read
>>> 'conf->barrier',
>>> and io can't be dispatched until 'barrier' is dropped.
>>>
>>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>>> seqlock so that holding lock can be avoided in fast path.
>>>
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>> I've found some lockdep issues starting with this patch in md-next while
>> running mdadm tests (specifically 00raid10 when run about 10 times in a
>> row).
>>
>> I've seen a couple different lock dep errors. The first seems to be
>> reproducible on this patch, then it possibly changes to the second on
>> subsequent patches. Not sure exactly.
> 
> That's why I said "try mdadm test suites too to avoid regression." ...

You may have to run it multiple times; a single run tends not to catch
all errors. I had to loop the noted test 10 times to be sure I hit this
every time when I did the simple bisect.

And ensure that all the debug options are on when you run it (take a
look at the Kernel Hacking section in menuconfig). You won't hit this
bug without at least CONFIG_PROVE_LOCKING=y.
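
For example, a minimal lock-debugging fragment might look like this
(exact option set varies by tree; CONFIG_PROVE_LOCKING selects most of
the rest):

	CONFIG_DEBUG_SPINLOCK=y
	CONFIG_DEBUG_LOCK_ALLOC=y
	CONFIG_PROVE_LOCKING=y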

Logan
Guoqing Jiang Sept. 2, 2022, 1 a.m. UTC | #4
On 9/2/22 8:56 AM, Logan Gunthorpe wrote:
>
> On 2022-09-01 18:49, Guoqing Jiang wrote:
>>
>> On 9/2/22 2:41 AM, Logan Gunthorpe wrote:
>>> Hi,
>>>
>>> On 2022-08-29 07:15, Yu Kuai wrote:
>>>> From: Yu Kuai <yukuai3@huawei.com>
>>>>
>>>> Currently, wait_barrier() will hold 'resync_lock' to read
>>>> 'conf->barrier',
>>>> and io can't be dispatched until 'barrier' is dropped.
>>>>
>>>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>>>> seqlock so that holding lock can be avoided in fast path.
>>>>
>>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>> I've found some lockdep issues starting with this patch in md-next while
>>> running mdadm tests (specifically 00raid10 when run about 10 times in a
>>> row).
>>>
>>> I've seen a couple different lock dep errors. The first seems to be
>>> reproducible on this patch, then it possibly changes to the second on
>>> subsequent patches. Not sure exactly.
>> That's why I said "try mdadm test suites too to avoid regression." ...
> You may have to run it multiple times, a single run tends not to catch
> all errors. I had to loop the noted test 10 times to be sure I hit this
> every time when I did the simple bisect.
>
> And ensure that all the debug options are on when you run it (take a
> look at the Kernel Hacking section in menuconfig). You won't hit this
> bug without at least CONFIG_PROVE_LOCKING=y.

Yes, we definitely need to enable the option to test changes to the
locking stuff.

Thanks,
Guoqing
Yu Kuai Sept. 2, 2022, 1:21 a.m. UTC | #5
Hi,

On 2022/09/02 2:41, Logan Gunthorpe wrote:
> Hi,
> 
> On 2022-08-29 07:15, Yu Kuai wrote:
>> From: Yu Kuai <yukuai3@huawei.com>
>>
>> Currently, wait_barrier() will hold 'resync_lock' to read 'conf->barrier',
>> and io can't be dispatched until 'barrier' is dropped.
>>
>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>> seqlock so that holding lock can be avoided in fast path.
>>
>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> 
> I've found some lockdep issues starting with this patch in md-next while
> running mdadm tests (specifically 00raid10 when run about 10 times in a
> row).
> 
> I've seen a couple different lock dep errors. The first seems to be
> reproducible on this patch, then it possibly changes to the second on
> subsequent patches. Not sure exactly.
> 

Thanks for the test,

I think this is a false positive because of the special usage here,

for example, in raise_barrier():

write_seqlock_irq
  spin_lock_irq();
   lock_acquire
  do_write_seqcount_begin
   seqcount_acquire

  wait_event_lock_irq_cmd
   spin_unlock_irq -> lock is released while seqcount is still held;
		     if another context acquires the lock again, lockdep
		     will trigger a warning.
   ...
   spin_lock_irq

write_sequnlock_irq

Functionality should be OK; I'll try to find a way to prevent such
warnings.
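
To make the sequence concrete, after this patch raise_barrier()
effectively does the following (simplified from the diff at the bottom
of this page):

	write_seqlock_irq(&conf->resync_lock);	/* takes .lock and .seqcount */

	/* block any new IO from starting */
	WRITE_ONCE(conf->barrier, conf->barrier + 1);

	/* drops and re-takes only .lock; .seqcount stays "held" */
	wait_event_lock_irq(conf->wait_barrier,
			    !atomic_read(&conf->nr_pending) &&
			    conf->barrier < RESYNC_DEPTH,
			    conf->resync_lock.lock);

	write_sequnlock_irq(&conf->resync_lock);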

Thanks,
Kuai
> I haven't dug into it too deeply, but hopefully it can be fixed easily.
> 
> Logan
> 
> [... lockdep reports snipped ...]
Yu Kuai Sept. 2, 2022, 8:14 a.m. UTC | #6
Hi, Logan

On 2022/09/02 9:21, Yu Kuai wrote:
> Hi,
> 
> On 2022/09/02 2:41, Logan Gunthorpe wrote:
>> Hi,
>>
>> On 2022-08-29 07:15, Yu Kuai wrote:
>>> From: Yu Kuai <yukuai3@huawei.com>
>>>
>>> Currently, wait_barrier() will hold 'resync_lock' to read 
>>> 'conf->barrier',
>>> and io can't be dispatched until 'barrier' is dropped.
>>>
>>> Since holding the 'barrier' is not common, convert 'resync_lock' to use
>>> seqlock so that holding lock can be avoided in fast path.
>>>
>>> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
>>
>> I've found some lockdep issues starting with this patch in md-next while
>> running mdadm tests (specifically 00raid10 when run about 10 times in a
>> row).
>>
>> I've seen a couple different lock dep errors. The first seems to be
>> reproducible on this patch, then it possibly changes to the second on
>> subsequent patches. Not sure exactly.
>>
> 
> Thanks for the test,
> 
> I think this is false positive because of the special usage here,
> 
> for example, in raise_barrier():
> 
> write_seqlock_irq
>   spin_lock_irq();
>    lock_acquire
>   do_write_seqcount_begin
>    seqcount_acquire
> 
>   wait_event_lock_irq_cmd
>    spin_unlock_irq -> lock is released while seqcount is still hold
>               if other context hold the lock again, lockdep
>               will trigger warning.
>    ...
>    spin_lock_irq
> 
> write_sequnlock_irq
> 
> Functionality should be ok, I'll try to find a way to prevent such
> warning.

Can you try the following patch? I've been running the mdadm tests
myself and haven't seen any problems yet.

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 2f7c8bef6dc2..317bd862f40a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -940,16 +940,16 @@ static void raise_barrier(struct r10conf *conf, int force)
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock.lock);
+	wait_event_seqlock_irq(conf->wait_barrier, force || !conf->nr_waiting,
+			       conf->resync_lock);
 
 	/* block any new IO from starting */
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock.lock);
+	wait_event_seqlock_irq(conf->wait_barrier,
+			       !atomic_read(&conf->nr_pending) &&
+			       conf->barrier < RESYNC_DEPTH, conf->resync_lock);
 
 	write_sequnlock_irq(&conf->resync_lock);
 }
@@ -1007,7 +1007,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 			ret = false;
 		} else {
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
+			wait_event_seqlock_irq(conf->wait_barrier,
 					    !conf->barrier ||
 					    (atomic_read(&conf->nr_pending) &&
 					     bio_list &&
@@ -1020,7 +1020,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 					      test_bit(MD_RECOVERY_RUNNING,
 						       &conf->mddev->recovery) &&
 					      conf->nr_queued > 0),
-					    conf->resync_lock.lock);
+					    conf->resync_lock);
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1058,10 +1058,9 @@ static void freeze_array(struct r10conf *conf, int extra)
 	conf->array_freeze_pending++;
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-				conf->resync_lock.lock,
-				flush_pending_writes(conf));
+	wait_event_seqlock_irq_cmd(conf->wait_barrier,
+				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+				conf->resync_lock, flush_pending_writes(conf));
 
 	conf->array_freeze_pending--;
 	write_sequnlock_irq(&conf->resync_lock);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 58cfbf81447c..97d6b378e40c 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -977,6 +977,13 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 			    schedule();					\
 			    spin_lock_irq(&lock))
 
+#define __wait_event_seqlock_irq(wq_head, condition, lock, cmd)	\
+	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			    write_sequnlock_irq(&lock);			\
+			    cmd;					\
+			    schedule();					\
+			    write_seqlock_irq(&lock))
+
 /**
  * wait_event_lock_irq_cmd - sleep until a condition gets true. The
  *                          condition is checked under the lock. This
@@ -1007,6 +1014,13 @@ do {								\
 	__wait_event_lock_irq(wq_head, condition, lock, cmd);		\
 } while (0)
 
+#define wait_event_seqlock_irq_cmd(wq_head, condition, lock, cmd)	\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_event_seqlock_irq(wq_head, condition, lock, cmd);	\
+} while (0)
+
 /**
  * wait_event_lock_irq - sleep until a condition gets true. The
  *                      condition is checked under the lock. This
@@ -1034,6 +1048,12 @@ do {								\
 	__wait_event_lock_irq(wq_head, condition, lock, );		\
 } while (0)
 
+#define wait_event_seqlock_irq(wq_head, condition, lock)		\
+do {									\
+	if (condition)							\
+		break;							\
+	__wait_event_seqlock_irq(wq_head, condition, lock, );		\
+} while (0)
 
 #define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)	\
 	___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,	\

> 
> Thanks,
> Kuai
>> I haven't dug into it too deeply, but hopefully it can be fixed easily.
>>
>> Logan
>>
>> [... lockdep reports snipped ...]
Guoqing Jiang Sept. 2, 2022, 9:42 a.m. UTC | #7
Hi,

On 8/29/22 9:15 PM, Yu Kuai wrote:
> +static bool wait_barrier_nolock(struct r10conf *conf)
> +{
> +	unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
> +
> +	if (seq & 1)
> +		return false;
> +
> +	if (READ_ONCE(conf->barrier))
> +		return false;
> +
> +	atomic_inc(&conf->nr_pending);
> +	if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))

I think 'seq' is usually obtained from read_seqcount_begin().

> +		return true;
> +
> +	atomic_dec(&conf->nr_pending);
> +	return false;
> +

Thanks,
Guoqing
Yu Kuai Sept. 2, 2022, 10:02 a.m. UTC | #8
Hi,

On 2022/09/02 17:42, Guoqing Jiang wrote:
> Hi,
> 
> On 8/29/22 9:15 PM, Yu Kuai wrote:
>> +static bool wait_barrier_nolock(struct r10conf *conf)
>> +{
>> +    unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>> +
>> +    if (seq & 1)
>> +        return false;
>> +
>> +    if (READ_ONCE(conf->barrier))
>> +        return false;
>> +
>> +    atomic_inc(&conf->nr_pending);
>> +    if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
> 
> I think 'seq' is usually get from read_seqcount_begin.

read_seqcount_begin() will loop until "seq & 1" fails; I'm afraid this
will cause high CPU usage in some cases.

What I'm trying to do here is just try once, and fall back to holding
the lock and waiting if that fails.
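
Roughly, the two read-side helpers behave like this (a simplified
sketch of the include/linux/seqlock.h semantics, not the actual kernel
code; the sketch_ names are made up):

	static unsigned int sketch_read_seqcount_begin(const seqcount_t *s)
	{
		unsigned int seq;

		/* spins for as long as a writer holds the seqcount */
		while ((seq = READ_ONCE(s->sequence)) & 1)
			cpu_relax();

		smp_rmb();
		return seq;
	}

	static unsigned int sketch_raw_read_seqcount(const seqcount_t *s)
	{
		/* returns immediately; the caller must handle an odd value */
		unsigned int seq = READ_ONCE(s->sequence);

		smp_rmb();
		return seq;
	}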

What do you think?

Thanks,
Kuai
> 
>> +        return true;
>> +
>> +    atomic_dec(&conf->nr_pending);
>> +    return false;
>> +
> 
> Thanks,
> Guoqing
> .
>
Guoqing Jiang Sept. 2, 2022, 10:16 a.m. UTC | #9
On 9/2/22 6:02 PM, Yu Kuai wrote:
> Hi,
>
> On 2022/09/02 17:42, Guoqing Jiang wrote:
>> Hi,
>>
>> On 8/29/22 9:15 PM, Yu Kuai wrote:
>>> +static bool wait_barrier_nolock(struct r10conf *conf)
>>> +{
>>> +    unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>>> +
>>> +    if (seq & 1)
>>> +        return false;
>>> +
>>> +    if (READ_ONCE(conf->barrier))
>>> +        return false;
>>> +
>>> +    atomic_inc(&conf->nr_pending);
>>> +    if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
>>
>> I think 'seq' is usually get from read_seqcount_begin.
>
> read_seqcount_begin will loop untill "req & 1" failed, I'm afraid this
> will cause high cpu usage in come cases.
>
> What I try to do here is just try once, and fall back to hold lock and
> wait if failed.

Thanks for the explanation.

I'd suggest trying the read_seqcount_begin/read_seqcount_retry pattern,
because it is a common usage in the kernel I think; then check whether
the performance drops or not. Maybe it is related to the lockdep issue,
but I am not sure.
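
For reference, that common pattern usually looks like this
(illustrative only):

	unsigned int seq;

	do {
		seq = read_seqcount_begin(&conf->resync_lock.seqcount);
		/* ... read the state protected by the seqlock ... */
	} while (read_seqcount_retry(&conf->resync_lock.seqcount, seq));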

Thanks,
Guoqing
Yu Kuai Sept. 2, 2022, 10:53 a.m. UTC | #10
Hi,

On 2022/09/02 18:16, Guoqing Jiang wrote:
> 
> 
> On 9/2/22 6:02 PM, Yu Kuai wrote:
>> Hi,
>>
>> On 2022/09/02 17:42, Guoqing Jiang wrote:
>>> Hi,
>>>
>>> On 8/29/22 9:15 PM, Yu Kuai wrote:
>>>> +static bool wait_barrier_nolock(struct r10conf *conf)
>>>> +{
>>>> +    unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
>>>> +
>>>> +    if (seq & 1)
>>>> +        return false;
>>>> +
>>>> +    if (READ_ONCE(conf->barrier))
>>>> +        return false;
>>>> +
>>>> +    atomic_inc(&conf->nr_pending);
>>>> +    if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
>>>
>>> I think 'seq' is usually get from read_seqcount_begin.
>>
>> read_seqcount_begin will loop untill "req & 1" failed, I'm afraid this
>> will cause high cpu usage in come cases.
>>
>> What I try to do here is just try once, and fall back to hold lock and
>> wait if failed.
> 
> Thanks for the explanation.
> 
> I'd suggest to try with read_seqcount_begin/read_seqcount_retry pattern
> because it is a common usage in kernel I think, then check whether the
> performance drops or not.  Maybe it is related to lockdep issue, but I am
> not sure.

I can try read_seqcount_begin/read_seqcount_retry.

Please take a look at the other thread; the lockdep issue is due to
inconsistent usage of the lock and seqcount inside the seqlock:

wait_event() only releases the lock; the seqcount is not released.

Thanks,
Kuai
> 
> Thanks,
> Guoqing
> .
>
Logan Gunthorpe Sept. 2, 2022, 5:03 p.m. UTC | #11
On 2022-09-02 02:14, Yu Kuai wrote:
> Can you try the following patch? I'm running mdadm tests myself and I
> didn't see any problems yet.

Yes, that patch seems to fix the issue.

However, may I suggest we do this without trying to introduce new
helpers into wait.h? I suspect that could result in a fair amount of
bike shedding and delay. wait_event_cmd() is often used in situations 
where a specific lock type doesn't have a helper.

My stab at it is in a diff below which also fixes the bug. 

I'd also recommend somebody clean up that nasty condition in 
wait_barrier(). Put it into an appropriately named function
with some comments. As is, it is pretty much unreadable.
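
Something along these lines, perhaps (a sketch only; the helper name
and exact factoring are made up):

	/* return true if wait_barrier() can stop waiting */
	static bool stop_waiting_barrier(struct r10conf *conf)
	{
		struct bio_list *bio_list = current->bio_list;

		/* the barrier has been dropped */
		if (!conf->barrier)
			return true;

		/*
		 * IO is already pending (so the barrier cannot rise) and this
		 * thread still has queued bios: waiting here could deadlock.
		 */
		if (atomic_read(&conf->nr_pending) && bio_list &&
		    (!bio_list_empty(&bio_list[0]) ||
		     !bio_list_empty(&bio_list[1])))
			return true;

		/* move on if the recovery thread is blocked by us */
		if (conf->mddev->thread->tsk == current &&
		    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
		    conf->nr_queued > 0)
			return true;

		return false;
	}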

Logan

--


diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 0e3229ee1ebc..ae297bc870bd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -934,22 +934,26 @@ static void flush_pending_writes(struct r10conf *conf)
  *    lower_barrier when the particular background IO completes.
  */
 
+#define wait_event_barrier_cmd(conf, cond, cmd) \
+	wait_event_cmd((conf)->wait_barrier, cond, \
+		       write_sequnlock_irq(&(conf)->resync_lock); cmd, \
+		       write_seqlock_irq(&(conf)->resync_lock))
+#define wait_event_barrier(conf, cond) wait_event_barrier_cmd(conf, cond, )
+
 static void raise_barrier(struct r10conf *conf, int force)
 {
 	write_seqlock_irq(&conf->resync_lock);
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock.lock);
+	wait_event_barrier(conf, force || !conf->nr_waiting);
 
 	/* block any new IO from starting */
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock.lock);
+	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
+			   conf->barrier < RESYNC_DEPTH);
 
 	write_sequnlock_irq(&conf->resync_lock);
 }
@@ -1007,20 +1011,19 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
 			ret = false;
 		} else {
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
-					    !conf->barrier ||
-					    (atomic_read(&conf->nr_pending) &&
-					     bio_list &&
-					     (!bio_list_empty(&bio_list[0]) ||
-					      !bio_list_empty(&bio_list[1]))) ||
+			wait_event_barrier(conf,
+					   !conf->barrier ||
+					   (atomic_read(&conf->nr_pending) &&
+					    bio_list &&
+					    (!bio_list_empty(&bio_list[0]) ||
+					     !bio_list_empty(&bio_list[1]))) ||
 					     /* move on if recovery thread is
 					      * blocked by us
 					      */
-					     (conf->mddev->thread->tsk == current &&
-					      test_bit(MD_RECOVERY_RUNNING,
-						       &conf->mddev->recovery) &&
-					      conf->nr_queued > 0),
-					    conf->resync_lock.lock);
+					    (conf->mddev->thread->tsk == current &&
+					     test_bit(MD_RECOVERY_RUNNING,
+					       &conf->mddev->recovery) &&
+					     conf->nr_queued > 0));
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1058,10 +1061,9 @@ static void freeze_array(struct r10conf *conf, int extra)
 	conf->array_freeze_pending++;
 	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-				conf->resync_lock.lock,
-				flush_pending_writes(conf));
+	wait_event_barrier_cmd(conf,
+		atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
+		flush_pending_writes(conf));
 
 	conf->array_freeze_pending--;
 	write_sequnlock_irq(&conf->resync_lock);
Yu Kuai Sept. 3, 2022, 6:07 a.m. UTC | #12
Hi,

On 2022/09/03 1:03, Logan Gunthorpe wrote:
> 
> 
> 
> On 2022-09-02 02:14, Yu Kuai wrote:
>> Can you try the following patch? I'm running mdadm tests myself and I
>> didn't see any problems yet.
> 
> Yes, that patch seems to fix the issue.
> 
> However, may I suggest we do this without trying to introduce new
> helpers into wait.h? I suspect that could result in a fair amount of
> bike shedding and delay. wait_event_cmd() is often used in situations
> where a specific lock type doesn't have a helper.

Yes, that sounds good.
> 
> My stab at it is in a diff below which also fixes the bug.
> 
> I'd also recommend somebody clean up that nasty condition in
> wait_barrier(). Put it into an appropriately named function
> with some comments. As is, it is pretty much unreadable.

While we're at it, I can take a look.

Thanks,
Kuai
diff mbox series

Patch

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b70c207f7932..086216b051f5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -930,38 +930,60 @@  static void flush_pending_writes(struct r10conf *conf)
 
 static void raise_barrier(struct r10conf *conf, int force)
 {
-	spin_lock_irq(&conf->resync_lock);
+	write_seqlock_irq(&conf->resync_lock);
 	BUG_ON(force && !conf->barrier);
 
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock);
+			    conf->resync_lock.lock);
 
 	/* block any new IO from starting */
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 
 	/* Now wait for all pending IO to complete */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock);
+			    conf->resync_lock.lock);
 
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+
+	write_seqlock_irqsave(&conf->resync_lock, flags);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
+	write_sequnlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 
+static bool wait_barrier_nolock(struct r10conf *conf)
+{
+	unsigned int seq = raw_read_seqcount(&conf->resync_lock.seqcount);
+
+	if (seq & 1)
+		return false;
+
+	if (READ_ONCE(conf->barrier))
+		return false;
+
+	atomic_inc(&conf->nr_pending);
+	if (!read_seqcount_retry(&conf->resync_lock.seqcount, seq))
+		return true;
+
+	atomic_dec(&conf->nr_pending);
+	return false;
+}
+
 static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
 	bool ret = true;
 
-	spin_lock_irq(&conf->resync_lock);
+	if (wait_barrier_nolock(conf))
+		return true;
+
+	write_seqlock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		struct bio_list *bio_list = current->bio_list;
 		conf->nr_waiting++;
@@ -992,7 +1014,7 @@  static bool wait_barrier(struct r10conf *conf, bool nowait)
 					      test_bit(MD_RECOVERY_RUNNING,
 						       &conf->mddev->recovery) &&
 					      conf->nr_queued > 0),
-					    conf->resync_lock);
+					    conf->resync_lock.lock);
 		}
 		conf->nr_waiting--;
 		if (!conf->nr_waiting)
@@ -1001,7 +1023,7 @@  static bool wait_barrier(struct r10conf *conf, bool nowait)
 	/* Only increment nr_pending when we wait */
 	if (ret)
 		atomic_inc(&conf->nr_pending);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 	return ret;
 }
 
@@ -1026,27 +1048,27 @@  static void freeze_array(struct r10conf *conf, int extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
-	spin_lock_irq(&conf->resync_lock);
+	write_seqlock_irq(&conf->resync_lock);
 	conf->array_freeze_pending++;
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
 	wait_event_lock_irq_cmd(conf->wait_barrier,
 				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-				conf->resync_lock,
+				conf->resync_lock.lock,
 				flush_pending_writes(conf));
 
 	conf->array_freeze_pending--;
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
+	write_seqlock_irq(&conf->resync_lock);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
 	conf->nr_waiting--;
 	wake_up(&conf->wait_barrier);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }
 
 static sector_t choose_data_offset(struct r10bio *r10_bio,
@@ -4033,7 +4055,7 @@  static struct r10conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
-	spin_lock_init(&conf->resync_lock);
+	seqlock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
 	atomic_set(&conf->nr_pending, 0);
 
@@ -4352,7 +4374,7 @@  static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 				rdev->new_raid_disk = rdev->raid_disk * 2;
 				rdev->sectors = size;
 			}
-		conf->barrier = 1;
+		WRITE_ONCE(conf->barrier, 1);
 	}
 
 	return conf;
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5c0804d8bb1f..8c072ce0bc54 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -76,7 +76,7 @@  struct r10conf {
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
 
-	spinlock_t		resync_lock;
+	seqlock_t		resync_lock;
 	atomic_t		nr_pending;
 	int			nr_waiting;
 	int			nr_queued;