
mm: fix hard lockup in __split_huge_page

Message ID 20240618020926.1911903-1-zhaoyang.huang@unisoc.com (mailing list archive)
State New
Series mm: fix hard lockup in __split_huge_page

Commit Message

zhaoyang.huang June 18, 2024, 2:09 a.m. UTC
From: Zhaoyang Huang <zhaoyang.huang@unisoc.com>

A hard lockup[2] has been reported, which appears to be caused by
recursive acquisition of lruvec->lru_lock[1] within __split_huge_page().

[1]
static void __split_huge_page(struct page *page, struct list_head *list,
                pgoff_t end, unsigned int new_order)
{
        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
//1st lock here
        lruvec = folio_lruvec_lock(folio);

        for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
                __split_huge_page_tail(folio, i, lruvec, list, new_order);
                /* Some pages can be beyond EOF: drop them from page cache */
                if (head[i].index >= end) {
                        folio_put(tail);
                            __page_cache_release
//2nd lock here
                               folio_lruvec_relock_irqsave
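
For illustration only, here is a minimal userspace sketch (not kernel
code) of why taking a non-recursive spinlock a second time from the same
context can never succeed: the thread that would have to release the
lock is the one spinning on it. The lock name and helper below are
hypothetical, and pthread_spin_trylock() is used so the demo reports the
problem instead of hanging the way the hard lockup in [2] does.

/* Minimal userspace sketch (illustrative only, not kernel code). */
#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t lru_lock;     /* stands in for lruvec->lru_lock */

/* Models the folio_put() -> __page_cache_release() path, which wants the lock again. */
static void release_last_reference(void)
{
        if (pthread_spin_trylock(&lru_lock) != 0)
                printf("would deadlock: lock is already held by this thread\n");
        else
                pthread_spin_unlock(&lru_lock);
}

int main(void)
{
        pthread_spin_init(&lru_lock, PTHREAD_PROCESS_PRIVATE);

        pthread_spin_lock(&lru_lock);   /* 1st lock: __split_huge_page() */
        release_last_reference();       /* 2nd lock attempt: the folio_put() path */
        pthread_spin_unlock(&lru_lock);

        pthread_spin_destroy(&lru_lock);
        return 0;
}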

[2]
[26887.389623] watchdog: Watchdog detected hard LOCKUP on cpu 21
[26887.389682] CPU: 21 PID: 264 Comm: kswapd0 Kdump: loaded Tainted: G
      W          6.6.31.el9 #3
[26887.389685] Hardware name: FUJITSU PRIMERGY RX2540 M4/D3384-A1, BIOS
V5.0.0.12 R1.22.0 for D3384-A1x                    06/04/2018
[26887.389687] RIP: 0010:native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389696] Code: 08 0f 92 c2 8b 45 00 0f b6 d2 c1 e2 08 30 e4 09 d0
a9 00 01 ff ff 0f 85 ea 01 00 00 85 c0 74 12 0f b6 45 00 84 c0 74 0a f3
90 <0f> b6 45 00 84 c0 75 f6 b8 01 00 00 00 66 89 45 00 5b 5d 41 5c 41
[26887.389698] RSP: 0018:ffffb3e587a87a20 EFLAGS: 00000002
[26887.389700] RAX: 0000000000000001 RBX: ffff9ad6c6f67050 RCX:
0000000000000000
[26887.389701] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
ffff9ad6c6f67050
[26887.389703] RBP: ffff9ad6c6f67050 R08: 0000000000000000 R09:
0000000000000067
[26887.389704] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000046
[26887.389705] R13: 0000000000000200 R14: 0000000000000000 R15:
ffffe1138aa98000
[26887.389707] FS:  0000000000000000(0000) GS:ffff9ade20340000(0000)
knlGS:0000000000000000
[26887.389708] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[26887.389710] CR2: 000000002912809b CR3: 000000064401e003 CR4:
00000000007706e0
[26887.389711] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[26887.389712] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
0000000000000400
[26887.389713] PKRU: 55555554
[26887.389714] Call Trace:
[26887.389717]  <NMI>
[26887.389720]  ? watchdog_hardlockup_check+0xac/0x150
[26887.389725]  ? __perf_event_overflow+0x102/0x1d0
[26887.389729]  ? handle_pmi_common+0x189/0x3e0
[26887.389735]  ? set_pte_vaddr_p4d+0x4a/0x60
[26887.389738]  ? flush_tlb_one_kernel+0xa/0x20
[26887.389742]  ? native_set_fixmap+0x65/0x80
[26887.389745]  ? ghes_copy_tofrom_phys+0x75/0x110
[26887.389751]  ? __ghes_peek_estatus.isra.0+0x49/0xb0
[26887.389755]  ? intel_pmu_handle_irq+0x10b/0x230
[26887.389756]  ? perf_event_nmi_handler+0x28/0x50
[26887.389759]  ? nmi_handle+0x58/0x150
[26887.389764]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389768]  ? default_do_nmi+0x6b/0x170
[26887.389770]  ? exc_nmi+0x12c/0x1a0
[26887.389772]  ? end_repeat_nmi+0x16/0x1f
[26887.389777]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389780]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389784]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389787]  </NMI>
[26887.389788]  <TASK>
[26887.389789]  __raw_spin_lock_irqsave+0x3d/0x50
[26887.389793]  folio_lruvec_lock_irqsave+0x5e/0x90
[26887.389798]  __page_cache_release+0x68/0x230
[26887.389801]  ? remove_migration_ptes+0x5c/0x80
[26887.389807]  __folio_put+0x24/0x60
[26887.389808]  __split_huge_page+0x368/0x520
[26887.389812]  split_huge_page_to_list+0x4b3/0x570
[26887.389816]  deferred_split_scan+0x1c8/0x290
[26887.389819]  do_shrink_slab+0x12f/0x2d0
[26887.389824]  shrink_slab_memcg+0x133/0x1d0
[26887.389829]  shrink_node_memcgs+0x18e/0x1d0
[26887.389832]  shrink_node+0xa7/0x370
[26887.389836]  balance_pgdat+0x332/0x6f0
[26887.389842]  kswapd+0xf0/0x190
[26887.389845]  ? balance_pgdat+0x6f0/0x6f0
[26887.389848]  kthread+0xee/0x120
[26887.389851]  ? kthread_complete_and_exit+0x20/0x20
[26887.389853]  ret_from_fork+0x2d/0x50
[26887.389857]  ? kthread_complete_and_exit+0x20/0x20
[26887.389859]  ret_from_fork_asm+0x11/0x20
[26887.389864]  </TASK>
[26887.389865] Kernel panic - not syncing: Hard LOCKUP
[26887.389867] CPU: 21 PID: 264 Comm: kswapd0 Kdump: loaded Tainted: G
      W          6.6.31.el9 #3
[26887.389869] Hardware name: FUJITSU PRIMERGY RX2540 M4/D3384-A1, BIOS
V5.0.0.12 R1.22.0 for D3384-A1x                    06/04/2018
[26887.389870] Call Trace:
[26887.389871]  <NMI>
[26887.389872]  dump_stack_lvl+0x44/0x60
[26887.389877]  panic+0x241/0x330
[26887.389881]  nmi_panic+0x2f/0x40
[26887.389883]  watchdog_hardlockup_check+0x119/0x150
[26887.389886]  __perf_event_overflow+0x102/0x1d0
[26887.389889]  handle_pmi_common+0x189/0x3e0
[26887.389893]  ? set_pte_vaddr_p4d+0x4a/0x60
[26887.389896]  ? flush_tlb_one_kernel+0xa/0x20
[26887.389899]  ? native_set_fixmap+0x65/0x80
[26887.389902]  ? ghes_copy_tofrom_phys+0x75/0x110
[26887.389906]  ? __ghes_peek_estatus.isra.0+0x49/0xb0
[26887.389909]  intel_pmu_handle_irq+0x10b/0x230
[26887.389911]  perf_event_nmi_handler+0x28/0x50
[26887.389913]  nmi_handle+0x58/0x150
[26887.389916]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389920]  default_do_nmi+0x6b/0x170
[26887.389922]  exc_nmi+0x12c/0x1a0
[26887.389923]  end_repeat_nmi+0x16/0x1f
[26887.389926] RIP: 0010:native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389930] Code: 08 0f 92 c2 8b 45 00 0f b6 d2 c1 e2 08 30 e4 09 d0
a9 00 01 ff ff 0f 85 ea 01 00 00 85 c0 74 12 0f b6 45 00 84 c0 74 0a f3
90 <0f> b6 45 00 84 c0 75 f6 b8 01 00 00 00 66 89 45 00 5b 5d 41 5c 41
[26887.389931] RSP: 0018:ffffb3e587a87a20 EFLAGS: 00000002
[26887.389933] RAX: 0000000000000001 RBX: ffff9ad6c6f67050 RCX:
0000000000000000
[26887.389934] RDX: 0000000000000000 RSI: 0000000000000000 RDI:
ffff9ad6c6f67050
[26887.389935] RBP: ffff9ad6c6f67050 R08: 0000000000000000 R09:
0000000000000067
[26887.389936] R10: 0000000000000000 R11: 0000000000000000 R12:
0000000000000046
[26887.389937] R13: 0000000000000200 R14: 0000000000000000 R15:
ffffe1138aa98000
[26887.389940]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389943]  ? native_queued_spin_lock_slowpath+0x6e/0x2c0
[26887.389946]  </NMI>
[26887.389947]  <TASK>
[26887.389947]  __raw_spin_lock_irqsave+0x3d/0x50
[26887.389950]  folio_lruvec_lock_irqsave+0x5e/0x90
[26887.389953]  __page_cache_release+0x68/0x230
[26887.389955]  ? remove_migration_ptes+0x5c/0x80
[26887.389958]  __folio_put+0x24/0x60
[26887.389960]  __split_huge_page+0x368/0x520
[26887.389963]  split_huge_page_to_list+0x4b3/0x570
[26887.389967]  deferred_split_scan+0x1c8/0x290
[26887.389971]  do_shrink_slab+0x12f/0x2d0
[26887.389974]  shrink_slab_memcg+0x133/0x1d0
[26887.389978]  shrink_node_memcgs+0x18e/0x1d0
[26887.389982]  shrink_node+0xa7/0x370
[26887.389985]  balance_pgdat+0x332/0x6f0
[26887.389991]  kswapd+0xf0/0x190
[26887.389994]  ? balance_pgdat+0x6f0/0x6f0
[26887.389997]  kthread+0xee/0x120
[26887.389998]  ? kthread_complete_and_exit+0x20/0x20
[26887.390000]  ret_from_fork+0x2d/0x50
[26887.390003]  ? kthread_complete_and_exit+0x20/0x20
[26887.390004]  ret_from_fork_asm+0x11/0x20
[26887.390009]  </TASK>

Signed-off-by: Zhaoyang Huang <zhaoyang.huang@unisoc.com>
---
 mm/huge_memory.c | 2 ++
 1 file changed, 2 insertions(+)

Comments

Matthew Wilcox (Oracle) June 18, 2024, 3:19 a.m. UTC | #1
On Tue, Jun 18, 2024 at 10:09:26AM +0800, zhaoyang.huang wrote:
> Hard lockup[2] is reported which should be caused by recursive
> lock contention of lruvec->lru_lock[1] within __split_huge_page.
> 
> [1]
> static void __split_huge_page(struct page *page, struct list_head *list,
>                 pgoff_t end, unsigned int new_order)
> {
>         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
> //1st lock here
>         lruvec = folio_lruvec_lock(folio);
> 
>         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
>                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
>                 /* Some pages can be beyond EOF: drop them from page cache */
>                 if (head[i].index >= end) {
>                         folio_put(tail);
>                             __page_cache_release
> //2nd lock here
>                                folio_lruvec_relock_irqsave

Why doesn't lockdep catch this?

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 9859aa4f7553..ea504df46d3b 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2925,7 +2925,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
>  				folio_account_cleaned(tail,
>  					inode_to_wb(folio->mapping->host));
>  			__filemap_remove_folio(tail, NULL);
> +			unlock_page_lruvec(lruvec);
>  			folio_put(tail);
> +			folio_lruvec_lock(folio);

Why is it safe to drop & reacquire this lock?  Is there nothing we need
to revalidate?
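
To make the revalidation concern concrete, here is a purely illustrative
userspace sketch (not from this thread, with hypothetical names) of the
usual pattern when a lock must be dropped around a call that may take it
again: snapshot the state you depend on, drop the lock, do the risky
work, re-take the lock, and check that the state is still what you
assumed before continuing.

/* Purely illustrative userspace sketch; the names are hypothetical. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int generation;          /* bumped whenever the protected state changes */

/* Work that must not run under the lock (e.g. it may take the lock itself). */
static void risky_operation(void)
{
}

static bool process_with_relock(void)
{
        bool still_valid;
        int seen;

        pthread_mutex_lock(&lock);
        seen = generation;              /* snapshot the state the caller depends on */

        pthread_mutex_unlock(&lock);    /* drop the lock around the risky call */
        risky_operation();
        pthread_mutex_lock(&lock);      /* re-acquire ... */

        still_valid = (generation == seen);     /* ... and revalidate */
        pthread_mutex_unlock(&lock);
        return still_valid;             /* caller retries or bails out if false */
}

int main(void)
{
        printf("state still valid after relock: %d\n", process_with_relock());
        return 0;
}
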
Zhaoyang Huang June 18, 2024, 3:27 a.m. UTC | #2
On Tue, Jun 18, 2024 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Tue, Jun 18, 2024 at 10:09:26AM +0800, zhaoyang.huang wrote:
> > Hard lockup[2] is reported which should be caused by recursive
> > lock contention of lruvec->lru_lock[1] within __split_huge_page.
> >
> > [1]
> > static void __split_huge_page(struct page *page, struct list_head *list,
> >                 pgoff_t end, unsigned int new_order)
> > {
> >         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
> > //1st lock here
> >         lruvec = folio_lruvec_lock(folio);
> >
> >         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
> >                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
> >                 /* Some pages can be beyond EOF: drop them from page cache */
> >                 if (head[i].index >= end) {
> >                         folio_put(tail);
> >                             __page_cache_release
> > //2nd lock here
> >                                folio_lruvec_relock_irqsave
>
> Why doesn't lockdep catch this?
It was reported by a regression test of the fix patch that addresses
the find_get_entry livelock issue below. I don't know the details of
the kernel configuration.

https://lore.kernel.org/linux-mm/5f989315-e380-46aa-80d1-ce8608889e5f@marcinwanat.pl/

>
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 9859aa4f7553..ea504df46d3b 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -2925,7 +2925,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
> >                               folio_account_cleaned(tail,
> >                                       inode_to_wb(folio->mapping->host));
> >                       __filemap_remove_folio(tail, NULL);
> > +                     unlock_page_lruvec(lruvec);
> >                       folio_put(tail);
> > +                     folio_lruvec_lock(folio);
>
> Why is it safe to drop & reacquire this lock?  Is there nothing we need
> to revalidate?
My mistake. I will take that into consideration in the next version.
>
Matthew Wilcox (Oracle) June 18, 2024, 3:31 a.m. UTC | #3
On Tue, Jun 18, 2024 at 11:27:12AM +0800, Zhaoyang Huang wrote:
> On Tue, Jun 18, 2024 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Tue, Jun 18, 2024 at 10:09:26AM +0800, zhaoyang.huang wrote:
> > > Hard lockup[2] is reported which should be caused by recursive
> > > lock contention of lruvec->lru_lock[1] within __split_huge_page.
> > >
> > > [1]
> > > static void __split_huge_page(struct page *page, struct list_head *list,
> > >                 pgoff_t end, unsigned int new_order)
> > > {
> > >         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
> > > //1st lock here
> > >         lruvec = folio_lruvec_lock(folio);
> > >
> > >         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
> > >                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
> > >                 /* Some pages can be beyond EOF: drop them from page cache */
> > >                 if (head[i].index >= end) {
> > >                         folio_put(tail);
> > >                             __page_cache_release
> > > //2nd lock here
> > >                                folio_lruvec_relock_irqsave
> >
> > Why doesn't lockdep catch this?
> It is reported by a regression test of the fix patch which aims at the
> find_get_entry livelock issue as below. I don't know the details of
> the kernel configuration.
> 
> https://lore.kernel.org/linux-mm/5f989315-e380-46aa-80d1-ce8608889e5f@marcinwanat.pl/

Go away.
Zhaoyang Huang June 18, 2024, 3:42 a.m. UTC | #4
On Tue, Jun 18, 2024 at 11:31 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Tue, Jun 18, 2024 at 11:27:12AM +0800, Zhaoyang Huang wrote:
> > On Tue, Jun 18, 2024 at 11:19 AM Matthew Wilcox <willy@infradead.org> wrote:
> > >
> > > On Tue, Jun 18, 2024 at 10:09:26AM +0800, zhaoyang.huang wrote:
> > > > Hard lockup[2] is reported which should be caused by recursive
> > > > lock contention of lruvec->lru_lock[1] within __split_huge_page.
> > > >
> > > > [1]
> > > > static void __split_huge_page(struct page *page, struct list_head *list,
> > > >                 pgoff_t end, unsigned int new_order)
> > > > {
> > > >         /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
> > > > //1st lock here
> > > >         lruvec = folio_lruvec_lock(folio);
> > > >
> > > >         for (i = nr - new_nr; i >= new_nr; i -= new_nr) {
> > > >                 __split_huge_page_tail(folio, i, lruvec, list, new_order);
> > > >                 /* Some pages can be beyond EOF: drop them from page cache */
> > > >                 if (head[i].index >= end) {
> > > >                         folio_put(tail);
> > > >                             __page_cache_release
> > > > //2nd lock here
> > > >                                folio_lruvec_relock_irqsave
> > >
> > > Why doesn't lockdep catch this?
> > It is reported by a regression test of the fix patch which aims at the
> > find_get_entry livelock issue as below. I don't know the details of
> > the kernel configuration.
> >
> > https://lore.kernel.org/linux-mm/5f989315-e380-46aa-80d1-ce8608889e5f@marcinwanat.pl/
>
> Go away.
OK, you are the boss anyway. But this call chain does carry the risk of
deadlock, right? Besides, the livelock issue caused by zero-refcount
folios within find_get_entry keeps being reported by different users.

https://lore.kernel.org/linux-mm/CALOAHbC8NM7R-pKvPW6m4fnn_8BQZuPjJrNZaEN=sg67Gp+NGQ@mail.gmail.com/

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9859aa4f7553..ea504df46d3b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2925,7 +2925,9 @@  static void __split_huge_page(struct page *page, struct list_head *list,
 				folio_account_cleaned(tail,
 					inode_to_wb(folio->mapping->host));
 			__filemap_remove_folio(tail, NULL);
+			unlock_page_lruvec(lruvec);
 			folio_put(tail);
+			folio_lruvec_lock(folio);
 		} else if (!PageAnon(page)) {
 			__xa_store(&folio->mapping->i_pages, head[i].index,
 					head + i, 0);
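
On the question raised in the thread about dropping and re-taking the
lock around each folio_put(), one alternative shape (a hedged sketch
only, with hypothetical names, and not the fix that was accepted
upstream) is to queue the releases locally and perform them once, after
the lock has been dropped, so the lock is never cycled in the middle of
the loop:

/* Hedged userspace sketch of the defer-the-release shape; names are hypothetical. */
#include <pthread.h>
#include <stdio.h>

#define NR_ITEMS 4

struct item {
        int id;
        int beyond_eof;         /* stands in for "head[i].index >= end" */
};

static pthread_spinlock_t lock; /* stands in for lruvec->lru_lock */

/* In the kernel analogue this is folio_put(), which may want the lock again. */
static void release_item(struct item *it)
{
        printf("released item %d outside the lock\n", it->id);
}

int main(void)
{
        struct item items[NR_ITEMS] = { {0, 1}, {1, 0}, {2, 1}, {3, 0} };
        struct item *deferred[NR_ITEMS];
        int nr_deferred = 0;
        int i;

        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);

        pthread_spin_lock(&lock);
        for (i = 0; i < NR_ITEMS; i++) {
                if (items[i].beyond_eof)
                        deferred[nr_deferred++] = &items[i];    /* defer, do not release here */
        }
        pthread_spin_unlock(&lock);

        for (i = 0; i < nr_deferred; i++)
                release_item(deferred[i]);      /* safe: the lock is no longer held */

        pthread_spin_destroy(&lock);
        return 0;
}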