
[RFC,2/2] mm/khugepaged: Remove compound_pagelist

Message ID 20230922193639.10158-3-vishal.moola@gmail.com (mailing list archive)
State New
Series Remove compound_pagelist from khugepaged

Commit Message

Vishal Moola Sept. 22, 2023, 7:36 p.m. UTC
Currently, khugepaged builds a compound_pagelist while scanning, which
is used to properly account for compound pages. We can now account
for a compound page as a singular folio instead, so remove this list.

Large folios are guaranteed to have consecutive ptes and addresses, so
once the first pte of a large folio is found skip over the rest.

This helps convert khugepaged to use folios. It removes 3 compound_head
calls in __collapse_huge_page_copy_succeeded(), and removes 980 bytes of
kernel text.

Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
---
 mm/khugepaged.c | 76 ++++++++++++-------------------------------------
 1 file changed, 18 insertions(+), 58 deletions(-)

Comments

Yang Shi Sept. 26, 2023, 10:07 p.m. UTC | #1
On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
<vishal.moola@gmail.com> wrote:
>
> Currently, khugepaged builds a compound_pagelist while scanning, which
> is used to properly account for compound pages. We can now account
> for a compound page as a singular folio instead, so remove this list.
>
> Large folios are guaranteed to have consecutive ptes and addresses, so
> once the first pte of a large folio is found skip over the rest.

The address space may map just part of a folio. In the extreme case, a
HUGE_PMD-sized range may be backed by HPAGE_PMD_NR different folios,
with each PTE mapping one subpage from a different folio. So assuming a
PTE-mapped large folio is mapped consecutively may be wrong.

Please refer to collapse_compound_extreme() in
tools/testing/selftests/mm/khugepaged.c.

>
> This helps convert khugepaged to use folios. It removes 3 compound_head
> calls in __collapse_huge_page_copy_succeeded(), and removes 980 bytes of
> kernel text.
>
> Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> ---
>  mm/khugepaged.c | 76 ++++++++++++-------------------------------------
>  1 file changed, 18 insertions(+), 58 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f46a7a7c489f..b6c7d55a8231 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -498,10 +498,9 @@ static void release_pte_page(struct page *page)
>         release_pte_folio(page_folio(page));
>  }
>
> -static void release_pte_pages(pte_t *pte, pte_t *_pte,
> -               struct list_head *compound_pagelist)
> +static void release_pte_folios(pte_t *pte, pte_t *_pte)
>  {
> -       struct folio *folio, *tmp;
> +       struct folio *folio;
>
>         while (--_pte >= pte) {
>                 pte_t pteval = ptep_get(_pte);
> @@ -514,12 +513,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>                         continue;
>                 folio = pfn_folio(pfn);
>                 if (folio_test_large(folio))
> -                       continue;
> -               release_pte_folio(folio);
> -       }
> -
> -       list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
> -               list_del(&folio->lru);
> +                       _pte -= folio_nr_pages(folio) - 1;
>                 release_pte_folio(folio);
>         }
>  }
> @@ -538,8 +532,7 @@ static bool is_refcount_suitable(struct page *page)
>  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                                         unsigned long address,
>                                         pte_t *pte,
> -                                       struct collapse_control *cc,
> -                                       struct list_head *compound_pagelist)
> +                                       struct collapse_control *cc)
>  {
>         struct folio *folio = NULL;
>         pte_t *_pte;
> @@ -588,19 +581,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                         }
>                 }
>
> -               if (folio_test_large(folio)) {
> -                       struct folio *f;
> -
> -                       /*
> -                        * Check if we have dealt with the compound page
> -                        * already
> -                        */
> -                       list_for_each_entry(f, compound_pagelist, lru) {
> -                               if (folio == f)
> -                                       goto next;
> -                       }
> -               }
> -
>                 /*
>                  * We can do it before isolate_lru_page because the
>                  * page can't be freed from under us. NOTE: PG_lock
> @@ -644,9 +624,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
>                 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
>
> -               if (folio_test_large(folio))
> -                       list_add_tail(&folio->lru, compound_pagelist);
> -next:
>                 /*
>                  * If collapse was initiated by khugepaged, check that there is
>                  * enough young pte to justify collapsing the page
> @@ -660,6 +637,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                 if (pte_write(pteval))
>                         writable = true;
>
> +               if (folio_test_large(folio)) {
> +                       _pte += folio_nr_pages(folio) - 1;
> +                       address += folio_size(folio) - PAGE_SIZE;
> +               }
>         }
>
>         if (unlikely(!writable)) {
> @@ -673,7 +654,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                 return result;
>         }
>  out:
> -       release_pte_pages(pte, _pte, compound_pagelist);
> +       release_pte_folios(pte, _pte);
>         trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
>                                             referenced, writable, result);
>         return result;
> @@ -682,11 +663,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>                                                 struct vm_area_struct *vma,
>                                                 unsigned long address,
> -                                               spinlock_t *ptl,
> -                                               struct list_head *compound_pagelist)
> +                                               spinlock_t *ptl)
>  {
>         struct page *src_page;
> -       struct page *tmp;
>         pte_t *_pte;
>         pte_t pteval;
>
> @@ -706,8 +685,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>                         }
>                 } else {
>                         src_page = pte_page(pteval);
> -                       if (!PageCompound(src_page))
> -                               release_pte_page(src_page);
> +                       release_pte_page(src_page);
>                         /*
>                          * ptl mostly unnecessary, but preempt has to
>                          * be disabled to update the per-cpu stats
> @@ -720,23 +698,12 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
>                         free_page_and_swap_cache(src_page);
>                 }
>         }
> -
> -       list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
> -               list_del(&src_page->lru);
> -               mod_node_page_state(page_pgdat(src_page),
> -                                   NR_ISOLATED_ANON + page_is_file_lru(src_page),
> -                                   -compound_nr(src_page));
> -               unlock_page(src_page);
> -               free_swap_cache(src_page);
> -               putback_lru_page(src_page);
> -       }
>  }
>
>  static void __collapse_huge_page_copy_failed(pte_t *pte,
>                                              pmd_t *pmd,
>                                              pmd_t orig_pmd,
> -                                            struct vm_area_struct *vma,
> -                                            struct list_head *compound_pagelist)
> +                                            struct vm_area_struct *vma)
>  {
>         spinlock_t *pmd_ptl;
>
> @@ -753,7 +720,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>          * Release both raw and compound pages isolated
>          * in __collapse_huge_page_isolate.
>          */
> -       release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> +       release_pte_folios(pte, pte + HPAGE_PMD_NR);
>  }
>
>  /*
> @@ -769,7 +736,6 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
>   * @vma: the original raw pages' virtual memory area
>   * @address: starting address to copy
>   * @ptl: lock on raw pages' PTEs
> - * @compound_pagelist: list that stores compound pages
>   */
>  static int __collapse_huge_page_copy(pte_t *pte,
>                                      struct page *page,
> @@ -777,8 +743,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
>                                      pmd_t orig_pmd,
>                                      struct vm_area_struct *vma,
>                                      unsigned long address,
> -                                    spinlock_t *ptl,
> -                                    struct list_head *compound_pagelist)
> +                                    spinlock_t *ptl)
>  {
>         struct page *src_page;
>         pte_t *_pte;
> @@ -804,11 +769,9 @@ static int __collapse_huge_page_copy(pte_t *pte,
>         }
>
>         if (likely(result == SCAN_SUCCEED))
> -               __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> -                                                   compound_pagelist);
> +               __collapse_huge_page_copy_succeeded(pte, vma, address, ptl);
>         else
> -               __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> -                                                compound_pagelist);
> +               __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma);
>
>         return result;
>  }
> @@ -1081,7 +1044,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>                               int referenced, int unmapped,
>                               struct collapse_control *cc)
>  {
> -       LIST_HEAD(compound_pagelist);
>         pmd_t *pmd, _pmd;
>         pte_t *pte;
>         pgtable_t pgtable;
> @@ -1168,8 +1130,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>
>         pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>         if (pte) {
> -               result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -                                                     &compound_pagelist);
> +               result = __collapse_huge_page_isolate(vma, address, pte, cc);
>                 spin_unlock(pte_ptl);
>         } else {
>                 result = SCAN_PMD_NULL;
> @@ -1198,8 +1159,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>         anon_vma_unlock_write(vma->anon_vma);
>
>         result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
> -                                          vma, address, pte_ptl,
> -                                          &compound_pagelist);
> +                                          vma, address, pte_ptl);
>         pte_unmap(pte);
>         if (unlikely(result != SCAN_SUCCEED))
>                 goto out_up_write;
> --
> 2.40.1
>
Matthew Wilcox Sept. 28, 2023, 9:05 a.m. UTC | #2
On Tue, Sep 26, 2023 at 03:07:18PM -0700, Yang Shi wrote:
> On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
> <vishal.moola@gmail.com> wrote:
> >
> > Currently, khugepaged builds a compound_pagelist while scanning, which
> > is used to properly account for compound pages. We can now account
> > for a compound page as a singular folio instead, so remove this list.
> >
> > Large folios are guaranteed to have consecutive ptes and addresses, so
> > once the first pte of a large folio is found skip over the rest.
> 
> The address space may just map a partial folio, for example, in the
> extreme case the HUGE_PMD size range may have HUGE_PMD_NR folios with
> mapping one subpage from each folio per PTE. So assuming the PTE
> mapped folio is mapped consecutively may be wrong.

How?  You can do that with two VMAs, but this is limited to scanning
within a single VMA.  If we've COWed a large folio, we currently do
so as a single page folio, and I'm not seeing any demand to change that.
If we did COW as a large folio, we'd COW every page in that folio.
How do we interleave two large folios in the same VMA?

> Please refer to collapse_compound_extreme() in
> tools/testing/selftests/mm/khugepaged.c.

I agree that running that part of the test-suite would be useful, but
could you point to which test specifically would create a problem here?
Yang Shi Sept. 28, 2023, 7:33 p.m. UTC | #3
On Thu, Sep 28, 2023 at 2:05 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Tue, Sep 26, 2023 at 03:07:18PM -0700, Yang Shi wrote:
> > On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
> > <vishal.moola@gmail.com> wrote:
> > >
> > > Currently, khugepaged builds a compound_pagelist while scanning, which
> > > is used to properly account for compound pages. We can now account
> > > for a compound page as a singular folio instead, so remove this list.
> > >
> > > Large folios are guaranteed to have consecutive ptes and addresses, so
> > > once the first pte of a large folio is found skip over the rest.
> >
> > The address space may just map a partial folio, for example, in the
> > extreme case the HUGE_PMD size range may have HUGE_PMD_NR folios with
> > mapping one subpage from each folio per PTE. So assuming the PTE
> > mapped folio is mapped consecutively may be wrong.
>
> How?  You can do that with two VMAs, but this is limited to scanning
> within a single VMA.  If we've COWed a large folio, we currently do
> so as a single page folio, and I'm not seeing any demand to change that.
> If we did COW as a large folio, we'd COW every page in that folio.
> How do we interleave two large folios in the same VMA?

It is not about COW. The magic from mremap() may cause some corner
cases. For example,

We can have a 2M VMA where every 4K is mapped to a subpage of a
different folio, like:

0: #0 subpage of folio #0
1: #1 subpage of folio #1
2: #2 subpage of folio #2
....
511: #511 subpage of folio #511

When khugepaged scans the VMA, it may isolate and lock only folio #0,
but skip all the other folios since it assumes the whole range is
mapped by folio #0.

This may trigger a kernel BUG when unlocking folios that were never
actually locked, and may cause data corruption since the other folios
can go away under us (unisolated, unlocked and unpinned).

>
> > Please refer to collapse_compound_extreme() in
> > tools/testing/selftests/mm/khugepaged.c.
>
> I agree that running that part of the test-suite would be useful, but
> could you point to which test specifically would create a problem here?

collapse_compound_extreme. I ran the test with this series (patch #1
is fine), and it did trigger a kernel BUG:

Collapse PTE table full of different compound pages....
[  259.634265] page:0000000088fc2e6f refcount:1 mapcount:1 mapping:0000000000000000 index:0x40001 pfn:0x124a00
[  259.636880] head:0000000088fc2e6f order:9 entire_mapcount:0 nr_pages_mapped:1 pincount:0
[  259.639158] memcg:ffff002035ab9000
[  259.640041] anon flags: 0x2fffff000a0078(uptodate|dirty|lru|head|mappedtodisk|swapbacked|node=0|zone=2|lastcpupid=0xfffff)
[  259.642884] page_type: 0x0()
[  259.643626] raw: 002fffff000a0078 fffffc001fd1a908 fffffc0003930008 ffff0007fd376209
[  259.645609] raw: 0000000000040001 0000000000000000 0000000100000000 ffff002035ab9000
[  259.647580] page dumped because: VM_BUG_ON_FOLIO(!folio_test_locked(folio))
[  259.649535] ------------[ cut here ]------------
[  259.650739] kernel BUG at mm/filemap.c:1524!
[  259.651856] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[  259.653419] Modules linked in: rfkill vfat fat virtio_net net_failover failover fuse zram crct10dif_ce ghash_ce virtio_blk qemu_fw_cfg virtio_mmio ipmi_devintf ipmi_msghandler
[  259.657448] CPU: 6 PID: 828 Comm: khugepaged Not tainted 6.6.0-rc2+ #27
[  259.659269] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[  259.661168] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[  259.663095] pc : folio_unlock+0x7c/0x88
[  259.664169] lr : folio_unlock+0x7c/0x88
[  259.665236] sp : ffff800086c9bae0
[  259.666169] x29: ffff800086c9bae0 x28: 00000000020378a0 x27: ffff0000d81e4000
[  259.668332] x26: ffff8000826b43ec x25: ffff0000d81e4000 x24: 0000000000000001
[  259.670301] x23: ffff8000826b0b88 x22: ffff002062a25928 x21: fffffc00036078e8
[  259.672240] x20: fffffc0003928000 x19: fffffc0003928000 x18: ffffffffffffffff
[  259.674205] x17: 3032303066666666 x16: 2030303030303030 x15: 0000000000000006
[  259.676161] x14: 0000000000000000 x13: 29296f696c6f6628 x12: 64656b636f6c5f74
[  259.678301] x11: 00000000ffff7fff x10: ffff003f5f1f0000 x9 : ffff800080248730
[  259.680187] x8 : 00000000000bffe8 x7 : c0000000ffff7fff x6 : 00000000002bffa8
[  259.682111] x5 : ffff001ffec79988 x4 : 0000000000000000 x3 : 0000000000000000
[  259.683804] x2 : ffff001ffec86370 x1 : ffff801f7cbdc000 x0 : 000000000000003f
[  259.685500] Call trace:
[  259.686096]  folio_unlock+0x7c/0x88
[  259.686969]  release_pte_folio+0x64/0x80
[  259.687983]  __collapse_huge_page_copy_succeeded.isra.0+0x98/0x228
[  259.689495]  collapse_huge_page+0x4b8/0x708
[  259.690520]  hpage_collapse_scan_pmd+0x208/0x558
[  259.691565]  khugepaged_scan_mm_slot.constprop.0+0x33c/0x470
[  259.692934]  khugepaged+0xf0/0x2b0
[  259.693744]  kthread+0xe0/0xf0
[  259.694493]  ret_from_fork+0x10/0x20
[  259.695381] Code: d65f03c0 b0009a81 9139e021 94012728 (d4210000)
[  259.696856] ---[ end trace 0000000000000000 ]---
[  259.697977] note: khugepaged[828] exited with irqs disabled


collapse_full_of_compound also failed with the same BUG.

Collapse PTE table full of compound pages....
[  145.909210] page:0000000004cc4ad2 refcount:511 mapcount:0 mapping:0000000000000000 index:0x400[50/4619]20ba000
[  145.911807] head:0000000004cc4ad2 order:9 entire_mapcount:0 nr_pages_mapped:511 pincount:0
[  145.913806] memcg:ffff002019745000
[  145.914686] anon flags: 0xafffff000a0078(uptodate|dirty|lru|head|mappedtodisk|swapbacked|node=1|zone=2|lastcpupid=0xfffff)
[  145.917667] page_type: 0xffffffff()
[  145.918580] raw: 00afffff000a0078 fffffc0081b37a08 ffff0020680f6800 ffff002044e834e1
[  145.920555] raw: 0000000000040000 0000000000000000 000001ffffffffff ffff002019745000
[  145.922466] page dumped because: VM_BUG_ON_FOLIO(!folio_test_locked(folio))
[  145.924151] ------------[ cut here ]------------
[  145.925326] kernel BUG at mm/filemap.c:1524!
[  145.926499] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[  145.928199] Modules linked in: rfkill vfat fat virtio_net net_failover failover fuse zram crct10dif_ce ghash_ce virtio_blk qemu_fw_cfg virtio_mmio ipmi_devintf ipmi_msghandler
[  145.932379] CPU: 135 PID: 2193 Comm: khugepaged Not tainted 6.6.0-rc2+ #27
[  145.934265] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
[  145.936150] pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[  145.938066] pc : folio_unlock+0x7c/0x88
[  145.939140] lr : folio_unlock+0x7c/0x88
[  145.940208] sp : ffff800099d2bae0
[  145.941134] x29: ffff800099d2bae0 x28: 0000000002037aa0 x27: ffff0020656e0000
[  145.943049] x26: ffff8000826b43ec x25: ffff0020656e0000 x24: 0000000000000001
[  145.944988] x23: ffff8000826b0b88 x22: ffff00203e0b1ee8 x21: fffffc008195b7e8
[  145.946973] x20: fffffc0081e80040 x19: fffffc0081e80000 x18: ffffffffffffffff
[  145.948914] x17: 3032303066666666 x16: 2066666666666666 x15: 0000000000000006
[  145.950882] x14: 0000000000000000 x13: 29296f696c6f6628 x12: 64656b636f6c5f74
[  145.952779] x11: 00000000ffff7fff x10: ffff003f5f1f0000 x9 : ffff80008013bea0
[  145.954750] x8 : 00000000000bffe8 x7 : c0000000ffff7fff x6 : 00000000002bffa8
[  145.956831] x5 : ffff003f5faca988 x4 : ffff800099d2b930 x3 : ffff803edda2d000
[  145.958538] x2 : ffff003f5faca980 x1 : 0000000000000023 x0 : 000000000000003f
[  145.960300] Call trace:
[  145.960914]  folio_unlock+0x7c/0x88
[  145.961810]  release_pte_folio+0x64/0x80
[  145.962824]  __collapse_huge_page_copy_succeeded.isra.0+0x98/0x228
[  145.964270]  collapse_huge_page+0x4b8/0x708
[  145.965313]  hpage_collapse_scan_pmd+0x208/0x558
[  145.966629]  khugepaged_scan_mm_slot.constprop.0+0x33c/0x470
[  145.967825]  khugepaged+0xf0/0x2b0
[  145.968586]  kthread+0xe0/0xf0
[  145.969274]  ret_from_fork+0x10/0x20
[  145.970081] Code: d65f03c0 b0009a81 9139e021 94012728 (d4210000)
[  145.971411] ---[ end trace 0000000000000000 ]---
[  145.972439] note: khugepaged[2193] exited with irqs disabled
Yang Shi Sept. 29, 2023, 7:07 p.m. UTC | #4
On Tue, Sep 26, 2023 at 3:07 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
> <vishal.moola@gmail.com> wrote:
> >
> > Currently, khugepaged builds a compound_pagelist while scanning, which
> > is used to properly account for compound pages. We can now account
> > for a compound page as a singular folio instead, so remove this list.
> >
> > Large folios are guaranteed to have consecutive ptes and addresses, so
> > once the first pte of a large folio is found skip over the rest.
>
> The address space may just map a partial folio, for example, in the
> extreme case the HUGE_PMD size range may have HUGE_PMD_NR folios with
> mapping one subpage from each folio per PTE. So assuming the PTE
> mapped folio is mapped consecutively may be wrong.
>
> Please refer to collapse_compound_extreme() in
> tools/testing/selftests/mm/khugepaged.c.
>
> >
> > This helps convert khugepaged to use folios. It removes 3 compound_head
> > calls in __collapse_huge_page_copy_succeeded(), and removes 980 bytes of
> > kernel text.
> >
> > Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
> > ---
> >  mm/khugepaged.c | 76 ++++++++++++-------------------------------------
> >  1 file changed, 18 insertions(+), 58 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index f46a7a7c489f..b6c7d55a8231 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -498,10 +498,9 @@ static void release_pte_page(struct page *page)
> >         release_pte_folio(page_folio(page));
> >  }
> >
> > -static void release_pte_pages(pte_t *pte, pte_t *_pte,
> > -               struct list_head *compound_pagelist)
> > +static void release_pte_folios(pte_t *pte, pte_t *_pte)
> >  {
> > -       struct folio *folio, *tmp;
> > +       struct folio *folio;
> >
> >         while (--_pte >= pte) {
> >                 pte_t pteval = ptep_get(_pte);
> > @@ -514,12 +513,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
> >                         continue;
> >                 folio = pfn_folio(pfn);
> >                 if (folio_test_large(folio))
> > -                       continue;
> > -               release_pte_folio(folio);
> > -       }
> > -
> > -       list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
> > -               list_del(&folio->lru);
> > +                       _pte -= folio_nr_pages(folio) - 1;
> >                 release_pte_folio(folio);
> >         }
> >  }
> > @@ -538,8 +532,7 @@ static bool is_refcount_suitable(struct page *page)
> >  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                                         unsigned long address,
> >                                         pte_t *pte,
> > -                                       struct collapse_control *cc,
> > -                                       struct list_head *compound_pagelist)
> > +                                       struct collapse_control *cc)
> >  {
> >         struct folio *folio = NULL;
> >         pte_t *_pte;
> > @@ -588,19 +581,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                         }
> >                 }
> >
> > -               if (folio_test_large(folio)) {
> > -                       struct folio *f;
> > -
> > -                       /*
> > -                        * Check if we have dealt with the compound page
> > -                        * already
> > -                        */
> > -                       list_for_each_entry(f, compound_pagelist, lru) {
> > -                               if (folio == f)
> > -                                       goto next;
> > -                       }
> > -               }
> > -
> >                 /*
> >                  * We can do it before isolate_lru_page because the
> >                  * page can't be freed from under us. NOTE: PG_lock
> > @@ -644,9 +624,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
> >                 VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
> >
> > -               if (folio_test_large(folio))
> > -                       list_add_tail(&folio->lru, compound_pagelist);
> > -next:
> >                 /*
> >                  * If collapse was initiated by khugepaged, check that there is
> >                  * enough young pte to justify collapsing the page
> > @@ -660,6 +637,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                 if (pte_write(pteval))
> >                         writable = true;
> >
> > +               if (folio_test_large(folio)) {
> > +                       _pte += folio_nr_pages(folio) - 1;
> > +                       address += folio_size(folio) - PAGE_SIZE;
> > +               }
> >         }
> >
> >         if (unlikely(!writable)) {
> > @@ -673,7 +654,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >                 return result;
> >         }
> >  out:
> > -       release_pte_pages(pte, _pte, compound_pagelist);
> > +       release_pte_folios(pte, _pte);
> >         trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
> >                                             referenced, writable, result);
> >         return result;
> > @@ -682,11 +663,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
> >  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >                                                 struct vm_area_struct *vma,
> >                                                 unsigned long address,
> > -                                               spinlock_t *ptl,
> > -                                               struct list_head *compound_pagelist)
> > +                                               spinlock_t *ptl)
> >  {
> >         struct page *src_page;
> > -       struct page *tmp;
> >         pte_t *_pte;
> >         pte_t pteval;
> >
> > @@ -706,8 +685,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >                         }
> >                 } else {
> >                         src_page = pte_page(pteval);
> > -                       if (!PageCompound(src_page))
> > -                               release_pte_page(src_page);
> > +                       release_pte_page(src_page);

This line is problematic too. It may cause a double unlock if I read
it correctly. The loop visits every mapped subpage of the same folio,
so release_pte_page() is called on that folio multiple times.
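
One conceivable direction (an untested sketch, not something this
series does) would be to release each source folio only once when
several consecutive PTEs map its subpages, e.g.:

		struct folio *folio, *prev = NULL;

		for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
		     _pte++, address += PAGE_SIZE) {
			pteval = ptep_get(_pte);
			if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval)))
				continue;
			folio = page_folio(pte_page(pteval));
			/* Only unlock/putback a folio for its first PTE. */
			if (folio != prev) {
				release_pte_folio(folio);
				prev = folio;
			}
			/* per-PTE work (rmap, stats, freeing) unchanged */
		}

Even then the rest of the per-PTE teardown would need auditing, and it
still would not help the interleaved mremap() layout discussed above.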

> >                         /*
> >                          * ptl mostly unnecessary, but preempt has to
> >                          * be disabled to update the per-cpu stats
> > @@ -720,23 +698,12 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> >                         free_page_and_swap_cache(src_page);
> >                 }
> >         }
> > -
> > -       list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
> > -               list_del(&src_page->lru);
> > -               mod_node_page_state(page_pgdat(src_page),
> > -                                   NR_ISOLATED_ANON + page_is_file_lru(src_page),
> > -                                   -compound_nr(src_page));
> > -               unlock_page(src_page);
> > -               free_swap_cache(src_page);
> > -               putback_lru_page(src_page);
> > -       }
> >  }
> >
> >  static void __collapse_huge_page_copy_failed(pte_t *pte,
> >                                              pmd_t *pmd,
> >                                              pmd_t orig_pmd,
> > -                                            struct vm_area_struct *vma,
> > -                                            struct list_head *compound_pagelist)
> > +                                            struct vm_area_struct *vma)
> >  {
> >         spinlock_t *pmd_ptl;
> >
> > @@ -753,7 +720,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> >          * Release both raw and compound pages isolated
> >          * in __collapse_huge_page_isolate.
> >          */
> > -       release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> > +       release_pte_folios(pte, pte + HPAGE_PMD_NR);
> >  }
> >
> >  /*
> > @@ -769,7 +736,6 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> >   * @vma: the original raw pages' virtual memory area
> >   * @address: starting address to copy
> >   * @ptl: lock on raw pages' PTEs
> > - * @compound_pagelist: list that stores compound pages
> >   */
> >  static int __collapse_huge_page_copy(pte_t *pte,
> >                                      struct page *page,
> > @@ -777,8 +743,7 @@ static int __collapse_huge_page_copy(pte_t *pte,
> >                                      pmd_t orig_pmd,
> >                                      struct vm_area_struct *vma,
> >                                      unsigned long address,
> > -                                    spinlock_t *ptl,
> > -                                    struct list_head *compound_pagelist)
> > +                                    spinlock_t *ptl)
> >  {
> >         struct page *src_page;
> >         pte_t *_pte;
> > @@ -804,11 +769,9 @@ static int __collapse_huge_page_copy(pte_t *pte,
> >         }
> >
> >         if (likely(result == SCAN_SUCCEED))
> > -               __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> > -                                                   compound_pagelist);
> > +               __collapse_huge_page_copy_succeeded(pte, vma, address, ptl);
> >         else
> > -               __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> > -                                                compound_pagelist);
> > +               __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma);
> >
> >         return result;
> >  }
> > @@ -1081,7 +1044,6 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >                               int referenced, int unmapped,
> >                               struct collapse_control *cc)
> >  {
> > -       LIST_HEAD(compound_pagelist);
> >         pmd_t *pmd, _pmd;
> >         pte_t *pte;
> >         pgtable_t pgtable;
> > @@ -1168,8 +1130,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >
> >         pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> >         if (pte) {
> > -               result = __collapse_huge_page_isolate(vma, address, pte, cc,
> > -                                                     &compound_pagelist);
> > +               result = __collapse_huge_page_isolate(vma, address, pte, cc);
> >                 spin_unlock(pte_ptl);
> >         } else {
> >                 result = SCAN_PMD_NULL;
> > @@ -1198,8 +1159,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
> >         anon_vma_unlock_write(vma->anon_vma);
> >
> >         result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
> > -                                          vma, address, pte_ptl,
> > -                                          &compound_pagelist);
> > +                                          vma, address, pte_ptl);
> >         pte_unmap(pte);
> >         if (unlikely(result != SCAN_SUCCEED))
> >                 goto out_up_write;
> > --
> > 2.40.1
> >
Vishal Moola Oct. 2, 2023, 3:55 p.m. UTC | #5
On Thu, Sep 28, 2023 at 12:33 PM Yang Shi <shy828301@gmail.com> wrote:
>
> On Thu, Sep 28, 2023 at 2:05 AM Matthew Wilcox <willy@infradead.org> wrote:
> >
> > On Tue, Sep 26, 2023 at 03:07:18PM -0700, Yang Shi wrote:
> > > On Fri, Sep 22, 2023 at 9:33 PM Vishal Moola (Oracle)
> > > <vishal.moola@gmail.com> wrote:
> > > >
> > > > Currently, khugepaged builds a compound_pagelist while scanning, which
> > > > is used to properly account for compound pages. We can now account
> > > > for a compound page as a singular folio instead, so remove this list.
> > > >
> > > > Large folios are guaranteed to have consecutive ptes and addresses, so
> > > > once the first pte of a large folio is found skip over the rest.
> > >
> > > The address space may just map a partial folio, for example, in the
> > > extreme case the HUGE_PMD size range may have HUGE_PMD_NR folios with
> > > mapping one subpage from each folio per PTE. So assuming the PTE
> > > mapped folio is mapped consecutively may be wrong.
> >
> > How?  You can do that with two VMAs, but this is limited to scanning
> > within a single VMA.  If we've COWed a large folio, we currently do
> > so as a single page folio, and I'm not seeing any demand to change that.
> > If we did COW as a large folio, we'd COW every page in that folio.
> > How do we interleave two large folios in the same VMA?
>
> It is not about COW. The magic from mremap() may cause some corner
> cases. For example,
>
> We have a 2M VMA, every 4K of the VMA may be mapped to a subpage from
> different folios. Like:
>
> 0: #0 subpage of folio #0
> 1: #1 subpage of folio #1
> 2: #2 subpage of folio #2
> ....
> 511: #511 subpage of folio #511
>
> When khugepaged is scanning the VMA, it may just isolate and lock the
> folio #0, but skip all other folios since it assumes the VMA is just
> mapped by folio #0.
>
> This may trigger kernel bug when unlocking other folios which are
> actually not locked and maybe data corruption since the other folios
> may go away under us (unisolated, unlocked and unpinned).

Thanks for the review. I did not know this could happen; I'll drop
this patch for
now until I can think of a better way to iterate through ptes for large folios.
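
For what it's worth, one possible direction (an untested sketch, not
part of this series) would be to take the skip in
__collapse_huge_page_isolate() only when the following PTEs really do
map the folio's remaining subpages in order, and fall back to per-PTE
handling otherwise:

/*
 * Untested sketch: return the number of PTEs that map @folio's subpages
 * consecutively starting at @pte, or 0 if the run is not contiguous.
 * The caller must ensure @pte + folio_nr_pages(@folio) does not run
 * past the PTE table being scanned.
 */
static long folio_ptes_contiguous(struct folio *folio, pte_t *pte,
				  unsigned long pfn)
{
	long i, nr = folio_nr_pages(folio);

	/* Only a PTE mapping the head page can start a full run. */
	if (pfn != folio_pfn(folio))
		return 0;

	for (i = 1; i < nr; i++) {
		pte_t entry = ptep_get(pte + i);

		if (!pte_present(entry) || pte_pfn(entry) != pfn + i)
			return 0;
	}
	return nr;
}

The copy/release paths would need the same treatment so that each folio
is unlocked exactly once, which is the bookkeeping the compound_pagelist
currently provides.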

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f46a7a7c489f..b6c7d55a8231 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -498,10 +498,9 @@  static void release_pte_page(struct page *page)
 	release_pte_folio(page_folio(page));
 }
 
-static void release_pte_pages(pte_t *pte, pte_t *_pte,
-		struct list_head *compound_pagelist)
+static void release_pte_folios(pte_t *pte, pte_t *_pte)
 {
-	struct folio *folio, *tmp;
+	struct folio *folio;
 
 	while (--_pte >= pte) {
 		pte_t pteval = ptep_get(_pte);
@@ -514,12 +513,7 @@  static void release_pte_pages(pte_t *pte, pte_t *_pte,
 			continue;
 		folio = pfn_folio(pfn);
 		if (folio_test_large(folio))
-			continue;
-		release_pte_folio(folio);
-	}
-
-	list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) {
-		list_del(&folio->lru);
+			_pte -= folio_nr_pages(folio) - 1;
 		release_pte_folio(folio);
 	}
 }
@@ -538,8 +532,7 @@  static bool is_refcount_suitable(struct page *page)
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
-					struct collapse_control *cc,
-					struct list_head *compound_pagelist)
+					struct collapse_control *cc)
 {
 	struct folio *folio = NULL;
 	pte_t *_pte;
@@ -588,19 +581,6 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			}
 		}
 
-		if (folio_test_large(folio)) {
-			struct folio *f;
-
-			/*
-			 * Check if we have dealt with the compound page
-			 * already
-			 */
-			list_for_each_entry(f, compound_pagelist, lru) {
-				if (folio == f)
-					goto next;
-			}
-		}
-
 		/*
 		 * We can do it before isolate_lru_page because the
 		 * page can't be freed from under us. NOTE: PG_lock
@@ -644,9 +624,6 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
 		VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 
-		if (folio_test_large(folio))
-			list_add_tail(&folio->lru, compound_pagelist);
-next:
 		/*
 		 * If collapse was initiated by khugepaged, check that there is
 		 * enough young pte to justify collapsing the page
@@ -660,6 +637,10 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_write(pteval))
 			writable = true;
 
+		if (folio_test_large(folio)) {
+			_pte += folio_nr_pages(folio) - 1;
+			address += folio_size(folio) - PAGE_SIZE;
+		}
 	}
 
 	if (unlikely(!writable)) {
@@ -673,7 +654,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		return result;
 	}
 out:
-	release_pte_pages(pte, _pte, compound_pagelist);
+	release_pte_folios(pte, _pte);
 	trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
 					    referenced, writable, result);
 	return result;
@@ -682,11 +663,9 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 						struct vm_area_struct *vma,
 						unsigned long address,
-						spinlock_t *ptl,
-						struct list_head *compound_pagelist)
+						spinlock_t *ptl)
 {
 	struct page *src_page;
-	struct page *tmp;
 	pte_t *_pte;
 	pte_t pteval;
 
@@ -706,8 +685,7 @@  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 			}
 		} else {
 			src_page = pte_page(pteval);
-			if (!PageCompound(src_page))
-				release_pte_page(src_page);
+			release_pte_page(src_page);
 			/*
 			 * ptl mostly unnecessary, but preempt has to
 			 * be disabled to update the per-cpu stats
@@ -720,23 +698,12 @@  static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 			free_page_and_swap_cache(src_page);
 		}
 	}
-
-	list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
-		list_del(&src_page->lru);
-		mod_node_page_state(page_pgdat(src_page),
-				    NR_ISOLATED_ANON + page_is_file_lru(src_page),
-				    -compound_nr(src_page));
-		unlock_page(src_page);
-		free_swap_cache(src_page);
-		putback_lru_page(src_page);
-	}
 }
 
 static void __collapse_huge_page_copy_failed(pte_t *pte,
 					     pmd_t *pmd,
 					     pmd_t orig_pmd,
-					     struct vm_area_struct *vma,
-					     struct list_head *compound_pagelist)
+					     struct vm_area_struct *vma)
 {
 	spinlock_t *pmd_ptl;
 
@@ -753,7 +720,7 @@  static void __collapse_huge_page_copy_failed(pte_t *pte,
 	 * Release both raw and compound pages isolated
 	 * in __collapse_huge_page_isolate.
 	 */
-	release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+	release_pte_folios(pte, pte + HPAGE_PMD_NR);
 }
 
 /*
@@ -769,7 +736,6 @@  static void __collapse_huge_page_copy_failed(pte_t *pte,
  * @vma: the original raw pages' virtual memory area
  * @address: starting address to copy
  * @ptl: lock on raw pages' PTEs
- * @compound_pagelist: list that stores compound pages
  */
 static int __collapse_huge_page_copy(pte_t *pte,
 				     struct page *page,
@@ -777,8 +743,7 @@  static int __collapse_huge_page_copy(pte_t *pte,
 				     pmd_t orig_pmd,
 				     struct vm_area_struct *vma,
 				     unsigned long address,
-				     spinlock_t *ptl,
-				     struct list_head *compound_pagelist)
+				     spinlock_t *ptl)
 {
 	struct page *src_page;
 	pte_t *_pte;
@@ -804,11 +769,9 @@  static int __collapse_huge_page_copy(pte_t *pte,
 	}
 
 	if (likely(result == SCAN_SUCCEED))
-		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
-						    compound_pagelist);
+		__collapse_huge_page_copy_succeeded(pte, vma, address, ptl);
 	else
-		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
-						 compound_pagelist);
+		__collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma);
 
 	return result;
 }
@@ -1081,7 +1044,6 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 			      int referenced, int unmapped,
 			      struct collapse_control *cc)
 {
-	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
 	pte_t *pte;
 	pgtable_t pgtable;
@@ -1168,8 +1130,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 
 	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
 	if (pte) {
-		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      &compound_pagelist);
+		result = __collapse_huge_page_isolate(vma, address, pte, cc);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_PMD_NULL;
@@ -1198,8 +1159,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	anon_vma_unlock_write(vma->anon_vma);
 
 	result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
-					   vma, address, pte_ptl,
-					   &compound_pagelist);
+					   vma, address, pte_ptl);
 	pte_unmap(pte);
 	if (unlikely(result != SCAN_SUCCEED))
 		goto out_up_write;