Message ID | 20220706235936.2197195-8-zokeefe@google.com (mailing list archive)
---|---
State | New
Series | mm: userspace hugepage collapse
On Wed, Jul 6, 2022 at 5:06 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1].
>
> hugepage_vma_check() is the authority on determining if a VMA is eligible
> for THP allocation/collapse, and currently enforces the sysfs THP
> settings. Add a flag to disable these checks. For now, only apply this
> arg to anon and file, which use
> /sys/kernel/mm/transparent_hugepage/enabled. We can expand this to shmem,
> which uses /sys/kernel/mm/transparent_hugepage/shmem_enabled, later.
>
> Use this flag in collapse_pte_mapped_thp() where previously the VMA flags
> passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the
> VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check
> THP flag in hugepage_vma_check()", this check also didn't check "never"
> THP mode. As such, this restores the previous behavior of
> collapse_pte_mapped_thp() where sysfs THP settings are ignored. See
> comment in code for justification why this is OK.
>
> [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>

Reviewed-by: Yang Shi <shy828301@gmail.com>

> ---
> [snip diff; the full patch appears below]
On Jul 11 13:57, Yang Shi wrote:
> On Wed, Jul 6, 2022 at 5:06 PM Zach O'Keefe <zokeefe@google.com> wrote:
> >
> > MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1].
> >
> > [snip]
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
>
> Reviewed-by: Yang Shi <shy828301@gmail.com>

Thanks for the review!

Best,
Zach
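As context for the hugepage_vma_check() change in the patch below: it folds
the two former sysfs early-returns into a single condition gated by the new
enforce_sysfs argument. The following stand-alone C sketch (the helpers are
stubs standing in for the kernel's sysfs-backed predicates, not kernel API)
exhaustively checks that, when enforce_sysfs is true, the folded condition
rejects exactly the same cases as the old pair of checks:

#include <stdbool.h>
#include <stdio.h>

#define VM_HUGEPAGE 0x1UL	/* stand-in for the kernel's flag bit */

/* Stubs for the sysfs-backed predicates in mm/huge_memory.c. */
static bool enabled_v, always_v;
static bool hugepage_flags_enabled(void) { return enabled_v; }
static bool hugepage_flags_always(void) { return always_v; }

/* Pre-patch gate: two early returns, unconditionally enforced. */
static bool old_gate(unsigned long vm_flags)
{
	if (!hugepage_flags_enabled())
		return false;
	/* THP settings require madvise. */
	if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
		return false;
	return true;
}

/* Post-patch gate: one condition, skipped entirely if !enforce_sysfs. */
static bool new_gate(unsigned long vm_flags, bool enforce_sysfs)
{
	if (enforce_sysfs &&
	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
					   !hugepage_flags_always())))
		return false;
	return true;
}

int main(void)
{
	for (int e = 0; e < 2; e++)
		for (int a = 0; a < 2; a++)
			for (unsigned long f = 0; f < 2; f++) {
				enabled_v = e;
				always_v = a;
				if (old_gate(f) != new_gate(f, true)) {
					puts("mismatch");
					return 1;
				}
			}
	puts("equivalent when enforce_sysfs == true");
	return 0;
}

Note also that the old workaround in collapse_pte_mapped_thp() (passing
vma->vm_flags | VM_HUGEPAGE) only defeated the second, madvise-mode check;
with THP set to "never", the first early return still failed the VMA.
Passing enforce_sysfs=false skips both checks, which is the behavior the
commit message describes restoring.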
MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1].

hugepage_vma_check() is the authority on determining if a VMA is eligible
for THP allocation/collapse, and currently enforces the sysfs THP settings.
Add a flag to disable these checks. For now, only apply this arg to anon
and file, which use /sys/kernel/mm/transparent_hugepage/enabled. We can
expand this to shmem, which uses
/sys/kernel/mm/transparent_hugepage/shmem_enabled, later.

Use this flag in collapse_pte_mapped_thp() where previously the VMA flags
passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the
VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check
THP flag in hugepage_vma_check()", this check also didn't check "never"
THP mode. As such, this restores the previous behavior of
collapse_pte_mapped_thp() where sysfs THP settings are ignored. See
comment in code for justification why this is OK.

[1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 fs/proc/task_mmu.c      |  2 +-
 include/linux/huge_mm.h |  9 ++++-----
 mm/huge_memory.c        | 14 ++++++--------
 mm/khugepaged.c         | 25 ++++++++++++++-----------
 mm/memory.c             |  4 ++--
 5 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 34d292cec79a..f8cd58846a28 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -866,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible: %d\n",
-		   hugepage_vma_check(vma, vma->vm_flags, true, false));
+		   hugepage_vma_check(vma, vma->vm_flags, true, false, true));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 37f2f11a6d7e..00312fc251c1 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	       !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
-bool hugepage_vma_check(struct vm_area_struct *vma,
-			unsigned long vm_flags,
-			bool smaps, bool in_pf);
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+			bool smaps, bool in_pf, bool enforce_sysfs);
 
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
@@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
 }
 
 static inline bool hugepage_vma_check(struct vm_area_struct *vma,
-				      unsigned long vm_flags,
-				      bool smaps, bool in_pf)
+				      unsigned long vm_flags, bool smaps,
+				      bool in_pf, bool enforce_sysfs)
 {
 	return false;
 }
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da300ce9dedb..4fbe43dc1568 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -69,9 +69,8 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool hugepage_vma_check(struct vm_area_struct *vma,
-			unsigned long vm_flags,
-			bool smaps, bool in_pf)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+			bool smaps, bool in_pf, bool enforce_sysfs)
 {
 	if (!vma->vm_mm) /* vdso */
 		return false;
@@ -120,11 +119,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
 	if (!in_pf && shmem_file(vma->vm_file))
 		return shmem_huge_enabled(vma);
 
-	if (!hugepage_flags_enabled())
-		return false;
-
-	/* THP settings require madvise. */
-	if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+	/* Enforce sysfs THP requirements as necessary */
+	if (enforce_sysfs &&
+	    (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+					   !hugepage_flags_always())))
 		return false;
 
 	/* Only regular file is valid */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d89056d8cbad..b0e20db3f805 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -478,7 +478,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_flags_enabled()) {
-		if (hugepage_vma_check(vma, vm_flags, false, false))
+		if (hugepage_vma_check(vma, vm_flags, false, false, true))
 			__khugepaged_enter(vma->vm_mm);
 	}
 }
@@ -844,7 +844,8 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
  */
 
 static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
-				   struct vm_area_struct **vmap)
+				   struct vm_area_struct **vmap,
+				   struct collapse_control *cc)
 {
 	struct vm_area_struct *vma;
 
@@ -855,7 +856,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 	if (!vma)
 		return SCAN_VMA_NULL;
 
-	if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
+				cc->is_khugepaged))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -974,7 +976,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 		goto out_nolock;
 
 	mmap_read_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, &vma, cc);
 	if (result != SCAN_SUCCEED) {
 		mmap_read_unlock(mm);
 		goto out_nolock;
@@ -1006,7 +1008,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 * handled by the anon_vma lock + PG_lock.
 	 */
 	mmap_write_lock(mm);
-	result = hugepage_vma_revalidate(mm, address, &vma);
+	result = hugepage_vma_revalidate(mm, address, &vma, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_up_write;
 	/* check if the pmd is still valid */
@@ -1350,12 +1352,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 		return;
 
 	/*
-	 * This vm_flags may not have VM_HUGEPAGE if the page was not
-	 * collapsed by this mm. But we can still collapse if the page is
-	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
-	 * will not fail the vma for missing VM_HUGEPAGE
+	 * If we are here, we've succeeded in replacing all the native pages
+	 * in the page cache with a single hugepage. If a mm were to fault-in
+	 * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
+	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
+	 * analogously elide sysfs THP settings here.
 	 */
-	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
+	if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
 		return;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2042,7 +2045,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			progress++;
 			break;
 		}
-		if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
+		if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
 skip:
 			progress++;
 			continue;
diff --git a/mm/memory.c b/mm/memory.c
index 8917bea2f0bc..96cd776e84f1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5001,7 +5001,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 retry_pud:
 	if (pud_none(*vmf.pud) &&
-	    hugepage_vma_check(vma, vm_flags, false, true)) {
+	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5035,7 +5035,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		goto retry_pud;
 
 	if (pmd_none(*vmf.pmd) &&
-	    hugepage_vma_check(vma, vm_flags, false, true)) {
+	    hugepage_vma_check(vma, vm_flags, false, true, true)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
--
2.37.0.rc0.161.g10f37bed90-goog
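For illustration only (not part of this patch): once MADV_COLLAPSE from this
series is available, a process could request a collapse as sketched below,
independent of the "madvise"/"never" sysfs THP settings. The MADV_COLLAPSE
value is an assumption taken from the series' proposed uapi; treat the whole
program as a hypothetical sketch, not an interface this patch defines.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* assumed value from the series' uapi patch */
#endif

int main(void)
{
	size_t len = 2UL << 20;			/* one PMD-size region on x86-64 */
	void *buf = aligned_alloc(len, len);	/* PMD-aligned allocation */

	if (!buf)
		return 1;
	memset(buf, 1, len);			/* fault in the base pages first */

	/*
	 * With this patch, the collapse path behind MADV_COLLAPSE reaches
	 * hugepage_vma_check() with enforce_sysfs=false (via
	 * cc->is_khugepaged), so eligibility no longer depends on
	 * /sys/kernel/mm/transparent_hugepage/enabled. The call can still
	 * fail for unrelated reasons (memory pressure, unsuitable VMA),
	 * reported through the usual errno.
	 */
	if (madvise(buf, len, MADV_COLLAPSE))
		perror("madvise(MADV_COLLAPSE)");

	free(buf);
	return 0;
}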