[6/7] thp: Change CoW semantics for anon-THP

Message ID 20200327170601.18563-7-kirill.shutemov@linux.intel.com (mailing list archive)
State New, archived
Series: thp/khugepaged improvements and CoW semantics

Commit Message

Kirill A. Shutemov March 27, 2020, 5:06 p.m. UTC
Currently we have different copy-on-write semantics for anon- and
file-THP. For anon-THP we try to allocate a huge page on the write
fault; for file-THP we split the PMD and allocate a 4k page.

Arguably, the file-THP semantics is more desirable: we don't
necessarily want to unshare the full PMD range from the parent on the
first access. This is the primary reason THP is unusable for some
workloads, like Redis.

The original THP refcounting didn't allow PTE-mapped compound pages,
so we had no option but to allocate a huge page on CoW (with fallback
to 512 4k pages).

The current refcounting doesn't have such limitations and we can cut a
lot of complex code out of the fault path.

khugepaged is now able to recover THP from such ranges if the
configuration allows.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 mm/huge_memory.c | 247 +++++------------------------------------------
 1 file changed, 24 insertions(+), 223 deletions(-)
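
A rough illustration of the semantics being changed, assuming a 2M PMD
size and MADV_HUGEPAGE support (the sketch is not part of the patch):
the program below maps a PMD-sized anonymous region, faults it in,
forks, and has the child write a single byte. Before the patch the
child's write fault tries to allocate and copy a whole new huge page;
with the patch the PMD is split and only one 4k page is copied, leaving
khugepaged to re-collapse the range later if allowed.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define PMD_SZ	(2UL << 20)	/* assumes 2M PMD size (x86-64) */

int main(void)
{
	char *p = mmap(NULL, PMD_SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	madvise(p, PMD_SZ, MADV_HUGEPAGE);	/* hint: back this with a THP */
	memset(p, 1, PMD_SZ);			/* fault the range in */

	if (fork() == 0) {
		/*
		 * CoW write fault on the shared huge PMD.
		 * Old behaviour: allocate a fresh 2M page and copy 2M.
		 * New behaviour: split the PMD and copy a single 4k page.
		 */
		p[0] = 2;
		/* Compare AnonHugePages in /proc/self/smaps before/after. */
		_exit(0);
	}
	wait(NULL);
	return 0;
}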

Comments

Yang Shi March 27, 2020, 8:07 p.m. UTC | #1
On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
<kirill@shutemov.name> wrote:
>
> Currently we have different copy-on-write semantics for anon- and
> file-THP. For anon-THP we try to allocate huge page on the write fault,
> but on file-THP we split PMD and allocate 4k page.

I agree this seems confusing.

>
> Arguably, file-THP semantics is more desirable: we don't necessary want
> to unshare full PMD range from the parent on the first access. This is
> the primary reason THP is unusable for some workloads, like Redis.
>
> The original THP refcounting didn't allow to have PTE-mapped compound
> pages, so we had no options, but to allocate huge page on CoW (with
> fallback to 512 4k pages).
>
> The current refcounting doesn't have such limitations and we can cut a
> lot of complex code out of fault path.
>
> khugepaged is now able to recover THP from such ranges if the
> configuration allows.

It looks like this patch just splits the PMD and falls back to
handling the fault at the PTE level, which definitely simplifies the
code a lot. However, it makes the use of THP depend on the
productivity of khugepaged, and the success rate of THP allocation in
khugepaged depends on defrag. By default khugepaged runs at very low
priority, so khugepaged defrag can easily result in priority
inversion.

For example, we saw a THP split in the reclaim path triggered by
khugepaged holding the anon_vma write lock, while another process that
was also doing reclaim was blocked on that anon_vma lock. The
cond_resched() in the rmap walk then lets khugepaged be preempted
easily by all other processes, so khugepaged may hold the anon_vma
lock for an indefinite time. Then we saw hung tasks.

So we have khugepaged defrag disabled for some workloads to work
around the priority inversion problem. Hence I'm concerned that some
processes may never get THP for some of our workloads with this patch
applied.

The priority inversion caused by khugepaged was not very common. Just
my two cents.
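
For reference, khugepaged's defrag behaviour is controlled via
/sys/kernel/mm/transparent_hugepage/khugepaged/defrag (1 allows
khugepaged to enter direct reclaim/compaction, 0 disables that); a
minimal sketch of turning it off, which is roughly the workaround
described above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* 0: khugepaged may not enter direct reclaim/compaction. */
	const char *knob =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/defrag";
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "0", 1) != 1)
		perror("write");
	close(fd);
	return 0;
}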

>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> ---
>  mm/huge_memory.c | 247 +++++------------------------------------------
>  1 file changed, 24 insertions(+), 223 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index ef6a6bcb291f..15b7a9c86b7c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
>         spin_unlock(vmf->ptl);
>  }
>
> -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> -                       pmd_t orig_pmd, struct page *page)
> -{
> -       struct vm_area_struct *vma = vmf->vma;
> -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -       struct mem_cgroup *memcg;
> -       pgtable_t pgtable;
> -       pmd_t _pmd;
> -       int i;
> -       vm_fault_t ret = 0;
> -       struct page **pages;
> -       struct mmu_notifier_range range;
> -
> -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> -                             GFP_KERNEL);
> -       if (unlikely(!pages)) {
> -               ret |= VM_FAULT_OOM;
> -               goto out;
> -       }
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> -                                              vmf->address, page_to_nid(page));
> -               if (unlikely(!pages[i] ||
> -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> -                                    GFP_KERNEL, &memcg, false))) {
> -                       if (pages[i])
> -                               put_page(pages[i]);
> -                       while (--i >= 0) {
> -                               memcg = (void *)page_private(pages[i]);
> -                               set_page_private(pages[i], 0);
> -                               mem_cgroup_cancel_charge(pages[i], memcg,
> -                                               false);
> -                               put_page(pages[i]);
> -                       }
> -                       kfree(pages);
> -                       ret |= VM_FAULT_OOM;
> -                       goto out;
> -               }
> -               set_page_private(pages[i], (unsigned long)memcg);
> -       }
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               copy_user_highpage(pages[i], page + i,
> -                                  haddr + PAGE_SIZE * i, vma);
> -               __SetPageUptodate(pages[i]);
> -               cond_resched();
> -       }
> -
> -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -                               haddr, haddr + HPAGE_PMD_SIZE);
> -       mmu_notifier_invalidate_range_start(&range);
> -
> -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -               goto out_free_pages;
> -       VM_BUG_ON_PAGE(!PageHead(page), page);
> -
> -       /*
> -        * Leave pmd empty until pte is filled note we must notify here as
> -        * concurrent CPU thread might write to new page before the call to
> -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> -        * device seeing memory write in different order than CPU.
> -        *
> -        * See Documentation/vm/mmu_notifier.rst
> -        */
> -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -
> -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> -
> -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> -               pte_t entry;
> -               entry = mk_pte(pages[i], vma->vm_page_prot);
> -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -               memcg = (void *)page_private(pages[i]);
> -               set_page_private(pages[i], 0);
> -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> -               lru_cache_add_active_or_unevictable(pages[i], vma);
> -               vmf->pte = pte_offset_map(&_pmd, haddr);
> -               VM_BUG_ON(!pte_none(*vmf->pte));
> -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> -               pte_unmap(vmf->pte);
> -       }
> -       kfree(pages);
> -
> -       smp_wmb(); /* make pte visible before pmd */
> -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> -       page_remove_rmap(page, true);
> -       spin_unlock(vmf->ptl);
> -
> -       /*
> -        * No need to double call mmu_notifier->invalidate_range() callback as
> -        * the above pmdp_huge_clear_flush_notify() did already call it.
> -        */
> -       mmu_notifier_invalidate_range_only_end(&range);
> -
> -       ret |= VM_FAULT_WRITE;
> -       put_page(page);
> -
> -out:
> -       return ret;
> -
> -out_free_pages:
> -       spin_unlock(vmf->ptl);
> -       mmu_notifier_invalidate_range_end(&range);
> -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> -               memcg = (void *)page_private(pages[i]);
> -               set_page_private(pages[i], 0);
> -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> -               put_page(pages[i]);
> -       }
> -       kfree(pages);
> -       goto out;
> -}
> -
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
>  {
>         struct vm_area_struct *vma = vmf->vma;
> -       struct page *page = NULL, *new_page;
> -       struct mem_cgroup *memcg;
> +       struct page *page;
>         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -       struct mmu_notifier_range range;
> -       gfp_t huge_gfp;                 /* for allocation and charge */
> -       vm_fault_t ret = 0;
>
>         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> +
>         if (is_huge_zero_pmd(orig_pmd))
> -               goto alloc;
> +               goto fallback;
> +
>         spin_lock(vmf->ptl);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> -               goto out_unlock;
> +
> +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +               spin_unlock(vmf->ptl);
> +               return 0;
> +       }
>
>         page = pmd_page(orig_pmd);
>         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> -       /*
> -        * We can only reuse the page if nobody else maps the huge page or it's
> -        * part.
> -        */
> +
> +       /* Lock page for reuse_swap_page() */
>         if (!trylock_page(page)) {
>                 get_page(page);
>                 spin_unlock(vmf->ptl);
>                 lock_page(page);
>                 spin_lock(vmf->ptl);
>                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> +                       spin_unlock(vmf->ptl);
>                         unlock_page(page);
>                         put_page(page);
> -                       goto out_unlock;
> +                       return 0;
>                 }
>                 put_page(page);
>         }
> +
> +       /*
> +        * We can only reuse the page if nobody else maps the huge page or it's
> +        * part.
> +        */
>         if (reuse_swap_page(page, NULL)) {
>                 pmd_t entry;
>                 entry = pmd_mkyoung(orig_pmd);
>                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
>                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -               ret |= VM_FAULT_WRITE;
>                 unlock_page(page);
> -               goto out_unlock;
> -       }
> -       unlock_page(page);
> -       get_page(page);
> -       spin_unlock(vmf->ptl);
> -alloc:
> -       if (__transparent_hugepage_enabled(vma) &&
> -           !transparent_hugepage_debug_cow()) {
> -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> -       } else
> -               new_page = NULL;
> -
> -       if (likely(new_page)) {
> -               prep_transhuge_page(new_page);
> -       } else {
> -               if (!page) {
> -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> -                       ret |= VM_FAULT_FALLBACK;
> -               } else {
> -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> -                       if (ret & VM_FAULT_OOM) {
> -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> -                               ret |= VM_FAULT_FALLBACK;
> -                       }
> -                       put_page(page);
> -               }
> -               count_vm_event(THP_FAULT_FALLBACK);
> -               goto out;
> -       }
> -
> -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> -                                       huge_gfp, &memcg, true))) {
> -               put_page(new_page);
> -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> -               if (page)
> -                       put_page(page);
> -               ret |= VM_FAULT_FALLBACK;
> -               count_vm_event(THP_FAULT_FALLBACK);
> -               goto out;
> -       }
> -
> -       count_vm_event(THP_FAULT_ALLOC);
> -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> -
> -       if (!page)
> -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> -       else
> -               copy_user_huge_page(new_page, page, vmf->address,
> -                                   vma, HPAGE_PMD_NR);
> -       __SetPageUptodate(new_page);
> -
> -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> -                               haddr, haddr + HPAGE_PMD_SIZE);
> -       mmu_notifier_invalidate_range_start(&range);
> -
> -       spin_lock(vmf->ptl);
> -       if (page)
> -               put_page(page);
> -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
>                 spin_unlock(vmf->ptl);
> -               mem_cgroup_cancel_charge(new_page, memcg, true);
> -               put_page(new_page);
> -               goto out_mn;
> -       } else {
> -               pmd_t entry;
> -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> -               lru_cache_add_active_or_unevictable(new_page, vma);
> -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -               if (!page) {
> -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -               } else {
> -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> -                       page_remove_rmap(page, true);
> -                       put_page(page);
> -               }
> -               ret |= VM_FAULT_WRITE;
> +               return VM_FAULT_WRITE;
>         }
> +
> +       unlock_page(page);
>         spin_unlock(vmf->ptl);
> -out_mn:
> -       /*
> -        * No need to double call mmu_notifier->invalidate_range() callback as
> -        * the above pmdp_huge_clear_flush_notify() did already call it.
> -        */
> -       mmu_notifier_invalidate_range_only_end(&range);
> -out:
> -       return ret;
> -out_unlock:
> -       spin_unlock(vmf->ptl);
> -       return ret;
> +fallback:
> +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> +       return VM_FAULT_FALLBACK;
>  }
>
>  /*
> --
> 2.26.0
>
>
Kirill A. Shutemov March 28, 2020, 12:43 a.m. UTC | #2
On Fri, Mar 27, 2020 at 01:07:34PM -0700, Yang Shi wrote:
> On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> <kirill@shutemov.name> wrote:
> >
> > Currently we have different copy-on-write semantics for anon- and
> > file-THP. For anon-THP we try to allocate huge page on the write fault,
> > but on file-THP we split PMD and allocate 4k page.
> 
> I agree this seems confusing.
> 
> >
> > Arguably, file-THP semantics is more desirable: we don't necessary want
> > to unshare full PMD range from the parent on the first access. This is
> > the primary reason THP is unusable for some workloads, like Redis.
> >
> > The original THP refcounting didn't allow to have PTE-mapped compound
> > pages, so we had no options, but to allocate huge page on CoW (with
> > fallback to 512 4k pages).
> >
> > The current refcounting doesn't have such limitations and we can cut a
> > lot of complex code out of fault path.
> >
> > khugepaged is now able to recover THP from such ranges if the
> > configuration allows.
> 
> It looks this patch would just split the PMD then fallback to handle
> it on PTE level, it definitely simplify the code a lot. However it
> makes the use of THP depend on the productivity of khugepaged. And the
> success rate of THP allocation in khugepaged depends on defrag. But by
> default khugepaged runs at very low priority, so khugepaged defrag may
> result in priority inversion easily.

If you have a workload that may be hurt by such a change, please get
some numbers. It would be interesting to see them.
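
One simple way to get such numbers is to snapshot the THP counters in
/proc/vmstat before and after running the workload; the events this
patch touches show up there as thp_fault_alloc and thp_fault_fallback.
A rough sketch (the selection of counters to watch is up to the
reader):

#include <stdio.h>
#include <string.h>

/* Print the THP-related counters from /proc/vmstat. */
int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* thp_fault_alloc, thp_fault_fallback, thp_collapse_alloc, ... */
		if (!strncmp(line, "thp_", 4))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}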

> For example we saw THP split in reclaim path triggered by khugepaged
> held write anon_vma lock, but the other process which was doing
> reclaim too was blocked by anon_vma lock. Then the cond_resched() in
> rmap walk would make khugepaged preempted by all other processes
> easily so khugepaged may hold the anon_vma lock for indefinite time.
> Then we saw hung tasks.

Any chance you have a reproducer? I'm not sure I follow the problem, but
it's almost 4AM, so I'm slow.

> So we have khugepaged defrag disabled for some workloads to workaround
> the priority inversion problem. So, I'm concerned some processes may
> never get THP for some our workloads with this patch applied.
> 
> The priority inversion caused by khugepaged was not very usual. Just
> my two cents.
> 
> >
> > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > ---
> >  mm/huge_memory.c | 247 +++++------------------------------------------
> >  1 file changed, 24 insertions(+), 223 deletions(-)
> >
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index ef6a6bcb291f..15b7a9c86b7c 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
> >         spin_unlock(vmf->ptl);
> >  }
> >
> > -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> > -                       pmd_t orig_pmd, struct page *page)
> > -{
> > -       struct vm_area_struct *vma = vmf->vma;
> > -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > -       struct mem_cgroup *memcg;
> > -       pgtable_t pgtable;
> > -       pmd_t _pmd;
> > -       int i;
> > -       vm_fault_t ret = 0;
> > -       struct page **pages;
> > -       struct mmu_notifier_range range;
> > -
> > -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> > -                             GFP_KERNEL);
> > -       if (unlikely(!pages)) {
> > -               ret |= VM_FAULT_OOM;
> > -               goto out;
> > -       }
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> > -                                              vmf->address, page_to_nid(page));
> > -               if (unlikely(!pages[i] ||
> > -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> > -                                    GFP_KERNEL, &memcg, false))) {
> > -                       if (pages[i])
> > -                               put_page(pages[i]);
> > -                       while (--i >= 0) {
> > -                               memcg = (void *)page_private(pages[i]);
> > -                               set_page_private(pages[i], 0);
> > -                               mem_cgroup_cancel_charge(pages[i], memcg,
> > -                                               false);
> > -                               put_page(pages[i]);
> > -                       }
> > -                       kfree(pages);
> > -                       ret |= VM_FAULT_OOM;
> > -                       goto out;
> > -               }
> > -               set_page_private(pages[i], (unsigned long)memcg);
> > -       }
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               copy_user_highpage(pages[i], page + i,
> > -                                  haddr + PAGE_SIZE * i, vma);
> > -               __SetPageUptodate(pages[i]);
> > -               cond_resched();
> > -       }
> > -
> > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > -       mmu_notifier_invalidate_range_start(&range);
> > -
> > -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > -               goto out_free_pages;
> > -       VM_BUG_ON_PAGE(!PageHead(page), page);
> > -
> > -       /*
> > -        * Leave pmd empty until pte is filled note we must notify here as
> > -        * concurrent CPU thread might write to new page before the call to
> > -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> > -        * device seeing memory write in different order than CPU.
> > -        *
> > -        * See Documentation/vm/mmu_notifier.rst
> > -        */
> > -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > -
> > -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> > -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> > -
> > -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > -               pte_t entry;
> > -               entry = mk_pte(pages[i], vma->vm_page_prot);
> > -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > -               memcg = (void *)page_private(pages[i]);
> > -               set_page_private(pages[i], 0);
> > -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> > -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> > -               lru_cache_add_active_or_unevictable(pages[i], vma);
> > -               vmf->pte = pte_offset_map(&_pmd, haddr);
> > -               VM_BUG_ON(!pte_none(*vmf->pte));
> > -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> > -               pte_unmap(vmf->pte);
> > -       }
> > -       kfree(pages);
> > -
> > -       smp_wmb(); /* make pte visible before pmd */
> > -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> > -       page_remove_rmap(page, true);
> > -       spin_unlock(vmf->ptl);
> > -
> > -       /*
> > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > -        */
> > -       mmu_notifier_invalidate_range_only_end(&range);
> > -
> > -       ret |= VM_FAULT_WRITE;
> > -       put_page(page);
> > -
> > -out:
> > -       return ret;
> > -
> > -out_free_pages:
> > -       spin_unlock(vmf->ptl);
> > -       mmu_notifier_invalidate_range_end(&range);
> > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > -               memcg = (void *)page_private(pages[i]);
> > -               set_page_private(pages[i], 0);
> > -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> > -               put_page(pages[i]);
> > -       }
> > -       kfree(pages);
> > -       goto out;
> > -}
> > -
> >  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
> >  {
> >         struct vm_area_struct *vma = vmf->vma;
> > -       struct page *page = NULL, *new_page;
> > -       struct mem_cgroup *memcg;
> > +       struct page *page;
> >         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > -       struct mmu_notifier_range range;
> > -       gfp_t huge_gfp;                 /* for allocation and charge */
> > -       vm_fault_t ret = 0;
> >
> >         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
> >         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> > +
> >         if (is_huge_zero_pmd(orig_pmd))
> > -               goto alloc;
> > +               goto fallback;
> > +
> >         spin_lock(vmf->ptl);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > -               goto out_unlock;
> > +
> > +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > +               spin_unlock(vmf->ptl);
> > +               return 0;
> > +       }
> >
> >         page = pmd_page(orig_pmd);
> >         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> > -       /*
> > -        * We can only reuse the page if nobody else maps the huge page or it's
> > -        * part.
> > -        */
> > +
> > +       /* Lock page for reuse_swap_page() */
> >         if (!trylock_page(page)) {
> >                 get_page(page);
> >                 spin_unlock(vmf->ptl);
> >                 lock_page(page);
> >                 spin_lock(vmf->ptl);
> >                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > +                       spin_unlock(vmf->ptl);
> >                         unlock_page(page);
> >                         put_page(page);
> > -                       goto out_unlock;
> > +                       return 0;
> >                 }
> >                 put_page(page);
> >         }
> > +
> > +       /*
> > +        * We can only reuse the page if nobody else maps the huge page or it's
> > +        * part.
> > +        */
> >         if (reuse_swap_page(page, NULL)) {
> >                 pmd_t entry;
> >                 entry = pmd_mkyoung(orig_pmd);
> >                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> >                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
> >                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > -               ret |= VM_FAULT_WRITE;
> >                 unlock_page(page);
> > -               goto out_unlock;
> > -       }
> > -       unlock_page(page);
> > -       get_page(page);
> > -       spin_unlock(vmf->ptl);
> > -alloc:
> > -       if (__transparent_hugepage_enabled(vma) &&
> > -           !transparent_hugepage_debug_cow()) {
> > -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> > -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> > -       } else
> > -               new_page = NULL;
> > -
> > -       if (likely(new_page)) {
> > -               prep_transhuge_page(new_page);
> > -       } else {
> > -               if (!page) {
> > -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -                       ret |= VM_FAULT_FALLBACK;
> > -               } else {
> > -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> > -                       if (ret & VM_FAULT_OOM) {
> > -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -                               ret |= VM_FAULT_FALLBACK;
> > -                       }
> > -                       put_page(page);
> > -               }
> > -               count_vm_event(THP_FAULT_FALLBACK);
> > -               goto out;
> > -       }
> > -
> > -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> > -                                       huge_gfp, &memcg, true))) {
> > -               put_page(new_page);
> > -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > -               if (page)
> > -                       put_page(page);
> > -               ret |= VM_FAULT_FALLBACK;
> > -               count_vm_event(THP_FAULT_FALLBACK);
> > -               goto out;
> > -       }
> > -
> > -       count_vm_event(THP_FAULT_ALLOC);
> > -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> > -
> > -       if (!page)
> > -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> > -       else
> > -               copy_user_huge_page(new_page, page, vmf->address,
> > -                                   vma, HPAGE_PMD_NR);
> > -       __SetPageUptodate(new_page);
> > -
> > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > -       mmu_notifier_invalidate_range_start(&range);
> > -
> > -       spin_lock(vmf->ptl);
> > -       if (page)
> > -               put_page(page);
> > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> >                 spin_unlock(vmf->ptl);
> > -               mem_cgroup_cancel_charge(new_page, memcg, true);
> > -               put_page(new_page);
> > -               goto out_mn;
> > -       } else {
> > -               pmd_t entry;
> > -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> > -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> > -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> > -               lru_cache_add_active_or_unevictable(new_page, vma);
> > -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> > -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > -               if (!page) {
> > -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > -               } else {
> > -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> > -                       page_remove_rmap(page, true);
> > -                       put_page(page);
> > -               }
> > -               ret |= VM_FAULT_WRITE;
> > +               return VM_FAULT_WRITE;
> >         }
> > +
> > +       unlock_page(page);
> >         spin_unlock(vmf->ptl);
> > -out_mn:
> > -       /*
> > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > -        */
> > -       mmu_notifier_invalidate_range_only_end(&range);
> > -out:
> > -       return ret;
> > -out_unlock:
> > -       spin_unlock(vmf->ptl);
> > -       return ret;
> > +fallback:
> > +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> > +       return VM_FAULT_FALLBACK;
> >  }
> >
> >  /*
> > --
> > 2.26.0
> >
> >
Yang Shi March 28, 2020, 1:30 a.m. UTC | #3
On Fri, Mar 27, 2020 at 5:43 PM Kirill A. Shutemov <kirill@shutemov.name> wrote:
>
> On Fri, Mar 27, 2020 at 01:07:34PM -0700, Yang Shi wrote:
> > On Fri, Mar 27, 2020 at 10:06 AM Kirill A. Shutemov
> > <kirill@shutemov.name> wrote:
> > >
> > > Currently we have different copy-on-write semantics for anon- and
> > > file-THP. For anon-THP we try to allocate huge page on the write fault,
> > > but on file-THP we split PMD and allocate 4k page.
> >
> > I agree this seems confusing.
> >
> > >
> > > Arguably, file-THP semantics is more desirable: we don't necessary want
> > > to unshare full PMD range from the parent on the first access. This is
> > > the primary reason THP is unusable for some workloads, like Redis.
> > >
> > > The original THP refcounting didn't allow to have PTE-mapped compound
> > > pages, so we had no options, but to allocate huge page on CoW (with
> > > fallback to 512 4k pages).
> > >
> > > The current refcounting doesn't have such limitations and we can cut a
> > > lot of complex code out of fault path.
> > >
> > > khugepaged is now able to recover THP from such ranges if the
> > > configuration allows.
> >
> > It looks this patch would just split the PMD then fallback to handle
> > it on PTE level, it definitely simplify the code a lot. However it
> > makes the use of THP depend on the productivity of khugepaged. And the
> > success rate of THP allocation in khugepaged depends on defrag. But by
> > default khugepaged runs at very low priority, so khugepaged defrag may
> > result in priority inversion easily.
>
> If you have a workload that may be hurt by such change, please get some
> numbers. It would be interesting to see.

Please see the splat below:

[ 7417.871422] INFO: task /home/staragent:9088 blocked for more than
1200 seconds.
[ 7417.880501]       Tainted: G        W         4.19.91 #1
[ 7417.889015] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
disables this message.
[ 7417.897373] /home/staragent D    0  9088      1 0x00000000
[ 7417.905017] Call Trace:
[ 7417.907898]  ? __schedule+0x3cf/0x660
[ 7417.911988]  ? core_sys_select+0x1d2/0x270
[ 7417.916562]  schedule+0x33/0xc0
[ 7417.920205]  rwsem_down_read_failed+0x130/0x190
[ 7417.925153]  ? call_rwsem_down_read_failed+0x14/0x30
[ 7417.930648]  call_rwsem_down_read_failed+0x14/0x30
[ 7417.935894]  down_read+0x1c/0x30
[ 7417.939603]  __do_page_fault+0x435/0x4d0
[ 7417.943984]  do_page_fault+0x32/0x110
[ 7417.949315]  ? page_fault+0x8/0x30
[ 7417.953189]  page_fault+0x1e/0x30
[ 7417.956974] RIP: 0033:0x8063b0
[ 7417.960582] Code: Bad RIP value.

*** the task holding current->mm->mmap_sem: 0xffff88a65ed02f80
crash> bt ffff88a65ed02f80
PID: 102341  TASK: ffff88a65ed02f80  CPU: 64  COMMAND: "/home/staragent"
 #0 [ffffc90021bd73e8] __schedule at ffffffff818436af
 #1 [ffffc90021bd7480] schedule at ffffffff81843973
 #2 [ffffc90021bd7498] rwsem_down_read_failed at ffffffff818472b0
 #3 [ffffc90021bd7540] call_rwsem_down_read_failed at ffffffff818393d4
 #4 [ffffc90021bd7588] down_read at ffffffff8184662c
 #5 [ffffc90021bd7598] page_lock_anon_vma_read at ffffffff81242d9c
 #6 [ffffc90021bd75c0] rmap_walk_anon at ffffffff8124154f
 #7 [ffffc90021bd7620] page_referenced at ffffffff8124350e
 #8 [ffffc90021bd7690] shrink_page_list at ffffffff8120b2f4
 #9 [ffffc90021bd7760] shrink_inactive_list at ffffffff8120c4c6
#10 [ffffc90021bd7808] shrink_node_memcg at ffffffff8120cffa
#11 [ffffc90021bd7910] shrink_node at ffffffff8120d534
#12 [ffffc90021bd79a8] do_try_to_free_pages at ffffffff8120dac4
#13 [ffffc90021bd7a10] try_to_free_pages at ffffffff8120dec4
#14 [ffffc90021bd7aa8] __alloc_pages_slowpath at ffffffff811fa367
#15 [ffffc90021bd7bd0] __alloc_pages_nodemask at ffffffff811fb116
#16 [ffffc90021bd7c30] __do_page_cache_readahead at ffffffff812013c4
#17 [ffffc90021bd7cc8] filemap_fault at ffffffff811ef652
#18 [ffffc90021bd7d98] ext4_filemap_fault at ffffffffc030c48c [ext4]
#19 [ffffc90021bd7db0] __do_fault at ffffffff81234cf0
#20 [ffffc90021bd7dd0] __handle_mm_fault at ffffffff812337d8
#21 [ffffc90021bd7e80] handle_mm_fault at ffffffff81233adc
#22 [ffffc90021bd7eb0] __do_page_fault at ffffffff8106f084
#23 [ffffc90021bd7f20] do_page_fault at ffffffff8106f342
#24 [ffffc90021bd7f50] page_fault at ffffffff81a0112e
    RIP: 00007f67de084e72  RSP: 00007f67677fddb8  RFLAGS: 00010206
    RAX: 00007f67de084e72  RBX: 0000000002802310  RCX: 0000000000080000
    RDX: 00007f67680008c0  RSI: 0000000000080000  RDI: 00007f67680008c0
    RBP: 00007f67677fddf0   R8: 0000000000000000   R9: 00007f6768086460
    R10: 00007f67677fdde0  R11: 00000000009b3ca8  R12: 0000000002802120
    R13: 0000000002802768  R14: 0000000000000003  R15: 00007f67677fe700
    ORIG_RAX: ffffffffffffffff  CS: 0033  SS: 002b

*** staragent held mmap_sem, was doing memory reclaim, and was trying
to acquire the anon_vma lock.

*** the anon_vma->root->rwsem is held by khugepaged
crash> struct task_struct 0xffff88fe6f5dc740
comm = "khugepaged\000\000\000\000\000"
crash> bt 0xffff88fe6f5dc740
PID: 504    TASK: ffff88fe6f5dc740  CPU: 34  COMMAND: "khugepaged"
 #0 [ffffc9000da2f630] __schedule at ffffffff818436af
 #1 [ffffc9000da2f6c8] _cond_resched at ffffffff81843c0d
 #2 [ffffc9000da2f6d8] rmap_walk_anon at ffffffff81241465
 #3 [ffffc9000da2f738] remove_migration_ptes at ffffffff8127129d
 #4 [ffffc9000da2f770] remap_page at ffffffff8127504f
 #5 [ffffc9000da2f788] split_huge_page_to_list at ffffffff8127a82a
 #6 [ffffc9000da2f810] shrink_page_list at ffffffff8120b3fc
 #7 [ffffc9000da2f8e0] shrink_inactive_list at ffffffff8120c4c6
 #8 [ffffc9000da2f988] shrink_node_memcg at ffffffff8120cffa
 #9 [ffffc9000da2fa90] shrink_node at ffffffff8120d534
#10 [ffffc9000da2fb28] do_try_to_free_pages at ffffffff8120dac4
#11 [ffffc9000da2fb90] try_to_free_pages at ffffffff8120dec4
#12 [ffffc9000da2fc28] __alloc_pages_slowpath at ffffffff811fa367
#13 [ffffc9000da2fd50] __alloc_pages_nodemask at ffffffff811fb116
#14 [ffffc9000da2fdb0] khugepaged at ffffffff8127e478
#15 [ffffc9000da2ff10] kthread at ffffffff810bcb75
#16 [ffffc9000da2ff50] ret_from_fork at ffffffff81a001cf

The system was overloaded and experiencing memory pressure. Releasing
mmap_sem before doing any blocking operations could mitigate the
problem, but the underlying priority inversion caused by khugepaged
still exists.

> > For example we saw THP split in reclaim path triggered by khugepaged
> > held write anon_vma lock, but the other process which was doing
> > reclaim too was blocked by anon_vma lock. Then the cond_resched() in
> > rmap walk would make khugepaged preempted by all other processes
> > easily so khugepaged may hold the anon_vma lock for indefinite time.
> > Then we saw hung tasks.
>
> Any chance you have a reproducer? I'm not sure I follow the problem, but
> it's almost 4AM, so I'm slow.

I don't have a stable reproducer.

>
> > So we have khugepaged defrag disabled for some workloads to workaround
> > the priority inversion problem. So, I'm concerned some processes may
> > never get THP for some our workloads with this patch applied.
> >
> > The priority inversion caused by khugepaged was not very usual. Just
> > my two cents.
> >
> > >
> > > Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> > > ---
> > >  mm/huge_memory.c | 247 +++++------------------------------------------
> > >  1 file changed, 24 insertions(+), 223 deletions(-)
> > >
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index ef6a6bcb291f..15b7a9c86b7c 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -1206,262 +1206,63 @@ void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
> > >         spin_unlock(vmf->ptl);
> > >  }
> > >
> > > -static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
> > > -                       pmd_t orig_pmd, struct page *page)
> > > -{
> > > -       struct vm_area_struct *vma = vmf->vma;
> > > -       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > > -       struct mem_cgroup *memcg;
> > > -       pgtable_t pgtable;
> > > -       pmd_t _pmd;
> > > -       int i;
> > > -       vm_fault_t ret = 0;
> > > -       struct page **pages;
> > > -       struct mmu_notifier_range range;
> > > -
> > > -       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
> > > -                             GFP_KERNEL);
> > > -       if (unlikely(!pages)) {
> > > -               ret |= VM_FAULT_OOM;
> > > -               goto out;
> > > -       }
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
> > > -                                              vmf->address, page_to_nid(page));
> > > -               if (unlikely(!pages[i] ||
> > > -                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
> > > -                                    GFP_KERNEL, &memcg, false))) {
> > > -                       if (pages[i])
> > > -                               put_page(pages[i]);
> > > -                       while (--i >= 0) {
> > > -                               memcg = (void *)page_private(pages[i]);
> > > -                               set_page_private(pages[i], 0);
> > > -                               mem_cgroup_cancel_charge(pages[i], memcg,
> > > -                                               false);
> > > -                               put_page(pages[i]);
> > > -                       }
> > > -                       kfree(pages);
> > > -                       ret |= VM_FAULT_OOM;
> > > -                       goto out;
> > > -               }
> > > -               set_page_private(pages[i], (unsigned long)memcg);
> > > -       }
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               copy_user_highpage(pages[i], page + i,
> > > -                                  haddr + PAGE_SIZE * i, vma);
> > > -               __SetPageUptodate(pages[i]);
> > > -               cond_resched();
> > > -       }
> > > -
> > > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > > -       mmu_notifier_invalidate_range_start(&range);
> > > -
> > > -       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > > -               goto out_free_pages;
> > > -       VM_BUG_ON_PAGE(!PageHead(page), page);
> > > -
> > > -       /*
> > > -        * Leave pmd empty until pte is filled note we must notify here as
> > > -        * concurrent CPU thread might write to new page before the call to
> > > -        * mmu_notifier_invalidate_range_end() happens which can lead to a
> > > -        * device seeing memory write in different order than CPU.
> > > -        *
> > > -        * See Documentation/vm/mmu_notifier.rst
> > > -        */
> > > -       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > > -
> > > -       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
> > > -       pmd_populate(vma->vm_mm, &_pmd, pgtable);
> > > -
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> > > -               pte_t entry;
> > > -               entry = mk_pte(pages[i], vma->vm_page_prot);
> > > -               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> > > -               memcg = (void *)page_private(pages[i]);
> > > -               set_page_private(pages[i], 0);
> > > -               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
> > > -               mem_cgroup_commit_charge(pages[i], memcg, false, false);
> > > -               lru_cache_add_active_or_unevictable(pages[i], vma);
> > > -               vmf->pte = pte_offset_map(&_pmd, haddr);
> > > -               VM_BUG_ON(!pte_none(*vmf->pte));
> > > -               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
> > > -               pte_unmap(vmf->pte);
> > > -       }
> > > -       kfree(pages);
> > > -
> > > -       smp_wmb(); /* make pte visible before pmd */
> > > -       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
> > > -       page_remove_rmap(page, true);
> > > -       spin_unlock(vmf->ptl);
> > > -
> > > -       /*
> > > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > > -        */
> > > -       mmu_notifier_invalidate_range_only_end(&range);
> > > -
> > > -       ret |= VM_FAULT_WRITE;
> > > -       put_page(page);
> > > -
> > > -out:
> > > -       return ret;
> > > -
> > > -out_free_pages:
> > > -       spin_unlock(vmf->ptl);
> > > -       mmu_notifier_invalidate_range_end(&range);
> > > -       for (i = 0; i < HPAGE_PMD_NR; i++) {
> > > -               memcg = (void *)page_private(pages[i]);
> > > -               set_page_private(pages[i], 0);
> > > -               mem_cgroup_cancel_charge(pages[i], memcg, false);
> > > -               put_page(pages[i]);
> > > -       }
> > > -       kfree(pages);
> > > -       goto out;
> > > -}
> > > -
> > >  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
> > >  {
> > >         struct vm_area_struct *vma = vmf->vma;
> > > -       struct page *page = NULL, *new_page;
> > > -       struct mem_cgroup *memcg;
> > > +       struct page *page;
> > >         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> > > -       struct mmu_notifier_range range;
> > > -       gfp_t huge_gfp;                 /* for allocation and charge */
> > > -       vm_fault_t ret = 0;
> > >
> > >         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
> > >         VM_BUG_ON_VMA(!vma->anon_vma, vma);
> > > +
> > >         if (is_huge_zero_pmd(orig_pmd))
> > > -               goto alloc;
> > > +               goto fallback;
> > > +
> > >         spin_lock(vmf->ptl);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
> > > -               goto out_unlock;
> > > +
> > > +       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > > +               spin_unlock(vmf->ptl);
> > > +               return 0;
> > > +       }
> > >
> > >         page = pmd_page(orig_pmd);
> > >         VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
> > > -       /*
> > > -        * We can only reuse the page if nobody else maps the huge page or it's
> > > -        * part.
> > > -        */
> > > +
> > > +       /* Lock page for reuse_swap_page() */
> > >         if (!trylock_page(page)) {
> > >                 get_page(page);
> > >                 spin_unlock(vmf->ptl);
> > >                 lock_page(page);
> > >                 spin_lock(vmf->ptl);
> > >                 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > > +                       spin_unlock(vmf->ptl);
> > >                         unlock_page(page);
> > >                         put_page(page);
> > > -                       goto out_unlock;
> > > +                       return 0;
> > >                 }
> > >                 put_page(page);
> > >         }
> > > +
> > > +       /*
> > > +        * We can only reuse the page if nobody else maps the huge page or it's
> > > +        * part.
> > > +        */
> > >         if (reuse_swap_page(page, NULL)) {
> > >                 pmd_t entry;
> > >                 entry = pmd_mkyoung(orig_pmd);
> > >                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > >                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
> > >                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > > -               ret |= VM_FAULT_WRITE;
> > >                 unlock_page(page);
> > > -               goto out_unlock;
> > > -       }
> > > -       unlock_page(page);
> > > -       get_page(page);
> > > -       spin_unlock(vmf->ptl);
> > > -alloc:
> > > -       if (__transparent_hugepage_enabled(vma) &&
> > > -           !transparent_hugepage_debug_cow()) {
> > > -               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> > > -               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
> > > -       } else
> > > -               new_page = NULL;
> > > -
> > > -       if (likely(new_page)) {
> > > -               prep_transhuge_page(new_page);
> > > -       } else {
> > > -               if (!page) {
> > > -                       split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -                       ret |= VM_FAULT_FALLBACK;
> > > -               } else {
> > > -                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
> > > -                       if (ret & VM_FAULT_OOM) {
> > > -                               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -                               ret |= VM_FAULT_FALLBACK;
> > > -                       }
> > > -                       put_page(page);
> > > -               }
> > > -               count_vm_event(THP_FAULT_FALLBACK);
> > > -               goto out;
> > > -       }
> > > -
> > > -       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
> > > -                                       huge_gfp, &memcg, true))) {
> > > -               put_page(new_page);
> > > -               split_huge_pmd(vma, vmf->pmd, vmf->address);
> > > -               if (page)
> > > -                       put_page(page);
> > > -               ret |= VM_FAULT_FALLBACK;
> > > -               count_vm_event(THP_FAULT_FALLBACK);
> > > -               goto out;
> > > -       }
> > > -
> > > -       count_vm_event(THP_FAULT_ALLOC);
> > > -       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
> > > -
> > > -       if (!page)
> > > -               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
> > > -       else
> > > -               copy_user_huge_page(new_page, page, vmf->address,
> > > -                                   vma, HPAGE_PMD_NR);
> > > -       __SetPageUptodate(new_page);
> > > -
> > > -       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > > -                               haddr, haddr + HPAGE_PMD_SIZE);
> > > -       mmu_notifier_invalidate_range_start(&range);
> > > -
> > > -       spin_lock(vmf->ptl);
> > > -       if (page)
> > > -               put_page(page);
> > > -       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
> > >                 spin_unlock(vmf->ptl);
> > > -               mem_cgroup_cancel_charge(new_page, memcg, true);
> > > -               put_page(new_page);
> > > -               goto out_mn;
> > > -       } else {
> > > -               pmd_t entry;
> > > -               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> > > -               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> > > -               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
> > > -               page_add_new_anon_rmap(new_page, vma, haddr, true);
> > > -               mem_cgroup_commit_charge(new_page, memcg, false, true);
> > > -               lru_cache_add_active_or_unevictable(new_page, vma);
> > > -               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> > > -               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> > > -               if (!page) {
> > > -                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > > -               } else {
> > > -                       VM_BUG_ON_PAGE(!PageHead(page), page);
> > > -                       page_remove_rmap(page, true);
> > > -                       put_page(page);
> > > -               }
> > > -               ret |= VM_FAULT_WRITE;
> > > +               return VM_FAULT_WRITE;
> > >         }
> > > +
> > > +       unlock_page(page);
> > >         spin_unlock(vmf->ptl);
> > > -out_mn:
> > > -       /*
> > > -        * No need to double call mmu_notifier->invalidate_range() callback as
> > > -        * the above pmdp_huge_clear_flush_notify() did already call it.
> > > -        */
> > > -       mmu_notifier_invalidate_range_only_end(&range);
> > > -out:
> > > -       return ret;
> > > -out_unlock:
> > > -       spin_unlock(vmf->ptl);
> > > -       return ret;
> > > +fallback:
> > > +       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
> > > +       return VM_FAULT_FALLBACK;
> > >  }
> > >
> > >  /*
> > > --
> > > 2.26.0
> > >
> > >
>
> --
>  Kirill A. Shutemov

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ef6a6bcb291f..15b7a9c86b7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1206,262 +1206,63 @@  void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
 	spin_unlock(vmf->ptl);
 }
 
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
-			pmd_t orig_pmd, struct page *page)
-{
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mem_cgroup *memcg;
-	pgtable_t pgtable;
-	pmd_t _pmd;
-	int i;
-	vm_fault_t ret = 0;
-	struct page **pages;
-	struct mmu_notifier_range range;
-
-	pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
-			      GFP_KERNEL);
-	if (unlikely(!pages)) {
-		ret |= VM_FAULT_OOM;
-		goto out;
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
-					       vmf->address, page_to_nid(page));
-		if (unlikely(!pages[i] ||
-			     mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
-				     GFP_KERNEL, &memcg, false))) {
-			if (pages[i])
-				put_page(pages[i]);
-			while (--i >= 0) {
-				memcg = (void *)page_private(pages[i]);
-				set_page_private(pages[i], 0);
-				mem_cgroup_cancel_charge(pages[i], memcg,
-						false);
-				put_page(pages[i]);
-			}
-			kfree(pages);
-			ret |= VM_FAULT_OOM;
-			goto out;
-		}
-		set_page_private(pages[i], (unsigned long)memcg);
-	}
-
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SIZE * i, vma);
-		__SetPageUptodate(pages[i]);
-		cond_resched();
-	}
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_free_pages;
-	VM_BUG_ON_PAGE(!PageHead(page), page);
-
-	/*
-	 * Leave pmd empty until pte is filled note we must notify here as
-	 * concurrent CPU thread might write to new page before the call to
-	 * mmu_notifier_invalidate_range_end() happens which can lead to a
-	 * device seeing memory write in different order than CPU.
-	 *
-	 * See Documentation/vm/mmu_notifier.rst
-	 */
-	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
-	pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
-	pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
-	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-		pte_t entry;
-		entry = mk_pte(pages[i], vma->vm_page_prot);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
-		mem_cgroup_commit_charge(pages[i], memcg, false, false);
-		lru_cache_add_active_or_unevictable(pages[i], vma);
-		vmf->pte = pte_offset_map(&_pmd, haddr);
-		VM_BUG_ON(!pte_none(*vmf->pte));
-		set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
-		pte_unmap(vmf->pte);
-	}
-	kfree(pages);
-
-	smp_wmb(); /* make pte visible before pmd */
-	pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
-	page_remove_rmap(page, true);
-	spin_unlock(vmf->ptl);
-
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-
-	ret |= VM_FAULT_WRITE;
-	put_page(page);
-
-out:
-	return ret;
-
-out_free_pages:
-	spin_unlock(vmf->ptl);
-	mmu_notifier_invalidate_range_end(&range);
-	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		memcg = (void *)page_private(pages[i]);
-		set_page_private(pages[i], 0);
-		mem_cgroup_cancel_charge(pages[i], memcg, false);
-		put_page(pages[i]);
-	}
-	kfree(pages);
-	goto out;
-}
-
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct page *page = NULL, *new_page;
-	struct mem_cgroup *memcg;
+	struct page *page;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	struct mmu_notifier_range range;
-	gfp_t huge_gfp;			/* for allocation and charge */
-	vm_fault_t ret = 0;
 
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
 	if (is_huge_zero_pmd(orig_pmd))
-		goto alloc;
+		goto fallback;
+
 	spin_lock(vmf->ptl);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-		goto out_unlock;
+
+	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+		spin_unlock(vmf->ptl);
+		return 0;
+	}
 
 	page = pmd_page(orig_pmd);
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-	/*
-	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part.
-	 */
+
+	/* Lock page for reuse_swap_page() */
 	if (!trylock_page(page)) {
 		get_page(page);
 		spin_unlock(vmf->ptl);
 		lock_page(page);
 		spin_lock(vmf->ptl);
 		if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+			spin_unlock(vmf->ptl);
 			unlock_page(page);
 			put_page(page);
-			goto out_unlock;
+			return 0;
 		}
 		put_page(page);
 	}
+
+	/*
+	 * We can only reuse the page if nobody else maps the huge page or it's
+	 * part.
+	 */
 	if (reuse_swap_page(page, NULL)) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
 			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		ret |= VM_FAULT_WRITE;
 		unlock_page(page);
-		goto out_unlock;
-	}
-	unlock_page(page);
-	get_page(page);
-	spin_unlock(vmf->ptl);
-alloc:
-	if (__transparent_hugepage_enabled(vma) &&
-	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
-	} else
-		new_page = NULL;
-
-	if (likely(new_page)) {
-		prep_transhuge_page(new_page);
-	} else {
-		if (!page) {
-			split_huge_pmd(vma, vmf->pmd, vmf->address);
-			ret |= VM_FAULT_FALLBACK;
-		} else {
-			ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
-			if (ret & VM_FAULT_OOM) {
-				split_huge_pmd(vma, vmf->pmd, vmf->address);
-				ret |= VM_FAULT_FALLBACK;
-			}
-			put_page(page);
-		}
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
-					huge_gfp, &memcg, true))) {
-		put_page(new_page);
-		split_huge_pmd(vma, vmf->pmd, vmf->address);
-		if (page)
-			put_page(page);
-		ret |= VM_FAULT_FALLBACK;
-		count_vm_event(THP_FAULT_FALLBACK);
-		goto out;
-	}
-
-	count_vm_event(THP_FAULT_ALLOC);
-	count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
-	if (!page)
-		clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
-	else
-		copy_user_huge_page(new_page, page, vmf->address,
-				    vma, HPAGE_PMD_NR);
-	__SetPageUptodate(new_page);
-
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-	mmu_notifier_invalidate_range_start(&range);
-
-	spin_lock(vmf->ptl);
-	if (page)
-		put_page(page);
-	if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
 		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(new_page, memcg, true);
-		put_page(new_page);
-		goto out_mn;
-	} else {
-		pmd_t entry;
-		entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-		page_add_new_anon_rmap(new_page, vma, haddr, true);
-		mem_cgroup_commit_charge(new_page, memcg, false, true);
-		lru_cache_add_active_or_unevictable(new_page, vma);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		if (!page) {
-			add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		} else {
-			VM_BUG_ON_PAGE(!PageHead(page), page);
-			page_remove_rmap(page, true);
-			put_page(page);
-		}
-		ret |= VM_FAULT_WRITE;
+		return VM_FAULT_WRITE;
 	}
+
+	unlock_page(page);
 	spin_unlock(vmf->ptl);
-out_mn:
-	/*
-	 * No need to double call mmu_notifier->invalidate_range() callback as
-	 * the above pmdp_huge_clear_flush_notify() did already call it.
-	 */
-	mmu_notifier_invalidate_range_only_end(&range);
-out:
-	return ret;
-out_unlock:
-	spin_unlock(vmf->ptl);
-	return ret;
+fallback:
+	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+	return VM_FAULT_FALLBACK;
 }
 
 /*