diff mbox series

[RFC,14/26] hugetlb: add HGM support for hugetlb_fault and hugetlb_no_page

Message ID 20220624173656.2033256-15-jthoughton@google.com (mailing list archive)
State New
Headers show
Series hugetlb: Introduce HugeTLB high-granularity mapping | expand

Commit Message

James Houghton June 24, 2022, 5:36 p.m. UTC
This CL is the first main functional HugeTLB change. Together, these
changes allow the HugeTLB fault path to handle faults on HGM-enabled
VMAs. The two main behaviors that can be done now:
  1. Faults can be passed to handle_userfault. (Userspace will want to
     use UFFD_FEATURE_REAL_ADDRESS to get the real address to know which
     region they should call UFFDIO_CONTINUE on later.)
  2. Faults on pages that have been partially mapped (and userfaultfd is
     not being used) will get mapped at the largest possible size.
     For example, if a 1G page has been partially mapped at 2M, and we
     fault on an unmapped 2M section, hugetlb_no_page will create a 2M
     PMD to map the faulting address.

This commit does not handle hugetlb_wp right now, and it doesn't handle
HugeTLB page migration and swap entries.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 include/linux/hugetlb.h |  12 ++++
 mm/hugetlb.c            | 121 +++++++++++++++++++++++++++++++---------
 2 files changed, 106 insertions(+), 27 deletions(-)

Comments

Manish June 29, 2022, 2:40 p.m. UTC | #1
On 24/06/22 11:06 pm, James Houghton wrote:
> This CL is the first main functional HugeTLB change. Together, these
> changes allow the HugeTLB fault path to handle faults on HGM-enabled
> VMAs. The two main behaviors that can be done now:
>    1. Faults can be passed to handle_userfault. (Userspace will want to
>       use UFFD_FEATURE_REAL_ADDRESS to get the real address to know which
>       region they should call UFFDIO_CONTINUE on later.)
>    2. Faults on pages that have been partially mapped (and userfaultfd is
>       not being used) will get mapped at the largest possible size.
>       For example, if a 1G page has been partially mapped at 2M, and we
>       fault on an unmapped 2M section, hugetlb_no_page will create a 2M
>       PMD to map the faulting address.
>
> This commit does not handle hugetlb_wp right now, and it doesn't handle
> HugeTLB page migration and swap entries.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>   include/linux/hugetlb.h |  12 ++++
>   mm/hugetlb.c            | 121 +++++++++++++++++++++++++++++++---------
>   2 files changed, 106 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 321f5745d87f..ac4ac8fbd901 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -1185,6 +1185,9 @@ enum split_mode {
>   #ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
>   /* If HugeTLB high-granularity mappings are enabled for this VMA. */
>   bool hugetlb_hgm_enabled(struct vm_area_struct *vma);
> +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> +			      struct vm_area_struct *vma, unsigned long start,
> +			      unsigned long end);
>   int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
>   				    struct mm_struct *mm,
>   				    struct vm_area_struct *vma,
> @@ -1197,6 +1200,15 @@ static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
>   {
>   	return false;
>   }
> +
> +static inline
> +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> +			      struct vm_area_struct *vma, unsigned long start,
> +			      unsigned long end)
> +{
> +		BUG();
> +}
> +
>   static inline int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
>   					   struct mm_struct *mm,
>   					   struct vm_area_struct *vma,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6e0c5fbfe32c..da30621656b8 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5605,18 +5605,24 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
>   static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>   			struct vm_area_struct *vma,
>   			struct address_space *mapping, pgoff_t idx,
> -			unsigned long address, pte_t *ptep,
> +			unsigned long address, struct hugetlb_pte *hpte,
>   			pte_t old_pte, unsigned int flags)
>   {
>   	struct hstate *h = hstate_vma(vma);
>   	vm_fault_t ret = VM_FAULT_SIGBUS;
>   	int anon_rmap = 0;
>   	unsigned long size;
> -	struct page *page;
> +	struct page *page, *subpage;
>   	pte_t new_pte;
>   	spinlock_t *ptl;
>   	unsigned long haddr = address & huge_page_mask(h);
> +	unsigned long haddr_hgm = address & hugetlb_pte_mask(hpte);
>   	bool new_page, new_pagecache_page = false;
> +	/*
> +	 * This page is getting mapped for the first time, in which case we
> +	 * want to increment its mapcount.
> +	 */
> +	bool new_mapping = hpte->shift == huge_page_shift(h);
>   
>   	/*
>   	 * Currently, we are forced to kill the process in the event the
> @@ -5665,9 +5671,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>   			 * here.  Before returning error, get ptl and make
>   			 * sure there really is no pte entry.
>   			 */
> -			ptl = huge_pte_lock(h, mm, ptep);
> +			ptl = hugetlb_pte_lock(mm, hpte);
>   			ret = 0;
> -			if (huge_pte_none(huge_ptep_get(ptep)))
> +			if (hugetlb_pte_none(hpte))
>   				ret = vmf_error(PTR_ERR(page));
>   			spin_unlock(ptl);
>   			goto out;
> @@ -5731,18 +5737,25 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>   		vma_end_reservation(h, vma, haddr);
>   	}
>   
> -	ptl = huge_pte_lock(h, mm, ptep);
> +	ptl = hugetlb_pte_lock(mm, hpte);
>   	ret = 0;
>   	/* If pte changed from under us, retry */
> -	if (!pte_same(huge_ptep_get(ptep), old_pte))
> +	if (!pte_same(hugetlb_ptep_get(hpte), old_pte))
>   		goto backout;
>   
> -	if (anon_rmap) {
> -		ClearHPageRestoreReserve(page);
> -		hugepage_add_new_anon_rmap(page, vma, haddr);
> -	} else
> -		page_dup_file_rmap(page, true);
> -	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> +	if (new_mapping) {
> +		/* Only increment this page's mapcount if we are mapping it
> +		 * for the first time.
> +		 */
> +		if (anon_rmap) {
> +			ClearHPageRestoreReserve(page);
> +			hugepage_add_new_anon_rmap(page, vma, haddr);
> +		} else
> +			page_dup_file_rmap(page, true);
> +	}
> +
> +	subpage = hugetlb_find_subpage(h, page, haddr_hgm);

Sorry, I did not understand why make_huge_pte() is used here; we may be
mapping just PAGE_SIZE here too.

> +	new_pte = make_huge_pte(vma, subpage, ((vma->vm_flags & VM_WRITE)
>   				&& (vma->vm_flags & VM_SHARED)));
>   	/*
>   	 * If this pte was previously wr-protected, keep it wr-protected even
> @@ -5750,12 +5763,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
>   	 */
>   	if (unlikely(pte_marker_uffd_wp(old_pte)))
>   		new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
> -	set_huge_pte_at(mm, haddr, ptep, new_pte);
> +	set_huge_pte_at(mm, haddr_hgm, hpte->ptep, new_pte);
>   
> -	hugetlb_count_add(pages_per_huge_page(h), mm);
> +	hugetlb_count_add(hugetlb_pte_size(hpte) / PAGE_SIZE, mm);
>   	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
> +		BUG_ON(hugetlb_pte_size(hpte) != huge_page_size(h));
>   		/* Optimization, do the COW without a second fault */
> -		ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
> +		ret = hugetlb_wp(mm, vma, address, hpte->ptep, flags, page, ptl);
>   	}
>   
>   	spin_unlock(ptl);
> @@ -5816,11 +5830,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   	u32 hash;
>   	pgoff_t idx;
>   	struct page *page = NULL;
> +	struct page *subpage = NULL;
>   	struct page *pagecache_page = NULL;
>   	struct hstate *h = hstate_vma(vma);
>   	struct address_space *mapping;
>   	int need_wait_lock = 0;
>   	unsigned long haddr = address & huge_page_mask(h);
> +	unsigned long haddr_hgm;
> +	bool hgm_enabled = hugetlb_hgm_enabled(vma);
> +	struct hugetlb_pte hpte;
>   
>   	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
>   	if (ptep) {
> @@ -5866,11 +5884,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   	hash = hugetlb_fault_mutex_hash(mapping, idx);
>   	mutex_lock(&hugetlb_fault_mutex_table[hash]);
>   
> -	entry = huge_ptep_get(ptep);
> +	hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h));
> +
> +	if (hgm_enabled) {
> +		ret = hugetlb_walk_to(mm, &hpte, address,
> +				      PAGE_SIZE, /*stop_at_none=*/true);
> +		if (ret) {
> +			ret = vmf_error(ret);
> +			goto out_mutex;
> +		}
> +	}
> +
> +	entry = hugetlb_ptep_get(&hpte);
>   	/* PTE markers should be handled the same way as none pte */
> -	if (huge_pte_none_mostly(entry)) {
> -		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
> -				      entry, flags);
> +	if (hugetlb_pte_none_mostly(&hpte)) {
> +		ret = hugetlb_no_page(mm, vma, mapping, idx, address, &hpte,
> +				entry, flags);
>   		goto out_mutex;
>   	}
>   
> @@ -5908,14 +5937,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   								vma, haddr);
>   	}
>   
> -	ptl = huge_pte_lock(h, mm, ptep);
> +	ptl = hugetlb_pte_lock(mm, &hpte);
>   
>   	/* Check for a racing update before calling hugetlb_wp() */
> -	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
> +	if (unlikely(!pte_same(entry, hugetlb_ptep_get(&hpte))))
>   		goto out_ptl;
>   
> +	/* haddr_hgm is the base address of the region that hpte maps. */
> +	haddr_hgm = address & hugetlb_pte_mask(&hpte);
> +
>   	/* Handle userfault-wp first, before trying to lock more pages */
> -	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
> +	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(hugetlb_ptep_get(&hpte)) &&
>   	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
>   		struct vm_fault vmf = {
>   			.vma = vma,
> @@ -5939,7 +5971,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   	 * pagecache_page, so here we need take the former one
>   	 * when page != pagecache_page or !pagecache_page.
>   	 */
> -	page = pte_page(entry);
> +	subpage = pte_page(entry);
> +	page = compound_head(subpage);
>   	if (page != pagecache_page)
>   		if (!trylock_page(page)) {
>   			need_wait_lock = 1;
> @@ -5950,7 +5983,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   
>   	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
>   		if (!huge_pte_write(entry)) {
> -			ret = hugetlb_wp(mm, vma, address, ptep, flags,
> +			BUG_ON(hugetlb_pte_size(&hpte) != huge_page_size(h));

Is this because userfaultfd_wp is not currently supported with HGM mappings?
I am not sure yet how that is controlled, though; maybe the next patches will
have more details.

> +			ret = hugetlb_wp(mm, vma, address, hpte.ptep, flags,
>   					 pagecache_page, ptl);
>   			goto out_put_page;
>   		} else if (likely(flags & FAULT_FLAG_WRITE)) {
> @@ -5958,9 +5992,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>   		}
>   	}
>   	entry = pte_mkyoung(entry);
> -	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
> +	if (huge_ptep_set_access_flags(vma, haddr_hgm, hpte.ptep, entry,
>   						flags & FAULT_FLAG_WRITE))
> -		update_mmu_cache(vma, haddr, ptep);
> +		update_mmu_cache(vma, haddr_hgm, hpte.ptep);
>   out_put_page:
>   	if (page != pagecache_page)
>   		unlock_page(page);
> @@ -6951,7 +6985,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
>   				pte = (pte_t *)pmd_alloc(mm, pud, addr);
>   		}
>   	}
> -	BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
> +	if (!hugetlb_hgm_enabled(vma))
> +		BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
>   
>   	return pte;
>   }
> @@ -7057,6 +7092,38 @@ static unsigned int __shift_for_hstate(struct hstate *h)
>   			       (tmp_h) <= &hstates[hugetlb_max_hstate]; \
>   			       (tmp_h)++)
>   
> +/*
> + * Allocate a HugeTLB PTE that maps as much of [start, end) as possible with a
> + * single page table entry. The allocated HugeTLB PTE is returned in hpte.
> + */

Will this be used for madvise_collapse? If so, would it make sense to keep it
in a different patch, as this one's title says it is just for the
handle_page_fault routines?

> +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> +			      struct vm_area_struct *vma, unsigned long start,
> +			      unsigned long end)
> +{
> +	struct hstate *h = hstate_vma(vma), *tmp_h;
> +	unsigned int shift;
> +	int ret;
> +
> +	for_each_hgm_shift(h, tmp_h, shift) {
> +		unsigned long sz = 1UL << shift;
> +
> +		if (!IS_ALIGNED(start, sz) || start + sz > end)
> +			continue;
> +		ret = huge_pte_alloc_high_granularity(hpte, mm, vma, start,
> +						      shift, HUGETLB_SPLIT_NONE,
> +						      /*write_locked=*/false);
> +		if (ret)
> +			return ret;
> +
> +		if (hpte->shift > shift)
> +			return -EEXIST;
> +
> +		BUG_ON(hpte->shift != shift);
> +		return 0;
> +	}
> +	return -EINVAL;
> +}
> +
>   /*
>    * Given a particular address, split the HugeTLB PTE that currently maps it
>    * so that, for the given address, the PTE that maps it is `desired_shift`.
James Houghton June 29, 2022, 3:56 p.m. UTC | #2
On Wed, Jun 29, 2022 at 7:41 AM manish.mishra <manish.mishra@nutanix.com> wrote:
>
>
> On 24/06/22 11:06 pm, James Houghton wrote:
> > This CL is the first main functional HugeTLB change. Together, these
> > changes allow the HugeTLB fault path to handle faults on HGM-enabled
> > VMAs. The two main behaviors that can be done now:
> >    1. Faults can be passed to handle_userfault. (Userspace will want to
> >       use UFFD_FEATURE_REAL_ADDRESS to get the real address to know which
> >       region they should call UFFDIO_CONTINUE on later.)
> >    2. Faults on pages that have been partially mapped (and userfaultfd is
> >       not being used) will get mapped at the largest possible size.
> >       For example, if a 1G page has been partially mapped at 2M, and we
> >       fault on an unmapped 2M section, hugetlb_no_page will create a 2M
> >       PMD to map the faulting address.
> >
> > This commit does not handle hugetlb_wp right now, and it doesn't handle
> > HugeTLB page migration and swap entries.
> >
> > Signed-off-by: James Houghton <jthoughton@google.com>
> > ---
> >   include/linux/hugetlb.h |  12 ++++
> >   mm/hugetlb.c            | 121 +++++++++++++++++++++++++++++++---------
> >   2 files changed, 106 insertions(+), 27 deletions(-)
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 321f5745d87f..ac4ac8fbd901 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -1185,6 +1185,9 @@ enum split_mode {
> >   #ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
> >   /* If HugeTLB high-granularity mappings are enabled for this VMA. */
> >   bool hugetlb_hgm_enabled(struct vm_area_struct *vma);
> > +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> > +                           struct vm_area_struct *vma, unsigned long start,
> > +                           unsigned long end);
> >   int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
> >                                   struct mm_struct *mm,
> >                                   struct vm_area_struct *vma,
> > @@ -1197,6 +1200,15 @@ static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
> >   {
> >       return false;
> >   }
> > +
> > +static inline
> > +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> > +                           struct vm_area_struct *vma, unsigned long start,
> > +                           unsigned long end)
> > +{
> > +             BUG();
> > +}
> > +
> >   static inline int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
> >                                          struct mm_struct *mm,
> >                                          struct vm_area_struct *vma,
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 6e0c5fbfe32c..da30621656b8 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -5605,18 +5605,24 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
> >   static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >                       struct vm_area_struct *vma,
> >                       struct address_space *mapping, pgoff_t idx,
> > -                     unsigned long address, pte_t *ptep,
> > +                     unsigned long address, struct hugetlb_pte *hpte,
> >                       pte_t old_pte, unsigned int flags)
> >   {
> >       struct hstate *h = hstate_vma(vma);
> >       vm_fault_t ret = VM_FAULT_SIGBUS;
> >       int anon_rmap = 0;
> >       unsigned long size;
> > -     struct page *page;
> > +     struct page *page, *subpage;
> >       pte_t new_pte;
> >       spinlock_t *ptl;
> >       unsigned long haddr = address & huge_page_mask(h);
> > +     unsigned long haddr_hgm = address & hugetlb_pte_mask(hpte);
> >       bool new_page, new_pagecache_page = false;
> > +     /*
> > +      * This page is getting mapped for the first time, in which case we
> > +      * want to increment its mapcount.
> > +      */
> > +     bool new_mapping = hpte->shift == huge_page_shift(h);
> >
> >       /*
> >        * Currently, we are forced to kill the process in the event the
> > @@ -5665,9 +5671,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >                        * here.  Before returning error, get ptl and make
> >                        * sure there really is no pte entry.
> >                        */
> > -                     ptl = huge_pte_lock(h, mm, ptep);
> > +                     ptl = hugetlb_pte_lock(mm, hpte);
> >                       ret = 0;
> > -                     if (huge_pte_none(huge_ptep_get(ptep)))
> > +                     if (hugetlb_pte_none(hpte))
> >                               ret = vmf_error(PTR_ERR(page));
> >                       spin_unlock(ptl);
> >                       goto out;
> > @@ -5731,18 +5737,25 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >               vma_end_reservation(h, vma, haddr);
> >       }
> >
> > -     ptl = huge_pte_lock(h, mm, ptep);
> > +     ptl = hugetlb_pte_lock(mm, hpte);
> >       ret = 0;
> >       /* If pte changed from under us, retry */
> > -     if (!pte_same(huge_ptep_get(ptep), old_pte))
> > +     if (!pte_same(hugetlb_ptep_get(hpte), old_pte))
> >               goto backout;
> >
> > -     if (anon_rmap) {
> > -             ClearHPageRestoreReserve(page);
> > -             hugepage_add_new_anon_rmap(page, vma, haddr);
> > -     } else
> > -             page_dup_file_rmap(page, true);
> > -     new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
> > +     if (new_mapping) {
> > +             /* Only increment this page's mapcount if we are mapping it
> > +              * for the first time.
> > +              */
> > +             if (anon_rmap) {
> > +                     ClearHPageRestoreReserve(page);
> > +                     hugepage_add_new_anon_rmap(page, vma, haddr);
> > +             } else
> > +                     page_dup_file_rmap(page, true);
> > +     }
> > +
> > +     subpage = hugetlb_find_subpage(h, page, haddr_hgm);
>
> Sorry, I did not understand why make_huge_pte() is used here; we may be
> mapping just PAGE_SIZE here too.
>

This should be make_huge_pte_with_shift(), with shift =
hugetlb_pte_shift(hpte). Thanks.

> > +     new_pte = make_huge_pte(vma, subpage, ((vma->vm_flags & VM_WRITE)
> >                               && (vma->vm_flags & VM_SHARED)));
> >       /*
> >        * If this pte was previously wr-protected, keep it wr-protected even
> > @@ -5750,12 +5763,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
> >        */
> >       if (unlikely(pte_marker_uffd_wp(old_pte)))
> >               new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
> > -     set_huge_pte_at(mm, haddr, ptep, new_pte);
> > +     set_huge_pte_at(mm, haddr_hgm, hpte->ptep, new_pte);
> >
> > -     hugetlb_count_add(pages_per_huge_page(h), mm);
> > +     hugetlb_count_add(hugetlb_pte_size(hpte) / PAGE_SIZE, mm);
> >       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
> > +             BUG_ON(hugetlb_pte_size(hpte) != huge_page_size(h));
> >               /* Optimization, do the COW without a second fault */
> > -             ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
> > +             ret = hugetlb_wp(mm, vma, address, hpte->ptep, flags, page, ptl);
> >       }
> >
> >       spin_unlock(ptl);
> > @@ -5816,11 +5830,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >       u32 hash;
> >       pgoff_t idx;
> >       struct page *page = NULL;
> > +     struct page *subpage = NULL;
> >       struct page *pagecache_page = NULL;
> >       struct hstate *h = hstate_vma(vma);
> >       struct address_space *mapping;
> >       int need_wait_lock = 0;
> >       unsigned long haddr = address & huge_page_mask(h);
> > +     unsigned long haddr_hgm;
> > +     bool hgm_enabled = hugetlb_hgm_enabled(vma);
> > +     struct hugetlb_pte hpte;
> >
> >       ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
> >       if (ptep) {
> > @@ -5866,11 +5884,22 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >       hash = hugetlb_fault_mutex_hash(mapping, idx);
> >       mutex_lock(&hugetlb_fault_mutex_table[hash]);
> >
> > -     entry = huge_ptep_get(ptep);
> > +     hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h));
> > +
> > +     if (hgm_enabled) {
> > +             ret = hugetlb_walk_to(mm, &hpte, address,
> > +                                   PAGE_SIZE, /*stop_at_none=*/true);
> > +             if (ret) {
> > +                     ret = vmf_error(ret);
> > +                     goto out_mutex;
> > +             }
> > +     }
> > +
> > +     entry = hugetlb_ptep_get(&hpte);
> >       /* PTE markers should be handled the same way as none pte */
> > -     if (huge_pte_none_mostly(entry)) {
> > -             ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
> > -                                   entry, flags);
> > +     if (hugetlb_pte_none_mostly(&hpte)) {
> > +             ret = hugetlb_no_page(mm, vma, mapping, idx, address, &hpte,
> > +                             entry, flags);
> >               goto out_mutex;
> >       }
> >
> > @@ -5908,14 +5937,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >                                                               vma, haddr);
> >       }
> >
> > -     ptl = huge_pte_lock(h, mm, ptep);
> > +     ptl = hugetlb_pte_lock(mm, &hpte);
> >
> >       /* Check for a racing update before calling hugetlb_wp() */
> > -     if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
> > +     if (unlikely(!pte_same(entry, hugetlb_ptep_get(&hpte))))
> >               goto out_ptl;
> >
> > +     /* haddr_hgm is the base address of the region that hpte maps. */
> > +     haddr_hgm = address & hugetlb_pte_mask(&hpte);
> > +
> >       /* Handle userfault-wp first, before trying to lock more pages */
> > -     if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
> > +     if (userfaultfd_wp(vma) && huge_pte_uffd_wp(hugetlb_ptep_get(&hpte)) &&
> >           (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
> >               struct vm_fault vmf = {
> >                       .vma = vma,
> > @@ -5939,7 +5971,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >        * pagecache_page, so here we need take the former one
> >        * when page != pagecache_page or !pagecache_page.
> >        */
> > -     page = pte_page(entry);
> > +     subpage = pte_page(entry);
> > +     page = compound_head(subpage);
> >       if (page != pagecache_page)
> >               if (!trylock_page(page)) {
> >                       need_wait_lock = 1;
> > @@ -5950,7 +5983,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >
> >       if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
> >               if (!huge_pte_write(entry)) {
> > -                     ret = hugetlb_wp(mm, vma, address, ptep, flags,
> > +                     BUG_ON(hugetlb_pte_size(&hpte) != huge_page_size(h));
>
> Is this because userfaultfd_wp is not currently supported with HGM mappings?
> I am not sure yet how that is controlled, though; maybe the next patches will
> have more details.

Yeah this BUG_ON is just because I haven't implemented support for
userfaultfd_wp yet (userfaultfd_wp for HugeTLB was added pretty
recently, while I was working on this patch series). I'll improve WP
support for the next version.

>
> > +                     ret = hugetlb_wp(mm, vma, address, hpte.ptep, flags,
> >                                        pagecache_page, ptl);
> >                       goto out_put_page;
> >               } else if (likely(flags & FAULT_FLAG_WRITE)) {
> > @@ -5958,9 +5992,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> >               }
> >       }
> >       entry = pte_mkyoung(entry);
> > -     if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
> > +     if (huge_ptep_set_access_flags(vma, haddr_hgm, hpte.ptep, entry,
> >                                               flags & FAULT_FLAG_WRITE))
> > -             update_mmu_cache(vma, haddr, ptep);
> > +             update_mmu_cache(vma, haddr_hgm, hpte.ptep);
> >   out_put_page:
> >       if (page != pagecache_page)
> >               unlock_page(page);
> > @@ -6951,7 +6985,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> >                               pte = (pte_t *)pmd_alloc(mm, pud, addr);
> >               }
> >       }
> > -     BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
> > +     if (!hugetlb_hgm_enabled(vma))
> > +             BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
> >
> >       return pte;
> >   }
> > @@ -7057,6 +7092,38 @@ static unsigned int __shift_for_hstate(struct hstate *h)
> >                              (tmp_h) <= &hstates[hugetlb_max_hstate]; \
> >                              (tmp_h)++)
> >
> > +/*
> > + * Allocate a HugeTLB PTE that maps as much of [start, end) as possible with a
> > + * single page table entry. The allocated HugeTLB PTE is returned in hpte.
> > + */
>
> Will this be used for madvise_collapse? If so, would it make sense to keep it
> in a different patch, as this one's title says it is just for the
> handle_page_fault routines?

This is used by userfaultfd/UFFDIO_CONTINUE -- I will move this diff
to the patch that uses it (certainly shouldn't be in this patch).

>
> > +int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
> > +                           struct vm_area_struct *vma, unsigned long start,
> > +                           unsigned long end)
> > +{
> > +     struct hstate *h = hstate_vma(vma), *tmp_h;
> > +     unsigned int shift;
> > +     int ret;
> > +
> > +     for_each_hgm_shift(h, tmp_h, shift) {
> > +             unsigned long sz = 1UL << shift;
> > +
> > +             if (!IS_ALIGNED(start, sz) || start + sz > end)
> > +                     continue;
> > +             ret = huge_pte_alloc_high_granularity(hpte, mm, vma, start,
> > +                                                   shift, HUGETLB_SPLIT_NONE,
> > +                                                   /*write_locked=*/false);
> > +             if (ret)
> > +                     return ret;
> > +
> > +             if (hpte->shift > shift)
> > +                     return -EEXIST;
> > +
> > +             BUG_ON(hpte->shift != shift);
> > +             return 0;
> > +     }
> > +     return -EINVAL;
> > +}
> > +
> >   /*
> >    * Given a particular address, split the HugeTLB PTE that currently maps it
> >    * so that, for the given address, the PTE that maps it is `desired_shift`.
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 321f5745d87f..ac4ac8fbd901 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1185,6 +1185,9 @@  enum split_mode {
 #ifdef CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING
 /* If HugeTLB high-granularity mappings are enabled for this VMA. */
 bool hugetlb_hgm_enabled(struct vm_area_struct *vma);
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+			      struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end);
 int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
 				    struct mm_struct *mm,
 				    struct vm_area_struct *vma,
@@ -1197,6 +1200,15 @@  static inline bool hugetlb_hgm_enabled(struct vm_area_struct *vma)
 {
 	return false;
 }
+
+static inline
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+			      struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end)
+{
+		BUG();
+}
+
 static inline int huge_pte_alloc_high_granularity(struct hugetlb_pte *hpte,
 					   struct mm_struct *mm,
 					   struct vm_area_struct *vma,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6e0c5fbfe32c..da30621656b8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5605,18 +5605,24 @@  static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
 			struct address_space *mapping, pgoff_t idx,
-			unsigned long address, pte_t *ptep,
+			unsigned long address, struct hugetlb_pte *hpte,
 			pte_t old_pte, unsigned int flags)
 {
 	struct hstate *h = hstate_vma(vma);
 	vm_fault_t ret = VM_FAULT_SIGBUS;
 	int anon_rmap = 0;
 	unsigned long size;
-	struct page *page;
+	struct page *page, *subpage;
 	pte_t new_pte;
 	spinlock_t *ptl;
 	unsigned long haddr = address & huge_page_mask(h);
+	unsigned long haddr_hgm = address & hugetlb_pte_mask(hpte);
 	bool new_page, new_pagecache_page = false;
+	/*
+	 * This page is getting mapped for the first time, in which case we
+	 * want to increment its mapcount.
+	 */
+	bool new_mapping = hpte->shift == huge_page_shift(h);
 
 	/*
 	 * Currently, we are forced to kill the process in the event the
@@ -5665,9 +5671,9 @@  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 			 * here.  Before returning error, get ptl and make
 			 * sure there really is no pte entry.
 			 */
-			ptl = huge_pte_lock(h, mm, ptep);
+			ptl = hugetlb_pte_lock(mm, hpte);
 			ret = 0;
-			if (huge_pte_none(huge_ptep_get(ptep)))
+			if (hugetlb_pte_none(hpte))
 				ret = vmf_error(PTR_ERR(page));
 			spin_unlock(ptl);
 			goto out;
@@ -5731,18 +5737,25 @@  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 		vma_end_reservation(h, vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = hugetlb_pte_lock(mm, hpte);
 	ret = 0;
 	/* If pte changed from under us, retry */
-	if (!pte_same(huge_ptep_get(ptep), old_pte))
+	if (!pte_same(hugetlb_ptep_get(hpte), old_pte))
 		goto backout;
 
-	if (anon_rmap) {
-		ClearHPageRestoreReserve(page);
-		hugepage_add_new_anon_rmap(page, vma, haddr);
-	} else
-		page_dup_file_rmap(page, true);
-	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+	if (new_mapping) {
+		/* Only increment this page's mapcount if we are mapping it
+		 * for the first time.
+		 */
+		if (anon_rmap) {
+			ClearHPageRestoreReserve(page);
+			hugepage_add_new_anon_rmap(page, vma, haddr);
+		} else
+			page_dup_file_rmap(page, true);
+	}
+
+	subpage = hugetlb_find_subpage(h, page, haddr_hgm);
+	new_pte = make_huge_pte(vma, subpage, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
 	/*
 	 * If this pte was previously wr-protected, keep it wr-protected even
@@ -5750,12 +5763,13 @@  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 	 */
 	if (unlikely(pte_marker_uffd_wp(old_pte)))
 		new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
-	set_huge_pte_at(mm, haddr, ptep, new_pte);
+	set_huge_pte_at(mm, haddr_hgm, hpte->ptep, new_pte);
 
-	hugetlb_count_add(pages_per_huge_page(h), mm);
+	hugetlb_count_add(hugetlb_pte_size(hpte) / PAGE_SIZE, mm);
 	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+		BUG_ON(hugetlb_pte_size(hpte) != huge_page_size(h));
 		/* Optimization, do the COW without a second fault */
-		ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
+		ret = hugetlb_wp(mm, vma, address, hpte->ptep, flags, page, ptl);
 	}
 
 	spin_unlock(ptl);
@@ -5816,11 +5830,15 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	u32 hash;
 	pgoff_t idx;
 	struct page *page = NULL;
+	struct page *subpage = NULL;
 	struct page *pagecache_page = NULL;
 	struct hstate *h = hstate_vma(vma);
 	struct address_space *mapping;
 	int need_wait_lock = 0;
 	unsigned long haddr = address & huge_page_mask(h);
+	unsigned long haddr_hgm;
+	bool hgm_enabled = hugetlb_hgm_enabled(vma);
+	struct hugetlb_pte hpte;
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
@@ -5866,11 +5884,22 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-	entry = huge_ptep_get(ptep);
+	hugetlb_pte_populate(&hpte, ptep, huge_page_shift(h));
+
+	if (hgm_enabled) {
+		ret = hugetlb_walk_to(mm, &hpte, address,
+				      PAGE_SIZE, /*stop_at_none=*/true);
+		if (ret) {
+			ret = vmf_error(ret);
+			goto out_mutex;
+		}
+	}
+
+	entry = hugetlb_ptep_get(&hpte);
 	/* PTE markers should be handled the same way as none pte */
-	if (huge_pte_none_mostly(entry)) {
-		ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
-				      entry, flags);
+	if (hugetlb_pte_none_mostly(&hpte)) {
+		ret = hugetlb_no_page(mm, vma, mapping, idx, address, &hpte,
+				entry, flags);
 		goto out_mutex;
 	}
 
@@ -5908,14 +5937,17 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, haddr);
 	}
 
-	ptl = huge_pte_lock(h, mm, ptep);
+	ptl = hugetlb_pte_lock(mm, &hpte);
 
 	/* Check for a racing update before calling hugetlb_wp() */
-	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+	if (unlikely(!pte_same(entry, hugetlb_ptep_get(&hpte))))
 		goto out_ptl;
 
+	/* haddr_hgm is the base address of the region that hpte maps. */
+	haddr_hgm = address & hugetlb_pte_mask(&hpte);
+
 	/* Handle userfault-wp first, before trying to lock more pages */
-	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
+	if (userfaultfd_wp(vma) && huge_pte_uffd_wp(hugetlb_ptep_get(&hpte)) &&
 	    (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
 		struct vm_fault vmf = {
 			.vma = vma,
@@ -5939,7 +5971,8 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * pagecache_page, so here we need take the former one
 	 * when page != pagecache_page or !pagecache_page.
 	 */
-	page = pte_page(entry);
+	subpage = pte_page(entry);
+	page = compound_head(subpage);
 	if (page != pagecache_page)
 		if (!trylock_page(page)) {
 			need_wait_lock = 1;
@@ -5950,7 +5983,8 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
 		if (!huge_pte_write(entry)) {
-			ret = hugetlb_wp(mm, vma, address, ptep, flags,
+			BUG_ON(hugetlb_pte_size(&hpte) != huge_page_size(h));
+			ret = hugetlb_wp(mm, vma, address, hpte.ptep, flags,
 					 pagecache_page, ptl);
 			goto out_put_page;
 		} else if (likely(flags & FAULT_FLAG_WRITE)) {
@@ -5958,9 +5992,9 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 	}
 	entry = pte_mkyoung(entry);
-	if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
+	if (huge_ptep_set_access_flags(vma, haddr_hgm, hpte.ptep, entry,
 						flags & FAULT_FLAG_WRITE))
-		update_mmu_cache(vma, haddr, ptep);
+		update_mmu_cache(vma, haddr_hgm, hpte.ptep);
 out_put_page:
 	if (page != pagecache_page)
 		unlock_page(page);
@@ -6951,7 +6985,8 @@  pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 				pte = (pte_t *)pmd_alloc(mm, pud, addr);
 		}
 	}
-	BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
+	if (!hugetlb_hgm_enabled(vma))
+		BUG_ON(pte && pte_present(*pte) && !pte_huge(*pte));
 
 	return pte;
 }
@@ -7057,6 +7092,38 @@  static unsigned int __shift_for_hstate(struct hstate *h)
 			       (tmp_h) <= &hstates[hugetlb_max_hstate]; \
 			       (tmp_h)++)
 
+/*
+ * Allocate a HugeTLB PTE that maps as much of [start, end) as possible with a
+ * single page table entry. The allocated HugeTLB PTE is returned in hpte.
+ *
+ * Candidate sizes come from for_each_hgm_shift() on the VMA's hstate
+ * (presumably visited largest first -- confirm against the macro). A size is
+ * skipped unless start is aligned to it and start + size does not exceed end.
+ * Only the first size that fits is attempted: it is allocated via
+ * huge_pte_alloc_high_granularity() with HUGETLB_SPLIT_NONE, and the function
+ * returns whatever the outcome is without falling back to smaller sizes.
+ *
+ * Return: 0 on success, with hpte->shift equal to the chosen shift;
+ * the error from huge_pte_alloc_high_granularity() if it fails;
+ * -EEXIST if the returned hpte has a larger shift than requested (presumably
+ * start is already covered by a larger mapping -- TODO confirm);
+ * -EINVAL if no candidate size fits within [start, end).
+ * An hpte that comes back with a smaller shift than requested trips BUG_ON().
+ */
+int hugetlb_alloc_largest_pte(struct hugetlb_pte *hpte, struct mm_struct *mm,
+			      struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end)
+{
+	struct hstate *h = hstate_vma(vma), *tmp_h;
+	unsigned int shift;
+	int ret;
+
+	for_each_hgm_shift(h, tmp_h, shift) {
+		unsigned long sz = 1UL << shift;
+
+		if (!IS_ALIGNED(start, sz) || start + sz > end)
+			continue;
+		ret = huge_pte_alloc_high_granularity(hpte, mm, vma, start,
+						      shift, HUGETLB_SPLIT_NONE,
+						      /*write_locked=*/false);
+		if (ret)
+			return ret;
+
+		if (hpte->shift > shift)
+			return -EEXIST;
+
+		BUG_ON(hpte->shift != shift);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+
 /*
  * Given a particular address, split the HugeTLB PTE that currently maps it
  * so that, for the given address, the PTE that maps it is `desired_shift`.