
[RFC,04/26] hugetlb: make huge_pte_lockptr take an explicit shift argument.

Message ID 20220624173656.2033256-5-jthoughton@google.com (mailing list archive)
State New
Series hugetlb: Introduce HugeTLB high-granularity mapping

Commit Message

James Houghton June 24, 2022, 5:36 p.m. UTC
This is needed to handle PTL locking with high-granularity mapping. We
won't always be using the PMD-level PTL even if we're using the 2M
hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
case, we need to lock the PTL for the 4K PTE.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/powerpc/mm/pgtable.c |  3 ++-
 include/linux/hugetlb.h   | 19 ++++++++++++++-----
 mm/hugetlb.c              |  9 +++++----
 mm/migrate.c              |  3 ++-
 mm/page_vma_mapped.c      |  3 ++-
 5 files changed, 25 insertions(+), 12 deletions(-)
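
A minimal usage sketch (not part of the patch; the caller context and variable
names are illustrative only): existing callers keep passing the hstate's shift,
while a hypothetical high-granularity caller that has walked down to a 4K PTE
would pass PAGE_SHIFT, which with this version of the patch falls back to
mm->page_table_lock:

	/* Existing callers: derive the shift from the hstate as before. */
	ptl = huge_pte_lockptr(huge_page_shift(h), mm, ptep);
	spin_lock(ptl);
	/* ... modify the huge PTE ... */
	spin_unlock(ptl);

	/* Hypothetical high-granularity caller holding a 4K PTE: */
	ptl = huge_pte_lock_shift(PAGE_SHIFT, mm, ptep);
	/* ... modify the 4K PTE; here this is mm->page_table_lock ... */
	spin_unlock(ptl);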

Comments

Manish June 27, 2022, 12:26 p.m. UTC | #1
On 24/06/22 11:06 pm, James Houghton wrote:
> This is needed to handle PTL locking with high-granularity mapping. We
> won't always be using the PMD-level PTL even if we're using the 2M
> hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> case, we need to lock the PTL for the 4K PTE.
>
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>   arch/powerpc/mm/pgtable.c |  3 ++-
>   include/linux/hugetlb.h   | 19 ++++++++++++++-----
>   mm/hugetlb.c              |  9 +++++----
>   mm/migrate.c              |  3 ++-
>   mm/page_vma_mapped.c      |  3 ++-
>   5 files changed, 25 insertions(+), 12 deletions(-)
>
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index e6166b71d36d..663d591a8f08 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -261,7 +261,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
>   
>   		psize = hstate_get_psize(h);
>   #ifdef CONFIG_DEBUG_VM
> -		assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
> +		assert_spin_locked(huge_pte_lockptr(huge_page_shift(h),
> +						    vma->vm_mm, ptep));
>   #endif
>   
>   #else
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 498a4ae3d462..5fe1db46d8c9 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -868,12 +868,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
>   	return modified_mask;
>   }
>   
> -static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> +static inline spinlock_t *huge_pte_lockptr(unsigned int shift,
>   					   struct mm_struct *mm, pte_t *pte)
>   {
> -	if (huge_page_size(h) == PMD_SIZE)
> +	if (shift == PMD_SHIFT)
>   		return pmd_lockptr(mm, (pmd_t *) pte);
> -	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);

I may have the wrong understanding here: is the per-PMD lock for reducing
contention? If that is the case, should we take the per-PMD lock for
PAGE_SIZE too, and page_table_lock for anything higher than PMD?

>   	return &mm->page_table_lock;
>   }
>   
> @@ -1076,7 +1075,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
>   	return 0;
>   }
>   
> -static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> +static inline spinlock_t *huge_pte_lockptr(unsigned int shift,
>   					   struct mm_struct *mm, pte_t *pte)
>   {
>   	return &mm->page_table_lock;
> @@ -1116,7 +1115,17 @@ static inline spinlock_t *huge_pte_lock(struct hstate *h,
>   {
>   	spinlock_t *ptl;
>   
> -	ptl = huge_pte_lockptr(h, mm, pte);
> +	ptl = huge_pte_lockptr(huge_page_shift(h), mm, pte);
> +	spin_lock(ptl);
> +	return ptl;
> +}
> +
> +static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> +					      struct mm_struct *mm, pte_t *pte)
> +{
> +	spinlock_t *ptl;
> +
> +	ptl = huge_pte_lockptr(shift, mm, pte);
>   	spin_lock(ptl);
>   	return ptl;
>   }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 0eec34edf3b2..d6d0d4c03def 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4817,7 +4817,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>   			continue;
>   
>   		dst_ptl = huge_pte_lock(h, dst, dst_pte);
> -		src_ptl = huge_pte_lockptr(h, src, src_pte);
> +		src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
>   		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>   		entry = huge_ptep_get(src_pte);
>   		dst_entry = huge_ptep_get(dst_pte);
> @@ -4894,7 +4894,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>   
>   				/* Install the new huge page if src pte stable */
>   				dst_ptl = huge_pte_lock(h, dst, dst_pte);
> -				src_ptl = huge_pte_lockptr(h, src, src_pte);
> +				src_ptl = huge_pte_lockptr(huge_page_shift(h),
> +							   src, src_pte);
>   				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
>   				entry = huge_ptep_get(src_pte);
>   				if (!pte_same(src_pte_old, entry)) {
> @@ -4948,7 +4949,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
>   	pte_t pte;
>   
>   	dst_ptl = huge_pte_lock(h, mm, dst_pte);
> -	src_ptl = huge_pte_lockptr(h, mm, src_pte);
> +	src_ptl = huge_pte_lockptr(huge_page_shift(h), mm, src_pte);
>   
>   	/*
>   	 * We don't have to worry about the ordering of src and dst ptlocks
> @@ -6024,7 +6025,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>   		page_in_pagecache = true;
>   	}
>   
> -	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
> +	ptl = huge_pte_lockptr(huge_page_shift(h), dst_mm, dst_pte);
>   	spin_lock(ptl);
>   
>   	/*
> diff --git a/mm/migrate.c b/mm/migrate.c
> index e51588e95f57..a8a960992373 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -318,7 +318,8 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
>   void migration_entry_wait_huge(struct vm_area_struct *vma,
>   		struct mm_struct *mm, pte_t *pte)
>   {
> -	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
> +	spinlock_t *ptl = huge_pte_lockptr(huge_page_shift(hstate_vma(vma)),
> +					   mm, pte);
>   	__migration_entry_wait(mm, pte, ptl);
>   }
>   
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index c10f839fc410..8921dd4e41b1 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -174,7 +174,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
>   		if (!pvmw->pte)
>   			return false;
>   
> -		pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
> +		pvmw->ptl = huge_pte_lockptr(huge_page_shift(hstate),
> +					     mm, pvmw->pte);
>   		spin_lock(pvmw->ptl);
>   		if (!check_pte(pvmw))
>   			return not_found(pvmw);
Mike Kravetz June 27, 2022, 8:51 p.m. UTC | #2
On 06/24/22 17:36, James Houghton wrote:
> This is needed to handle PTL locking with high-granularity mapping. We
> won't always be using the PMD-level PTL even if we're using the 2M
> hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> case, we need to lock the PTL for the 4K PTE.

I'm not really sure why this would be required.
Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
with less contention than using the more coarse mm lock.
James Houghton June 28, 2022, 3:29 p.m. UTC | #3
On Mon, Jun 27, 2022 at 1:52 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/24/22 17:36, James Houghton wrote:
> > This is needed to handle PTL locking with high-granularity mapping. We
> > won't always be using the PMD-level PTL even if we're using the 2M
> > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > case, we need to lock the PTL for the 4K PTE.
>
> I'm not really sure why this would be required.
> Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> with less contention than using the more coarse mm lock.

I should be using the PMD level lock for 4K PTEs, yeah. I'll work this
into the next version of the series. Thanks both.

>
> --
> Mike Kravetz
Muchun Song June 29, 2022, 6:09 a.m. UTC | #4
On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> On 06/24/22 17:36, James Houghton wrote:
> > This is needed to handle PTL locking with high-granularity mapping. We
> > won't always be using the PMD-level PTL even if we're using the 2M
> > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > case, we need to lock the PTL for the 4K PTE.
> 
> I'm not really sure why this would be required.
> Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> with less contention than using the more coarse mm lock.  
>

Your words make me think of another question unrelated to this patch.
We __know__ that arm64 supports contiguous PTE HugeTLB. huge_pte_lockptr()
does not consider this case, so those HugeTLB pages contend on the mm
lock. It seems we should optimize this case. Something like:

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0d790fa3f297..68a1e071bfc0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
 {
-       if (huge_page_size(h) == PMD_SIZE)
+       if (huge_page_size(h) <= PMD_SIZE)
                return pmd_lockptr(mm, (pmd_t *) pte);
        VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
        return &mm->page_table_lock;

I did not check if elsewhere needs to be changed as well. Just a primary
thought.

Thanks.
 
> -- 
> Mike Kravetz
>
Mike Kravetz June 29, 2022, 9:03 p.m. UTC | #5
On 06/29/22 14:09, Muchun Song wrote:
> On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > On 06/24/22 17:36, James Houghton wrote:
> > > This is needed to handle PTL locking with high-granularity mapping. We
> > > won't always be using the PMD-level PTL even if we're using the 2M
> > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > case, we need to lock the PTL for the 4K PTE.
> > 
> > I'm not really sure why this would be required.
> > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > with less contention than using the more coarse mm lock.  
> >
> 
> Your words make me thing of another question unrelated to this patch.
> We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> did not consider this case, in this case, those HugeTLB pages are contended
> with mm lock. Seems we should optimize this case. Something like:
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 0d790fa3f297..68a1e071bfc0 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
>  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
>                                            struct mm_struct *mm, pte_t *pte)
>  {
> -       if (huge_page_size(h) == PMD_SIZE)
> +       if (huge_page_size(h) <= PMD_SIZE)
>                 return pmd_lockptr(mm, (pmd_t *) pte);
>         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
>         return &mm->page_table_lock;
> 
> I did not check if elsewhere needs to be changed as well. Just a primary
> thought.

That seems perfectly reasonable to me.

Also unrelated, but using the pmd lock is REQUIRED for pmd sharing.  The
mm lock is process specific and does not synchronize shared access.  I
found that out the hard way. :)
James Houghton June 29, 2022, 9:39 p.m. UTC | #6
On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>
> On 06/29/22 14:09, Muchun Song wrote:
> > On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > > On 06/24/22 17:36, James Houghton wrote:
> > > > This is needed to handle PTL locking with high-granularity mapping. We
> > > > won't always be using the PMD-level PTL even if we're using the 2M
> > > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > > case, we need to lock the PTL for the 4K PTE.
> > >
> > > I'm not really sure why this would be required.
> > > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > > with less contention than using the more coarse mm lock.
> > >
> >
> > Your words make me thing of another question unrelated to this patch.
> > We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> > did not consider this case, in this case, those HugeTLB pages are contended
> > with mm lock. Seems we should optimize this case. Something like:
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 0d790fa3f297..68a1e071bfc0 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> >  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> >                                            struct mm_struct *mm, pte_t *pte)
> >  {
> > -       if (huge_page_size(h) == PMD_SIZE)
> > +       if (huge_page_size(h) <= PMD_SIZE)
> >                 return pmd_lockptr(mm, (pmd_t *) pte);
> >         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> >         return &mm->page_table_lock;
> >
> > I did not check if elsewhere needs to be changed as well. Just a primary
> > thought.

I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
need to separately keep track of the location of the PMD so that we
can use it to get the PMD lock.
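
A rough sketch of that idea (the field and helper names below are illustrative,
not taken from the series): the walker carries the enclosing pmd_t * alongside
the pte_t *, so the PMD split lock can still be used for 4K entries:

	/* Hypothetical high-granularity walker state. */
	struct hugetlb_pte {
		pte_t *ptep;		/* entry being mapped (may be a 4K pte_t) */
		pmd_t *pmdp;		/* enclosing PMD, if we walked below PMD level */
		unsigned int shift;
	};

	static spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm,
					       struct hugetlb_pte *hpte)
	{
		if (hpte->shift == PMD_SHIFT)
			return pmd_lockptr(mm, (pmd_t *)hpte->ptep);
		if (hpte->shift < PMD_SHIFT && hpte->pmdp)
			return pmd_lockptr(mm, hpte->pmdp);
		return &mm->page_table_lock;
	}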

>
> That seems perfectly reasonable to me.
>
> Also unrelated, but using the pmd lock is REQUIRED for pmd sharing.  The
> mm lock is process specific and does not synchronize shared access.  I
> found that out the hard way. :)
>
> --
> Mike Kravetz
Mike Kravetz June 29, 2022, 10:24 p.m. UTC | #7
On 06/29/22 14:39, James Houghton wrote:
> On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> >
> > On 06/29/22 14:09, Muchun Song wrote:
> > > On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > > > On 06/24/22 17:36, James Houghton wrote:
> > > > > This is needed to handle PTL locking with high-granularity mapping. We
> > > > > won't always be using the PMD-level PTL even if we're using the 2M
> > > > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > > > case, we need to lock the PTL for the 4K PTE.
> > > >
> > > > I'm not really sure why this would be required.
> > > > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > > > with less contention than using the more coarse mm lock.
> > > >
> > >
> > > Your words make me thing of another question unrelated to this patch.
> > > We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> > > did not consider this case, in this case, those HugeTLB pages are contended
> > > with mm lock. Seems we should optimize this case. Something like:
> > >
> > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > index 0d790fa3f297..68a1e071bfc0 100644
> > > --- a/include/linux/hugetlb.h
> > > +++ b/include/linux/hugetlb.h
> > > @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> > >  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> > >                                            struct mm_struct *mm, pte_t *pte)
> > >  {
> > > -       if (huge_page_size(h) == PMD_SIZE)
> > > +       if (huge_page_size(h) <= PMD_SIZE)
> > >                 return pmd_lockptr(mm, (pmd_t *) pte);
> > >         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> > >         return &mm->page_table_lock;
> > >
> > > I did not check if elsewhere needs to be changed as well. Just a primary
> > > thought.
> 
> I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
> then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
> things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
> need to separately keep track of the location of the PMD so that we
> can use it to get the PMD lock.

I assume Muchun was talking about changing this in current code (before
your changes) where huge_page_size(h) can not be PAGE_SIZE.
Muchun Song June 30, 2022, 9:35 a.m. UTC | #8
On Wed, Jun 29, 2022 at 03:24:45PM -0700, Mike Kravetz wrote:
> On 06/29/22 14:39, James Houghton wrote:
> > On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > >
> > > On 06/29/22 14:09, Muchun Song wrote:
> > > > On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > > > > On 06/24/22 17:36, James Houghton wrote:
> > > > > > This is needed to handle PTL locking with high-granularity mapping. We
> > > > > > won't always be using the PMD-level PTL even if we're using the 2M
> > > > > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > > > > case, we need to lock the PTL for the 4K PTE.
> > > > >
> > > > > I'm not really sure why this would be required.
> > > > > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > > > > with less contention than using the more coarse mm lock.
> > > > >
> > > >
> > > > Your words make me thing of another question unrelated to this patch.
> > > > We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> > > > did not consider this case, in this case, those HugeTLB pages are contended
> > > > with mm lock. Seems we should optimize this case. Something like:
> > > >
> > > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > > index 0d790fa3f297..68a1e071bfc0 100644
> > > > --- a/include/linux/hugetlb.h
> > > > +++ b/include/linux/hugetlb.h
> > > > @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> > > >  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> > > >                                            struct mm_struct *mm, pte_t *pte)
> > > >  {
> > > > -       if (huge_page_size(h) == PMD_SIZE)
> > > > +       if (huge_page_size(h) <= PMD_SIZE)
> > > >                 return pmd_lockptr(mm, (pmd_t *) pte);
> > > >         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> > > >         return &mm->page_table_lock;
> > > >
> > > > I did not check if elsewhere needs to be changed as well. Just a primary
> > > > thought.
> > 
> > I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
> > then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
> > things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
> > need to separately keep track of the location of the PMD so that we
> > can use it to get the PMD lock.
> 
> I assume Muchun was talking about changing this in current code (before
> your changes) where huge_page_size(h) can not be PAGE_SIZE.
>

Yes, that's what I meant.

Thanks.
James Houghton June 30, 2022, 4:23 p.m. UTC | #9
On Thu, Jun 30, 2022 at 2:35 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Wed, Jun 29, 2022 at 03:24:45PM -0700, Mike Kravetz wrote:
> > On 06/29/22 14:39, James Houghton wrote:
> > > On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > >
> > > > On 06/29/22 14:09, Muchun Song wrote:
> > > > > On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > > > > > On 06/24/22 17:36, James Houghton wrote:
> > > > > > > This is needed to handle PTL locking with high-granularity mapping. We
> > > > > > > won't always be using the PMD-level PTL even if we're using the 2M
> > > > > > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > > > > > case, we need to lock the PTL for the 4K PTE.
> > > > > >
> > > > > > I'm not really sure why this would be required.
> > > > > > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > > > > > with less contention than using the more coarse mm lock.
> > > > > >
> > > > >
> > > > > Your words make me thing of another question unrelated to this patch.
> > > > > We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> > > > > did not consider this case, in this case, those HugeTLB pages are contended
> > > > > with mm lock. Seems we should optimize this case. Something like:
> > > > >
> > > > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > > > index 0d790fa3f297..68a1e071bfc0 100644
> > > > > --- a/include/linux/hugetlb.h
> > > > > +++ b/include/linux/hugetlb.h
> > > > > @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> > > > >  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> > > > >                                            struct mm_struct *mm, pte_t *pte)
> > > > >  {
> > > > > -       if (huge_page_size(h) == PMD_SIZE)
> > > > > +       if (huge_page_size(h) <= PMD_SIZE)
> > > > >                 return pmd_lockptr(mm, (pmd_t *) pte);
> > > > >         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> > > > >         return &mm->page_table_lock;
> > > > >
> > > > > I did not check if elsewhere needs to be changed as well. Just a primary
> > > > > thought.
> > >
> > > I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
> > > then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
> > > things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
> > > need to separately keep track of the location of the PMD so that we
> > > can use it to get the PMD lock.
> >
> > I assume Muchun was talking about changing this in current code (before
> > your changes) where huge_page_size(h) can not be PAGE_SIZE.
> >
>
> Yes, that's what I meant.

Right -- but I think my point still stands. If `huge_page_size(h)` is
CONT_PTE_SIZE, then the `pte_t *` passed to `huge_pte_lockptr` will
*actually* point to a `pte_t` and not a `pmd_t` (I'm pretty sure the
distinction is important). So it seems like we need to separately keep
track of the real pmd_t that is being used in the CONT_PTE_SIZE case
(and therefore, when considering HGM, the PAGE_SIZE case).

However, we *can* make this optimization for CONT_PMD_SIZE (maybe this
is what you originally meant, Muchun?), so instead of
`huge_page_size(h) == PMD_SIZE`, we could do `huge_page_size(h) >=
PMD_SIZE && huge_page_size(h) < PUD_SIZE`.
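
In the pre-HGM code that would amount to something like (sketch only, untested):

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
						   struct mm_struct *mm, pte_t *pte)
	{
		/*
		 * PMD_SIZE and CONT_PMD sizes hand us a pmd_t * disguised as
		 * a pte_t *, so the PMD split lock is usable.  CONT_PTE sizes
		 * still fall back to the mm lock, since pte points at a pte_t.
		 */
		if (huge_page_size(h) >= PMD_SIZE && huge_page_size(h) < PUD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}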

>
> Thanks.
Mike Kravetz June 30, 2022, 5:40 p.m. UTC | #10
On 06/30/22 09:23, James Houghton wrote:
> On Thu, Jun 30, 2022 at 2:35 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > On Wed, Jun 29, 2022 at 03:24:45PM -0700, Mike Kravetz wrote:
> > > On 06/29/22 14:39, James Houghton wrote:
> > > > On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
> > > > >
> > > > > On 06/29/22 14:09, Muchun Song wrote:
> > > > > > On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
> > > > > > > On 06/24/22 17:36, James Houghton wrote:
> > > > > > > > This is needed to handle PTL locking with high-granularity mapping. We
> > > > > > > > won't always be using the PMD-level PTL even if we're using the 2M
> > > > > > > > hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
> > > > > > > > case, we need to lock the PTL for the 4K PTE.
> > > > > > >
> > > > > > > I'm not really sure why this would be required.
> > > > > > > Why not use the PMD level lock for 4K PTEs?  Seems that would scale better
> > > > > > > with less contention than using the more coarse mm lock.
> > > > > > >
> > > > > >
> > > > > > Your words make me thing of another question unrelated to this patch.
> > > > > > We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
> > > > > > did not consider this case, in this case, those HugeTLB pages are contended
> > > > > > with mm lock. Seems we should optimize this case. Something like:
> > > > > >
> > > > > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > > > > > index 0d790fa3f297..68a1e071bfc0 100644
> > > > > > --- a/include/linux/hugetlb.h
> > > > > > +++ b/include/linux/hugetlb.h
> > > > > > @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
> > > > > >  static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> > > > > >                                            struct mm_struct *mm, pte_t *pte)
> > > > > >  {
> > > > > > -       if (huge_page_size(h) == PMD_SIZE)
> > > > > > +       if (huge_page_size(h) <= PMD_SIZE)
> > > > > >                 return pmd_lockptr(mm, (pmd_t *) pte);
> > > > > >         VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> > > > > >         return &mm->page_table_lock;
> > > > > >
> > > > > > I did not check if elsewhere needs to be changed as well. Just a primary
> > > > > > thought.
> > > >
> > > > I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
> > > > then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
> > > > things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
> > > > need to separately keep track of the location of the PMD so that we
> > > > can use it to get the PMD lock.
> > >
> > > I assume Muchun was talking about changing this in current code (before
> > > your changes) where huge_page_size(h) can not be PAGE_SIZE.
> > >
> >
> > Yes, that's what I meant.
> 
> Right -- but I think my point still stands. If `huge_page_size(h)` is
> CONT_PTE_SIZE, then the `pte_t *` passed to `huge_pte_lockptr` will
> *actually* point to a `pte_t` and not a `pmd_t` (I'm pretty sure the
> distinction is important). So it seems like we need to separately keep
> track of the real pmd_t that is being used in the CONT_PTE_SIZE case
> (and therefore, when considering HGM, the PAGE_SIZE case).

Ah yes, that is correct.  We would be passing in a pte not pmd in this
case.

> 
> However, we *can* make this optimization for CONT_PMD_SIZE (maybe this
> is what you originally meant, Muchun?), so instead of
> `huge_page_size(h) == PMD_SIZE`, we could do `huge_page_size(h) >=
> PMD_SIZE && huge_page_size(h) < PUD_SIZE`.
> 

Another 'optimization' may exist in hugetlb address range scanning code.
We currently have something like:

	for (addr = start; addr < end; addr += huge_page_size(h)) {
		pte = huge_pte_offset(mm, addr, huge_page_size(h));
		ptl = huge_pte_lock(h, mm, pte);
		...
		...
		spin_unlock(ptl);
	}

Seems like ptl will be the same for all entries on the same pmd page.
We 'may' be able to go from 512 lock/unlock cycles to 1.
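
One way that batching could look (sketch only; it assumes the PTL really is the
per-PMD split lock, so consecutive entries in the same PMD page share it):

	spinlock_t *ptl = NULL;

	for (addr = start; addr < end; addr += huge_page_size(h)) {
		pte_t *pte = huge_pte_offset(mm, addr, huge_page_size(h));
		spinlock_t *new_ptl;

		if (!pte)
			continue;
		new_ptl = huge_pte_lockptr(huge_page_shift(h), mm, pte);
		if (new_ptl != ptl) {
			/* Crossed into a new PMD page: swap locks. */
			if (ptl)
				spin_unlock(ptl);
			ptl = new_ptl;
			spin_lock(ptl);
		}
		/* ... operate on the entry ... */
	}
	if (ptl)
		spin_unlock(ptl);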
Muchun Song July 1, 2022, 3:32 a.m. UTC | #11
> On Jul 1, 2022, at 00:23, James Houghton <jthoughton@google.com> wrote:
> 
> On Thu, Jun 30, 2022 at 2:35 AM Muchun Song <songmuchun@bytedance.com> wrote:
>> 
>> On Wed, Jun 29, 2022 at 03:24:45PM -0700, Mike Kravetz wrote:
>>> On 06/29/22 14:39, James Houghton wrote:
>>>> On Wed, Jun 29, 2022 at 2:04 PM Mike Kravetz <mike.kravetz@oracle.com> wrote:
>>>>> 
>>>>> On 06/29/22 14:09, Muchun Song wrote:
>>>>>> On Mon, Jun 27, 2022 at 01:51:53PM -0700, Mike Kravetz wrote:
>>>>>>> On 06/24/22 17:36, James Houghton wrote:
>>>>>>>> This is needed to handle PTL locking with high-granularity mapping. We
>>>>>>>> won't always be using the PMD-level PTL even if we're using the 2M
>>>>>>>> hugepage hstate. It's possible that we're dealing with 4K PTEs, in which
>>>>>>>> case, we need to lock the PTL for the 4K PTE.
>>>>>>> 
>>>>>>> I'm not really sure why this would be required.
>>>>>>> Why not use the PMD level lock for 4K PTEs? Seems that would scale better
>>>>>>> with less contention than using the more coarse mm lock.
>>>>>>> 
>>>>>> 
>>>>>> Your words make me thing of another question unrelated to this patch.
>>>>>> We __know__ that arm64 supports continues PTE HugeTLB. huge_pte_lockptr()
>>>>>> did not consider this case, in this case, those HugeTLB pages are contended
>>>>>> with mm lock. Seems we should optimize this case. Something like:
>>>>>> 
>>>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>>>> index 0d790fa3f297..68a1e071bfc0 100644
>>>>>> --- a/include/linux/hugetlb.h
>>>>>> +++ b/include/linux/hugetlb.h
>>>>>> @@ -893,7 +893,7 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
>>>>>> static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
>>>>>> struct mm_struct *mm, pte_t *pte)
>>>>>> {
>>>>>> - if (huge_page_size(h) == PMD_SIZE)
>>>>>> + if (huge_page_size(h) <= PMD_SIZE)
>>>>>> return pmd_lockptr(mm, (pmd_t *) pte);
>>>>>> VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
>>>>>> return &mm->page_table_lock;
>>>>>> 
>>>>>> I did not check if elsewhere needs to be changed as well. Just a primary
>>>>>> thought.
>>>> 
>>>> I'm not sure if this works. If hugetlb_pte_size(hpte) is PAGE_SIZE,
>>>> then `hpte.ptep` will be a pte_t, not a pmd_t -- I assume that breaks
>>>> things. So I think, when doing a HugeTLB PT walk down to PAGE_SIZE, we
>>>> need to separately keep track of the location of the PMD so that we
>>>> can use it to get the PMD lock.
>>> 
>>> I assume Muchun was talking about changing this in current code (before
>>> your changes) where huge_page_size(h) can not be PAGE_SIZE.
>>> 
>> 
>> Yes, that's what I meant.
> 
> Right -- but I think my point still stands. If `huge_page_size(h)` is
> CONT_PTE_SIZE, then the `pte_t *` passed to `huge_pte_lockptr` will
> *actually* point to a `pte_t` and not a `pmd_t` (I'm pretty sure the

Right. It is a pte in this case.

> distinction is important). So it seems like we need to separately keep
> track of the real pmd_t that is being used in the CONT_PTE_SIZE case

If we want to find pmd_t from pte_t, I think we can introduce a new field
in struct page just like the thread [1] does.

[1] https://lore.kernel.org/lkml/20211110105428.32458-7-zhengqi.arch@bytedance.com/

> (and therefore, when considering HGM, the PAGE_SIZE case).
> 
> However, we *can* make this optimization for CONT_PMD_SIZE (maybe this
> is what you originally meant, Muchun?), so instead of
> `huge_page_size(h) == PMD_SIZE`, we could do `huge_page_size(h) >=
> PMD_SIZE && huge_page_size(h) < PUD_SIZE`.

Right. It is a good start to optimize the CONT_PMD_SIZE case.

Thanks.

> 
>> 
>> Thanks.

Patch

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index e6166b71d36d..663d591a8f08 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -261,7 +261,8 @@  int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 
 		psize = hstate_get_psize(h);
 #ifdef CONFIG_DEBUG_VM
-		assert_spin_locked(huge_pte_lockptr(h, vma->vm_mm, ptep));
+		assert_spin_locked(huge_pte_lockptr(huge_page_shift(h),
+						    vma->vm_mm, ptep));
 #endif
 
 #else
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 498a4ae3d462..5fe1db46d8c9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -868,12 +868,11 @@  static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return modified_mask;
 }
 
-static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+static inline spinlock_t *huge_pte_lockptr(unsigned int shift,
 					   struct mm_struct *mm, pte_t *pte)
 {
-	if (huge_page_size(h) == PMD_SIZE)
+	if (shift == PMD_SHIFT)
 		return pmd_lockptr(mm, (pmd_t *) pte);
-	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
 	return &mm->page_table_lock;
 }
 
@@ -1076,7 +1075,7 @@  static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return 0;
 }
 
-static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
+static inline spinlock_t *huge_pte_lockptr(unsigned int shift,
 					   struct mm_struct *mm, pte_t *pte)
 {
 	return &mm->page_table_lock;
@@ -1116,7 +1115,17 @@  static inline spinlock_t *huge_pte_lock(struct hstate *h,
 {
 	spinlock_t *ptl;
 
-	ptl = huge_pte_lockptr(h, mm, pte);
+	ptl = huge_pte_lockptr(huge_page_shift(h), mm, pte);
+	spin_lock(ptl);
+	return ptl;
+}
+
+static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
+					      struct mm_struct *mm, pte_t *pte)
+{
+	spinlock_t *ptl;
+
+	ptl = huge_pte_lockptr(shift, mm, pte);
 	spin_lock(ptl);
 	return ptl;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0eec34edf3b2..d6d0d4c03def 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4817,7 +4817,7 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			continue;
 
 		dst_ptl = huge_pte_lock(h, dst, dst_pte);
-		src_ptl = huge_pte_lockptr(h, src, src_pte);
+		src_ptl = huge_pte_lockptr(huge_page_shift(h), src, src_pte);
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 		entry = huge_ptep_get(src_pte);
 		dst_entry = huge_ptep_get(dst_pte);
@@ -4894,7 +4894,8 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
 				/* Install the new huge page if src pte stable */
 				dst_ptl = huge_pte_lock(h, dst, dst_pte);
-				src_ptl = huge_pte_lockptr(h, src, src_pte);
+				src_ptl = huge_pte_lockptr(huge_page_shift(h),
+							   src, src_pte);
 				spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 				entry = huge_ptep_get(src_pte);
 				if (!pte_same(src_pte_old, entry)) {
@@ -4948,7 +4949,7 @@  static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 	pte_t pte;
 
 	dst_ptl = huge_pte_lock(h, mm, dst_pte);
-	src_ptl = huge_pte_lockptr(h, mm, src_pte);
+	src_ptl = huge_pte_lockptr(huge_page_shift(h), mm, src_pte);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst ptlocks
@@ -6024,7 +6025,7 @@  int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 		page_in_pagecache = true;
 	}
 
-	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
+	ptl = huge_pte_lockptr(huge_page_shift(h), dst_mm, dst_pte);
 	spin_lock(ptl);
 
 	/*
diff --git a/mm/migrate.c b/mm/migrate.c
index e51588e95f57..a8a960992373 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -318,7 +318,8 @@  void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 void migration_entry_wait_huge(struct vm_area_struct *vma,
 		struct mm_struct *mm, pte_t *pte)
 {
-	spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+	spinlock_t *ptl = huge_pte_lockptr(huge_page_shift(hstate_vma(vma)),
+					   mm, pte);
 	__migration_entry_wait(mm, pte, ptl);
 }
 
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..8921dd4e41b1 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -174,7 +174,8 @@  bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 		if (!pvmw->pte)
 			return false;
 
-		pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
+		pvmw->ptl = huge_pte_lockptr(huge_page_shift(hstate),
+					     mm, pvmw->pte);
 		spin_lock(pvmw->ptl);
 		if (!check_pte(pvmw))
 			return not_found(pvmw);