
[8/8] hugetlb: use new vma_lock for pmd sharing synchronization

Message ID 20220824175757.20590-9-mike.kravetz@oracle.com (mailing list archive)
State New
Series hugetlb: Use new vma mutex for huge pmd sharing synchronization

Commit Message

Mike Kravetz Aug. 24, 2022, 5:57 p.m. UTC
The new hugetlb vma lock (rw semaphore) is used to address this race:

Faulting thread                                 Unsharing thread
...                                                  ...
ptep = huge_pte_offset()
      or
ptep = huge_pte_alloc()
...
                                                i_mmap_lock_write
                                                lock page table
ptep invalid   <------------------------        huge_pmd_unshare()
Could be in a previously                        unlock_page_table
sharing process or worse                        i_mmap_unlock_write
...

The vma_lock is used as follows:
- During fault processing, the lock is acquired in read mode before
  taking the page table lock and doing the allocation (huge_pte_alloc).
  The lock is held until code is finished with the page table entry (ptep).
- The lock must be held in write mode whenever huge_pmd_unshare is
  called.

Lock ordering issues come into play when unmapping a page from all
vmas mapping the page.  The i_mmap_rwsem must be held to search for the
vmas, and the vma lock must be held before calling unmap which will
call huge_pmd_unshare.  This is done today in:
- try_to_migrate_one and try_to_unmap_one for page migration and memory
  error handling.  In these routines we 'try' to obtain the vma lock and
  fail to unmap if unsuccessful.  Calling routines already deal with the
  failure of unmapping.
- hugetlb_vmdelete_list for truncation and hole punch.  This routine
  also tries to acquire the vma lock.  If it fails, it skips the
  unmapping.  However, we can not have file truncation or hole punch
  fail because of contention.  After hugetlb_vmdelete_list, truncation
  and hole punch call remove_inode_hugepages.  remove_inode_hugepages
  checks for mapped pages and calls hugetlb_unmap_file_folio to unmap them.
  hugetlb_unmap_file_folio is designed to drop locks and reacquire them in
  the correct order to guarantee unmap success.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
 mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
 mm/memory.c          |   2 +
 mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
 mm/userfaultfd.c     |   9 +++-
 5 files changed, 214 insertions(+), 45 deletions(-)
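
For reference, here is a minimal sketch of the locking protocol described in the
commit message.  It assumes the hugetlb_vma_lock_read/write helpers introduced
earlier in this series; the sketch functions themselves (fault_path_sketch,
unshare_path_sketch) are illustrative only and are not code from the patch.

/* Fault path: hold the vma lock in read mode for as long as ptep is used. */
static void fault_path_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
                              unsigned long addr, struct hstate *h)
{
        pte_t *ptep;

        hugetlb_vma_lock_read(vma);
        ptep = huge_pte_alloc(mm, vma, addr, huge_page_size(h));
        if (ptep) {
                /* ... fault handling that dereferences ptep ... */
        }
        hugetlb_vma_unlock_read(vma);
}

/* Unshare path: the vma lock must be held in write mode around the unshare. */
static void unshare_path_sketch(struct vm_area_struct *vma)
{
        struct address_space *mapping = vma->vm_file->f_mapping;

        hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(mapping);
        /* lock the page table, call huge_pmd_unshare(), unlock the page table */
        i_mmap_unlock_write(mapping);
        hugetlb_vma_unlock_write(vma);
}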

Comments

Miaohe Lin Aug. 30, 2022, 2:02 a.m. UTC | #1
On 2022/8/25 1:57, Mike Kravetz wrote:
> The new hugetlb vma lock (rw semaphore) is used to address this race:
> 
> Faulting thread                                 Unsharing thread
> ...                                                  ...
> ptep = huge_pte_offset()
>       or
> ptep = huge_pte_alloc()
> ...
>                                                 i_mmap_lock_write
>                                                 lock page table
> ptep invalid   <------------------------        huge_pmd_unshare()
> Could be in a previously                        unlock_page_table
> sharing process or worse                        i_mmap_unlock_write
> ...
> 
> The vma_lock is used as follows:
> - During fault processing. the lock is acquired in read mode before
>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
>   held until code is finished with the page table entry (ptep).
> - The lock must be held in write mode whenever huge_pmd_unshare is
>   called.
> 
> Lock ordering issues come into play when unmapping a page from all
> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
> vmas, and the vma lock must be held before calling unmap which will
> call huge_pmd_unshare.  This is done today in:
> - try_to_migrate_one and try_to_unmap_ for page migration and memory
>   error handling.  In these routines we 'try' to obtain the vma lock and
>   fail to unmap if unsuccessful.  Calling routines already deal with the
>   failure of unmapping.
> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
>   also tries to acquire the vma lock.  If it fails, it skips the
>   unmapping.  However, we can not have file truncation or hole punch
>   fail because of contention.  After hugetlb_vmdelete_list, truncation
>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
>   correct order to guarantee unmap success.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
>  mm/memory.c          |   2 +
>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
>  mm/userfaultfd.c     |   9 +++-
>  5 files changed, 214 insertions(+), 45 deletions(-)
> 
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index b93d131b0cb5..52d9b390389b 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  					struct folio *folio, pgoff_t index)
>  {
>  	struct rb_root_cached *root = &mapping->i_mmap;
> +	unsigned long skipped_vm_start;
> +	struct mm_struct *skipped_mm;
>  	struct page *page = &folio->page;
>  	struct vm_area_struct *vma;
>  	unsigned long v_start;
> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  	end = ((index + 1) * pages_per_huge_page(h));
>  
>  	i_mmap_lock_write(mapping);
> +retry:
> +	skipped_mm = NULL;
>  
>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>  		v_start = vma_offset_start(vma, start);
> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>  			continue;
>  
> +		if (!hugetlb_vma_trylock_write(vma)) {
> +			/*
> +			 * If we can not get vma lock, we need to drop
> +			 * immap_sema and take locks in order.
> +			 */
> +			skipped_vm_start = vma->vm_start;
> +			skipped_mm = vma->vm_mm;
> +			/* grab mm-struct as we will be dropping i_mmap_sema */
> +			mmgrab(skipped_mm);
> +			break;
> +		}
> +
>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>  				NULL, ZAP_FLAG_DROP_MARKER);
> +		hugetlb_vma_unlock_write(vma);
>  	}
>  
>  	i_mmap_unlock_write(mapping);
> +
> +	if (skipped_mm) {
> +		mmap_read_lock(skipped_mm);
> +		vma = find_vma(skipped_mm, skipped_vm_start);
> +		if (!vma || !is_vm_hugetlb_page(vma) ||
> +					vma->vm_file->f_mapping != mapping ||
> +					vma->vm_start != skipped_vm_start) {

i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.

> +			mmap_read_unlock(skipped_mm);
> +			mmdrop(skipped_mm);
> +			goto retry;
> +		}
> +

IMHO, the above check is not enough. Think about the below scenario:

CPU 1					CPU 2
hugetlb_unmap_file_folio		exit_mmap
  mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
  check vma is wanted.
  					  unmap_vmas
  mmap_read_unlock(skipped_mm);		  mmap_read_unlock
  					  mmap_write_lock(mm);
  					  free_pgtables
  					  remove_vma
					    hugetlb_vma_lock_free
  vma, hugetlb_vma_lock is still *used after free*
  					  mmap_write_unlock(mm);
So we should check mm->mm_users == 0 to fix the above issue. Or am I missing something?

> +		hugetlb_vma_lock_write(vma);
> +		i_mmap_lock_write(mapping);
> +		mmap_read_unlock(skipped_mm);
> +		mmdrop(skipped_mm);
> +
> +		v_start = vma_offset_start(vma, start);
> +		v_end = vma_offset_end(vma, end);
> +		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> +				NULL, ZAP_FLAG_DROP_MARKER);
> +		hugetlb_vma_unlock_write(vma);
> +
> +		goto retry;

Should there be a cond_resched() here in case this function takes a really long time?

> +	}
>  }
>  
>  static void
> @@ -474,11 +516,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
>  		unsigned long v_start;
>  		unsigned long v_end;
>  
> +		if (!hugetlb_vma_trylock_write(vma))
> +			continue;
> +
>  		v_start = vma_offset_start(vma, start);
>  		v_end = vma_offset_end(vma, end);
>  
>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>  				     NULL, zap_flags);
> +		hugetlb_vma_unlock_write(vma);
>  	}

unmap_hugepage_range is not called under hugetlb_vma_lock in unmap_ref_private since it's a private vma?
Add a comment to avoid future confusion?

>  }
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6fb0bff2c7ee..5912c2b97ddf 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4801,6 +4801,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>  		mmu_notifier_invalidate_range_start(&range);
>  		mmap_assert_write_locked(src);
>  		raw_write_seqcount_begin(&src->write_protect_seq);
> +	} else {
> +		/*
> +		 * For shared mappings the vma lock must be held before
> +		 * calling huge_pte_offset in the src vma. Otherwise, the

s/huge_pte_offset/huge_pte_alloc/, i.e. huge_pte_alloc could return a shared pmd, not huge_pte_offset, which
might lead to confusion. But this is really trivial...

Apart from the above comments, this patch looks good to me.

Thanks,
Miaohe Lin
Mike Kravetz Sept. 2, 2022, 11:07 p.m. UTC | #2
On 08/30/22 10:02, Miaohe Lin wrote:
> On 2022/8/25 1:57, Mike Kravetz wrote:
> > The new hugetlb vma lock (rw semaphore) is used to address this race:
> > 
> > Faulting thread                                 Unsharing thread
> > ...                                                  ...
> > ptep = huge_pte_offset()
> >       or
> > ptep = huge_pte_alloc()
> > ...
> >                                                 i_mmap_lock_write
> >                                                 lock page table
> > ptep invalid   <------------------------        huge_pmd_unshare()
> > Could be in a previously                        unlock_page_table
> > sharing process or worse                        i_mmap_unlock_write
> > ...
> > 
> > The vma_lock is used as follows:
> > - During fault processing. the lock is acquired in read mode before
> >   doing a page table lock and allocation (huge_pte_alloc).  The lock is
> >   held until code is finished with the page table entry (ptep).
> > - The lock must be held in write mode whenever huge_pmd_unshare is
> >   called.
> > 
> > Lock ordering issues come into play when unmapping a page from all
> > vmas mapping the page.  The i_mmap_rwsem must be held to search for the
> > vmas, and the vma lock must be held before calling unmap which will
> > call huge_pmd_unshare.  This is done today in:
> > - try_to_migrate_one and try_to_unmap_ for page migration and memory
> >   error handling.  In these routines we 'try' to obtain the vma lock and
> >   fail to unmap if unsuccessful.  Calling routines already deal with the
> >   failure of unmapping.
> > - hugetlb_vmdelete_list for truncation and hole punch.  This routine
> >   also tries to acquire the vma lock.  If it fails, it skips the
> >   unmapping.  However, we can not have file truncation or hole punch
> >   fail because of contention.  After hugetlb_vmdelete_list, truncation
> >   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
> >   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
> >   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
> >   correct order to guarantee unmap success.
> > 
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> > ---
> >  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
> >  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
> >  mm/memory.c          |   2 +
> >  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
> >  mm/userfaultfd.c     |   9 +++-
> >  5 files changed, 214 insertions(+), 45 deletions(-)
> > 
> > diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> > index b93d131b0cb5..52d9b390389b 100644
> > --- a/fs/hugetlbfs/inode.c
> > +++ b/fs/hugetlbfs/inode.c
> > @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  					struct folio *folio, pgoff_t index)
> >  {
> >  	struct rb_root_cached *root = &mapping->i_mmap;
> > +	unsigned long skipped_vm_start;
> > +	struct mm_struct *skipped_mm;
> >  	struct page *page = &folio->page;
> >  	struct vm_area_struct *vma;
> >  	unsigned long v_start;
> > @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  	end = ((index + 1) * pages_per_huge_page(h));
> >  
> >  	i_mmap_lock_write(mapping);
> > +retry:
> > +	skipped_mm = NULL;
> >  
> >  	vma_interval_tree_foreach(vma, root, start, end - 1) {
> >  		v_start = vma_offset_start(vma, start);
> > @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
> >  			continue;
> >  
> > +		if (!hugetlb_vma_trylock_write(vma)) {
> > +			/*
> > +			 * If we can not get vma lock, we need to drop
> > +			 * immap_sema and take locks in order.
> > +			 */
> > +			skipped_vm_start = vma->vm_start;
> > +			skipped_mm = vma->vm_mm;
> > +			/* grab mm-struct as we will be dropping i_mmap_sema */
> > +			mmgrab(skipped_mm);
> > +			break;
> > +		}
> > +
> >  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> >  				NULL, ZAP_FLAG_DROP_MARKER);
> > +		hugetlb_vma_unlock_write(vma);
> >  	}
> >  
> >  	i_mmap_unlock_write(mapping);
> > +
> > +	if (skipped_mm) {
> > +		mmap_read_lock(skipped_mm);
> > +		vma = find_vma(skipped_mm, skipped_vm_start);
> > +		if (!vma || !is_vm_hugetlb_page(vma) ||
> > +					vma->vm_file->f_mapping != mapping ||
> > +					vma->vm_start != skipped_vm_start) {
> 
> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
> 

Yes, that is missing.  I will add it here.

> > +			mmap_read_unlock(skipped_mm);
> > +			mmdrop(skipped_mm);
> > +			goto retry;
> > +		}
> > +
> 
> IMHO, above check is not enough. Think about the below scene:
> 
> CPU 1					CPU 2
> hugetlb_unmap_file_folio		exit_mmap
>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
>   check vma is wanted.
>   					  unmap_vmas
>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
>   					  mmap_write_lock(mm);
>   					  free_pgtables
>   					  remove_vma
> 					    hugetlb_vma_lock_free
>   vma, hugetlb_vma_lock is still *used after free*
>   					  mmap_write_unlock(mm);
> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?

In the retry case, we are OK because we go back and look up the vma again.  Right?

After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
Before that, we do the following:

> > +		hugetlb_vma_lock_write(vma);
> > +		i_mmap_lock_write(mapping);

IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we
can.

> > +		mmap_read_unlock(skipped_mm);
> > +		mmdrop(skipped_mm);

We continue to hold i_mmap_lock_write as we goto retry.

I could be missing something as well.  This was how I intended to keep
vma valid while dropping and acquiring locks.

> > +
> > +		v_start = vma_offset_start(vma, start);
> > +		v_end = vma_offset_end(vma, end);
> > +		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> > +				NULL, ZAP_FLAG_DROP_MARKER);
> > +		hugetlb_vma_unlock_write(vma);
> > +
> > +		goto retry;
> 
> Should here be one cond_resched() here in case this function will take a really long time?
> 

I think we will at most retry once.

> > +	}
> >  }
> >  
> >  static void
> > @@ -474,11 +516,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
> >  		unsigned long v_start;
> >  		unsigned long v_end;
> >  
> > +		if (!hugetlb_vma_trylock_write(vma))
> > +			continue;
> > +
> >  		v_start = vma_offset_start(vma, start);
> >  		v_end = vma_offset_end(vma, end);
> >  
> >  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> >  				     NULL, zap_flags);
> > +		hugetlb_vma_unlock_write(vma);
> >  	}
> 
> unmap_hugepage_range is not called under hugetlb_vma_lock in unmap_ref_private since it's private vma?
> Add a comment to avoid future confusion?
> 
> >  }

Sure, will add a comment before hugetlb_vma_lock.

> >  
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 6fb0bff2c7ee..5912c2b97ddf 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -4801,6 +4801,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
> >  		mmu_notifier_invalidate_range_start(&range);
> >  		mmap_assert_write_locked(src);
> >  		raw_write_seqcount_begin(&src->write_protect_seq);
> > +	} else {
> > +		/*
> > +		 * For shared mappings the vma lock must be held before
> > +		 * calling huge_pte_offset in the src vma. Otherwise, the
> 
> s/huge_pte_offset/huge_pte_alloc/, i.e. huge_pte_alloc could return shared pmd, not huge_pte_offset which
> might lead to confusion. But this is really trivial...

Actually, it is huge_pte_offset.  While looking up ptes in the source vma, we
do not want to race with other threads in the source process which could
be doing a huge_pmd_unshare.  Otherwise, the returned pte could be invalid.

FYI - Most of this code is now 'dead' because of bcd51a3c679d "Lazy page table
copies in fork()".  We will not copy shared mappings at fork time.
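
[Illustration: a minimal sketch of that locking intent, assuming the vma lock
helpers from this series.  copy_shared_range_sketch and its arguments are
hypothetical and are not the actual copy_hugetlb_page_range() code.]

/* Sketch: walk the shared src vma with the vma lock held in read mode. */
static void copy_shared_range_sketch(struct mm_struct *src,
                                     struct vm_area_struct *src_vma,
                                     unsigned long sz)
{
        unsigned long addr;

        /* Prevent huge_pmd_unshare() in the source process during the walk. */
        hugetlb_vma_lock_read(src_vma);
        for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                pte_t *src_pte = huge_pte_offset(src, addr, sz);

                if (!src_pte)
                        continue;
                /* ... copy or share the entry into the destination ... */
        }
        hugetlb_vma_unlock_read(src_vma);
}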

> 
> Except from above comments, this patch looks good to me.
> 

Thank you! Thank you! Thank you!  For looking at this series and all
your comments.  I hope to send out v2 next week.
Miaohe Lin Sept. 5, 2022, 3:08 a.m. UTC | #3
On 2022/9/3 7:07, Mike Kravetz wrote:
> On 08/30/22 10:02, Miaohe Lin wrote:
>> On 2022/8/25 1:57, Mike Kravetz wrote:
>>> The new hugetlb vma lock (rw semaphore) is used to address this race:
>>>
>>> Faulting thread                                 Unsharing thread
>>> ...                                                  ...
>>> ptep = huge_pte_offset()
>>>       or
>>> ptep = huge_pte_alloc()
>>> ...
>>>                                                 i_mmap_lock_write
>>>                                                 lock page table
>>> ptep invalid   <------------------------        huge_pmd_unshare()
>>> Could be in a previously                        unlock_page_table
>>> sharing process or worse                        i_mmap_unlock_write
>>> ...
>>>
>>> The vma_lock is used as follows:
>>> - During fault processing. the lock is acquired in read mode before
>>>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
>>>   held until code is finished with the page table entry (ptep).
>>> - The lock must be held in write mode whenever huge_pmd_unshare is
>>>   called.
>>>
>>> Lock ordering issues come into play when unmapping a page from all
>>> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
>>> vmas, and the vma lock must be held before calling unmap which will
>>> call huge_pmd_unshare.  This is done today in:
>>> - try_to_migrate_one and try_to_unmap_ for page migration and memory
>>>   error handling.  In these routines we 'try' to obtain the vma lock and
>>>   fail to unmap if unsuccessful.  Calling routines already deal with the
>>>   failure of unmapping.
>>> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
>>>   also tries to acquire the vma lock.  If it fails, it skips the
>>>   unmapping.  However, we can not have file truncation or hole punch
>>>   fail because of contention.  After hugetlb_vmdelete_list, truncation
>>>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
>>>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
>>>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
>>>   correct order to guarantee unmap success.
>>>
>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>> ---
>>>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
>>>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
>>>  mm/memory.c          |   2 +
>>>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
>>>  mm/userfaultfd.c     |   9 +++-
>>>  5 files changed, 214 insertions(+), 45 deletions(-)
>>>
>>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>>> index b93d131b0cb5..52d9b390389b 100644
>>> --- a/fs/hugetlbfs/inode.c
>>> +++ b/fs/hugetlbfs/inode.c
>>> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  					struct folio *folio, pgoff_t index)
>>>  {
>>>  	struct rb_root_cached *root = &mapping->i_mmap;
>>> +	unsigned long skipped_vm_start;
>>> +	struct mm_struct *skipped_mm;
>>>  	struct page *page = &folio->page;
>>>  	struct vm_area_struct *vma;
>>>  	unsigned long v_start;
>>> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  	end = ((index + 1) * pages_per_huge_page(h));
>>>  
>>>  	i_mmap_lock_write(mapping);
>>> +retry:
>>> +	skipped_mm = NULL;
>>>  
>>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>>>  		v_start = vma_offset_start(vma, start);
>>> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>>>  			continue;
>>>  
>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>> +			/*
>>> +			 * If we can not get vma lock, we need to drop
>>> +			 * immap_sema and take locks in order.
>>> +			 */
>>> +			skipped_vm_start = vma->vm_start;
>>> +			skipped_mm = vma->vm_mm;
>>> +			/* grab mm-struct as we will be dropping i_mmap_sema */
>>> +			mmgrab(skipped_mm);
>>> +			break;
>>> +		}
>>> +
>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>>  				NULL, ZAP_FLAG_DROP_MARKER);
>>> +		hugetlb_vma_unlock_write(vma);
>>>  	}
>>>  
>>>  	i_mmap_unlock_write(mapping);
>>> +
>>> +	if (skipped_mm) {
>>> +		mmap_read_lock(skipped_mm);
>>> +		vma = find_vma(skipped_mm, skipped_vm_start);
>>> +		if (!vma || !is_vm_hugetlb_page(vma) ||
>>> +					vma->vm_file->f_mapping != mapping ||
>>> +					vma->vm_start != skipped_vm_start) {
>>
>> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
>>
> 
> Yes, that is missing.  I will add here.
> 
>>> +			mmap_read_unlock(skipped_mm);
>>> +			mmdrop(skipped_mm);
>>> +			goto retry;
>>> +		}
>>> +
>>
>> IMHO, above check is not enough. Think about the below scene:
>>
>> CPU 1					CPU 2
>> hugetlb_unmap_file_folio		exit_mmap
>>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
>>   check vma is wanted.
>>   					  unmap_vmas
>>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
>>   					  mmap_write_lock(mm);
>>   					  free_pgtables
>>   					  remove_vma
>> 					    hugetlb_vma_lock_free
>>   vma, hugetlb_vma_lock is still *used after free*
>>   					  mmap_write_unlock(mm);
>> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?
> 
> In the retry case, we are OK because go back and look up the vma again.  Right?
> 
> After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
> Before that, we do the following:
> 
>>> +		hugetlb_vma_lock_write(vma);
>>> +		i_mmap_lock_write(mapping);
> 
> IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we

I think you're right. free_pgtables() can't complete its work as unlink_file_vma() will be
blocked on the i_mmap_rwsem of the mapping. Sorry for reporting such a nonexistent race.

> can.
> 
>>> +		mmap_read_unlock(skipped_mm);
>>> +		mmdrop(skipped_mm);
> 
> We continue to hold i_mmap_lock_write as we goto retry.
> 
> I could be missing something as well.  This was how I intended to keep
> vma valid while dropping and acquiring locks.

Thanks for your clarifying.

> 
>>> +
>>> +		v_start = vma_offset_start(vma, start);
>>> +		v_end = vma_offset_end(vma, end);
>>> +		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>> +				NULL, ZAP_FLAG_DROP_MARKER);
>>> +		hugetlb_vma_unlock_write(vma);
>>> +
>>> +		goto retry;
>>
>> Should here be one cond_resched() here in case this function will take a really long time?
>>
> 
> I think we will at most retry once.

I see. It should be acceptable.

> 
>>> +	}
>>>  }
>>>  
>>>  static void
>>> @@ -474,11 +516,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
>>>  		unsigned long v_start;
>>>  		unsigned long v_end;
>>>  
>>> +		if (!hugetlb_vma_trylock_write(vma))
>>> +			continue;
>>> +
>>>  		v_start = vma_offset_start(vma, start);
>>>  		v_end = vma_offset_end(vma, end);
>>>  
>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>>  				     NULL, zap_flags);
>>> +		hugetlb_vma_unlock_write(vma);
>>>  	}
>>
>> unmap_hugepage_range is not called under hugetlb_vma_lock in unmap_ref_private since it's private vma?
>> Add a comment to avoid future confusion?
>>
>>>  }
> 
> Sure, will add a comment before hugetlb_vma_lock.
> 
>>>  
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 6fb0bff2c7ee..5912c2b97ddf 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -4801,6 +4801,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
>>>  		mmu_notifier_invalidate_range_start(&range);
>>>  		mmap_assert_write_locked(src);
>>>  		raw_write_seqcount_begin(&src->write_protect_seq);
>>> +	} else {
>>> +		/*
>>> +		 * For shared mappings the vma lock must be held before
>>> +		 * calling huge_pte_offset in the src vma. Otherwise, the
>>
>> s/huge_pte_offset/huge_pte_alloc/, i.e. huge_pte_alloc could return shared pmd, not huge_pte_offset which
>> might lead to confusion. But this is really trivial...
> 
> Actually, it is huge_pte_offset.  While looking up ptes in the source vma, we
> do not want to race with other threads in the source process which could
> be doing a huge_pmd_unshare.  Otherwise, the returned pte could be invalid.
> 
> FYI - Most of this code is now 'dead' because of bcd51a3c679d "Lazy page table
> copies in fork()".  We will not copy shared mappigns at fork time.

Agreed. Should this "dead" code be removed later?

Thanks,
Miaohe Lin


> 
>>
>> Except from above comments, this patch looks good to me.
>>
> 
> Thank you! Thank you! Thank you!  For looking at this series and all
> your comments.  I hope to send out v2 next week.
>
Mike Kravetz Sept. 12, 2022, 11:02 p.m. UTC | #4
On 09/05/22 11:08, Miaohe Lin wrote:
> On 2022/9/3 7:07, Mike Kravetz wrote:
> > On 08/30/22 10:02, Miaohe Lin wrote:
> >> On 2022/8/25 1:57, Mike Kravetz wrote:
> >>> The new hugetlb vma lock (rw semaphore) is used to address this race:
> >>>
> >>> Faulting thread                                 Unsharing thread
> >>> ...                                                  ...
> >>> ptep = huge_pte_offset()
> >>>       or
> >>> ptep = huge_pte_alloc()
> >>> ...
> >>>                                                 i_mmap_lock_write
> >>>                                                 lock page table
> >>> ptep invalid   <------------------------        huge_pmd_unshare()
> >>> Could be in a previously                        unlock_page_table
> >>> sharing process or worse                        i_mmap_unlock_write
> >>> ...
> >>>
> >>> The vma_lock is used as follows:
> >>> - During fault processing. the lock is acquired in read mode before
> >>>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
> >>>   held until code is finished with the page table entry (ptep).
> >>> - The lock must be held in write mode whenever huge_pmd_unshare is
> >>>   called.
> >>>
> >>> Lock ordering issues come into play when unmapping a page from all
> >>> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
> >>> vmas, and the vma lock must be held before calling unmap which will
> >>> call huge_pmd_unshare.  This is done today in:
> >>> - try_to_migrate_one and try_to_unmap_ for page migration and memory
> >>>   error handling.  In these routines we 'try' to obtain the vma lock and
> >>>   fail to unmap if unsuccessful.  Calling routines already deal with the
> >>>   failure of unmapping.
> >>> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
> >>>   also tries to acquire the vma lock.  If it fails, it skips the
> >>>   unmapping.  However, we can not have file truncation or hole punch
> >>>   fail because of contention.  After hugetlb_vmdelete_list, truncation
> >>>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
> >>>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
> >>>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
> >>>   correct order to guarantee unmap success.
> >>>
> >>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> >>> ---
> >>>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
> >>>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
> >>>  mm/memory.c          |   2 +
> >>>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
> >>>  mm/userfaultfd.c     |   9 +++-
> >>>  5 files changed, 214 insertions(+), 45 deletions(-)
> >>>
> >>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> >>> index b93d131b0cb5..52d9b390389b 100644
> >>> --- a/fs/hugetlbfs/inode.c
> >>> +++ b/fs/hugetlbfs/inode.c
> >>> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>  					struct folio *folio, pgoff_t index)
> >>>  {
> >>>  	struct rb_root_cached *root = &mapping->i_mmap;
> >>> +	unsigned long skipped_vm_start;
> >>> +	struct mm_struct *skipped_mm;
> >>>  	struct page *page = &folio->page;
> >>>  	struct vm_area_struct *vma;
> >>>  	unsigned long v_start;
> >>> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>  	end = ((index + 1) * pages_per_huge_page(h));
> >>>  
> >>>  	i_mmap_lock_write(mapping);
> >>> +retry:
> >>> +	skipped_mm = NULL;
> >>>  
> >>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
> >>>  		v_start = vma_offset_start(vma, start);
> >>> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
> >>>  			continue;
> >>>  
> >>> +		if (!hugetlb_vma_trylock_write(vma)) {
> >>> +			/*
> >>> +			 * If we can not get vma lock, we need to drop
> >>> +			 * immap_sema and take locks in order.
> >>> +			 */
> >>> +			skipped_vm_start = vma->vm_start;
> >>> +			skipped_mm = vma->vm_mm;
> >>> +			/* grab mm-struct as we will be dropping i_mmap_sema */
> >>> +			mmgrab(skipped_mm);
> >>> +			break;
> >>> +		}
> >>> +
> >>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> >>>  				NULL, ZAP_FLAG_DROP_MARKER);
> >>> +		hugetlb_vma_unlock_write(vma);
> >>>  	}
> >>>  
> >>>  	i_mmap_unlock_write(mapping);
> >>> +
> >>> +	if (skipped_mm) {
> >>> +		mmap_read_lock(skipped_mm);
> >>> +		vma = find_vma(skipped_mm, skipped_vm_start);
> >>> +		if (!vma || !is_vm_hugetlb_page(vma) ||
> >>> +					vma->vm_file->f_mapping != mapping ||
> >>> +					vma->vm_start != skipped_vm_start) {
> >>
> >> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
> >>
> > 
> > Yes, that is missing.  I will add here.
> > 
> >>> +			mmap_read_unlock(skipped_mm);
> >>> +			mmdrop(skipped_mm);
> >>> +			goto retry;
> >>> +		}
> >>> +
> >>
> >> IMHO, above check is not enough. Think about the below scene:
> >>
> >> CPU 1					CPU 2
> >> hugetlb_unmap_file_folio		exit_mmap
> >>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
> >>   check vma is wanted.
> >>   					  unmap_vmas
> >>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
> >>   					  mmap_write_lock(mm);
> >>   					  free_pgtables
> >>   					  remove_vma
> >> 					    hugetlb_vma_lock_free
> >>   vma, hugetlb_vma_lock is still *used after free*
> >>   					  mmap_write_unlock(mm);
> >> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?
> > 
> > In the retry case, we are OK because go back and look up the vma again.  Right?
> > 
> > After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
> > Before that, we do the following:
> > 
> >>> +		hugetlb_vma_lock_write(vma);
> >>> +		i_mmap_lock_write(mapping);
> > 
> > IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we
> 
> I think you're right. free_pgtables() can't complete its work as unlink_file_vma() will be
> blocked on i_mmap_rwsem of mapping. Sorry for reporting such nonexistent race.
> 
> > can.
> > 
> >>> +		mmap_read_unlock(skipped_mm);
> >>> +		mmdrop(skipped_mm);
> > 
> > We continue to hold i_mmap_lock_write as we goto retry.
> > 
> > I could be missing something as well.  This was how I intended to keep
> > vma valid while dropping and acquiring locks.
> 
> Thanks for your clarifying.
> 

Well, that was all correct 'in theory' but not in practice.  I did not take
into account the inode lock that is taken at the beginning of truncate (or
hole punch).  In other code paths, we take inode lock after mmap_lock.  So,
taking mmap_lock here is not allowed.

I came up with another way to make this work.  As discussed above, we need to
drop the i_mmap lock before acquiring the vma_lock.  However, once we drop
i_mmap, the vma could go away.  My solution is to make the 'vma_lock' be a
ref counted structure that can live on after the vma is freed.  Therefore,
this code can take a reference while under i_mmap then drop i_mmap and wait
on the vma_lock.  Of course, once it acquires the vma_lock it needs to check
and make sure the vma still exists.  It may sound complicated, but I think
it is a bit simpler than the code here.  A new series will be out soon.
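
[Illustration: a minimal sketch of such a reference-counted lock.  The
structure, field, and function names below are hypothetical and need not
match the eventual v2 series.]

struct hugetlb_vma_lock_sketch {
        struct kref refs;               /* lock may outlive the vma */
        struct rw_semaphore rw_sema;
        struct vm_area_struct *vma;     /* cleared once the vma is freed */
};

static void vma_lock_release_sketch(struct kref *kref)
{
        struct hugetlb_vma_lock_sketch *vma_lock =
                container_of(kref, struct hugetlb_vma_lock_sketch, refs);

        kfree(vma_lock);
}

/*
 * Called with i_mmap_rwsem held: take a reference so the lock stays valid
 * after i_mmap_rwsem is dropped, then acquire it and recheck that the vma
 * still exists.
 */
static bool grab_vma_lock_sketch(struct hugetlb_vma_lock_sketch *vma_lock)
{
        kref_get(&vma_lock->refs);
        /* caller drops i_mmap_rwsem here */
        down_write(&vma_lock->rw_sema);
        if (!vma_lock->vma) {
                /* vma went away while we waited for the lock */
                up_write(&vma_lock->rw_sema);
                kref_put(&vma_lock->refs, vma_lock_release_sketch);
                return false;
        }
        return true;
}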
Miaohe Lin Sept. 13, 2022, 2:14 a.m. UTC | #5
On 2022/9/13 7:02, Mike Kravetz wrote:
> On 09/05/22 11:08, Miaohe Lin wrote:
>> On 2022/9/3 7:07, Mike Kravetz wrote:
>>> On 08/30/22 10:02, Miaohe Lin wrote:
>>>> On 2022/8/25 1:57, Mike Kravetz wrote:
>>>>> The new hugetlb vma lock (rw semaphore) is used to address this race:
>>>>>
>>>>> Faulting thread                                 Unsharing thread
>>>>> ...                                                  ...
>>>>> ptep = huge_pte_offset()
>>>>>       or
>>>>> ptep = huge_pte_alloc()
>>>>> ...
>>>>>                                                 i_mmap_lock_write
>>>>>                                                 lock page table
>>>>> ptep invalid   <------------------------        huge_pmd_unshare()
>>>>> Could be in a previously                        unlock_page_table
>>>>> sharing process or worse                        i_mmap_unlock_write
>>>>> ...
>>>>>
>>>>> The vma_lock is used as follows:
>>>>> - During fault processing. the lock is acquired in read mode before
>>>>>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
>>>>>   held until code is finished with the page table entry (ptep).
>>>>> - The lock must be held in write mode whenever huge_pmd_unshare is
>>>>>   called.
>>>>>
>>>>> Lock ordering issues come into play when unmapping a page from all
>>>>> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
>>>>> vmas, and the vma lock must be held before calling unmap which will
>>>>> call huge_pmd_unshare.  This is done today in:
>>>>> - try_to_migrate_one and try_to_unmap_ for page migration and memory
>>>>>   error handling.  In these routines we 'try' to obtain the vma lock and
>>>>>   fail to unmap if unsuccessful.  Calling routines already deal with the
>>>>>   failure of unmapping.
>>>>> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
>>>>>   also tries to acquire the vma lock.  If it fails, it skips the
>>>>>   unmapping.  However, we can not have file truncation or hole punch
>>>>>   fail because of contention.  After hugetlb_vmdelete_list, truncation
>>>>>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
>>>>>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
>>>>>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
>>>>>   correct order to guarantee unmap success.
>>>>>
>>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>> ---
>>>>>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
>>>>>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
>>>>>  mm/memory.c          |   2 +
>>>>>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
>>>>>  mm/userfaultfd.c     |   9 +++-
>>>>>  5 files changed, 214 insertions(+), 45 deletions(-)
>>>>>
>>>>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>>>>> index b93d131b0cb5..52d9b390389b 100644
>>>>> --- a/fs/hugetlbfs/inode.c
>>>>> +++ b/fs/hugetlbfs/inode.c
>>>>> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>  					struct folio *folio, pgoff_t index)
>>>>>  {
>>>>>  	struct rb_root_cached *root = &mapping->i_mmap;
>>>>> +	unsigned long skipped_vm_start;
>>>>> +	struct mm_struct *skipped_mm;
>>>>>  	struct page *page = &folio->page;
>>>>>  	struct vm_area_struct *vma;
>>>>>  	unsigned long v_start;
>>>>> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>  	end = ((index + 1) * pages_per_huge_page(h));
>>>>>  
>>>>>  	i_mmap_lock_write(mapping);
>>>>> +retry:
>>>>> +	skipped_mm = NULL;
>>>>>  
>>>>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>>>>>  		v_start = vma_offset_start(vma, start);
>>>>> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>>>>>  			continue;
>>>>>  
>>>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>>>> +			/*
>>>>> +			 * If we can not get vma lock, we need to drop
>>>>> +			 * immap_sema and take locks in order.
>>>>> +			 */
>>>>> +			skipped_vm_start = vma->vm_start;
>>>>> +			skipped_mm = vma->vm_mm;
>>>>> +			/* grab mm-struct as we will be dropping i_mmap_sema */
>>>>> +			mmgrab(skipped_mm);
>>>>> +			break;
>>>>> +		}
>>>>> +
>>>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>>>>  				NULL, ZAP_FLAG_DROP_MARKER);
>>>>> +		hugetlb_vma_unlock_write(vma);
>>>>>  	}
>>>>>  
>>>>>  	i_mmap_unlock_write(mapping);
>>>>> +
>>>>> +	if (skipped_mm) {
>>>>> +		mmap_read_lock(skipped_mm);
>>>>> +		vma = find_vma(skipped_mm, skipped_vm_start);
>>>>> +		if (!vma || !is_vm_hugetlb_page(vma) ||
>>>>> +					vma->vm_file->f_mapping != mapping ||
>>>>> +					vma->vm_start != skipped_vm_start) {
>>>>
>>>> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
>>>>
>>>
>>> Yes, that is missing.  I will add here.
>>>
>>>>> +			mmap_read_unlock(skipped_mm);
>>>>> +			mmdrop(skipped_mm);
>>>>> +			goto retry;
>>>>> +		}
>>>>> +
>>>>
>>>> IMHO, above check is not enough. Think about the below scene:
>>>>
>>>> CPU 1					CPU 2
>>>> hugetlb_unmap_file_folio		exit_mmap
>>>>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
>>>>   check vma is wanted.
>>>>   					  unmap_vmas
>>>>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
>>>>   					  mmap_write_lock(mm);
>>>>   					  free_pgtables
>>>>   					  remove_vma
>>>> 					    hugetlb_vma_lock_free
>>>>   vma, hugetlb_vma_lock is still *used after free*
>>>>   					  mmap_write_unlock(mm);
>>>> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?
>>>
>>> In the retry case, we are OK because go back and look up the vma again.  Right?
>>>
>>> After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
>>> Before that, we do the following:
>>>
>>>>> +		hugetlb_vma_lock_write(vma);
>>>>> +		i_mmap_lock_write(mapping);
>>>
>>> IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we
>>
>> I think you're right. free_pgtables() can't complete its work as unlink_file_vma() will be
>> blocked on i_mmap_rwsem of mapping. Sorry for reporting such nonexistent race.
>>
>>> can.
>>>
>>>>> +		mmap_read_unlock(skipped_mm);
>>>>> +		mmdrop(skipped_mm);
>>>
>>> We continue to hold i_mmap_lock_write as we goto retry.
>>>
>>> I could be missing something as well.  This was how I intended to keep
>>> vma valid while dropping and acquiring locks.
>>
>> Thanks for your clarifying.
>>
> 
> Well, that was all correct 'in theory' but not in practice.  I did not take
> into account the inode lock that is taken at the beginning of truncate (or
> hole punch).  In other code paths, we take inode lock after mmap_lock.  So,
> taking mmap_lock here is not allowed.

Considering the lock ordering documented in mm/filemap.c:

 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)

 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)

It seems inode_lock is taken before the mmap_lock?

Thanks,
Miaohe Lin

> 
> I came up with another way to make this work.  As discussed above, we need to
> drop the i_mmap lock before acquiring the vma_lock.  However, once we drop
> i_mmap, the vma could go away.  My solution is to make the 'vma_lock' be a
> ref counted structure that can live on after the vma is freed.  Therefore,
> this code can take a reference while under i_mmap then drop i_mmap and wait
> on the vma_lock.  Of course, once it acquires the vma_lock it needs to check
> and make sure the vma still exists.  It may sound complicated, but I think
> it is a bit simpler than the code here.  A new series will be out soon.
>
Mike Kravetz Sept. 14, 2022, 12:50 a.m. UTC | #6
On 09/13/22 10:14, Miaohe Lin wrote:
> On 2022/9/13 7:02, Mike Kravetz wrote:
> > On 09/05/22 11:08, Miaohe Lin wrote:
> >> On 2022/9/3 7:07, Mike Kravetz wrote:
> >>> On 08/30/22 10:02, Miaohe Lin wrote:
> >>>> On 2022/8/25 1:57, Mike Kravetz wrote:
> >>>>> The new hugetlb vma lock (rw semaphore) is used to address this race:
> >>>>>
> >>>>> Faulting thread                                 Unsharing thread
> >>>>> ...                                                  ...
> >>>>> ptep = huge_pte_offset()
> >>>>>       or
> >>>>> ptep = huge_pte_alloc()
> >>>>> ...
> >>>>>                                                 i_mmap_lock_write
> >>>>>                                                 lock page table
> >>>>> ptep invalid   <------------------------        huge_pmd_unshare()
> >>>>> Could be in a previously                        unlock_page_table
> >>>>> sharing process or worse                        i_mmap_unlock_write
> >>>>> ...
> >>>>>
> >>>>> The vma_lock is used as follows:
> >>>>> - During fault processing. the lock is acquired in read mode before
> >>>>>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
> >>>>>   held until code is finished with the page table entry (ptep).
> >>>>> - The lock must be held in write mode whenever huge_pmd_unshare is
> >>>>>   called.
> >>>>>
> >>>>> Lock ordering issues come into play when unmapping a page from all
> >>>>> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
> >>>>> vmas, and the vma lock must be held before calling unmap which will
> >>>>> call huge_pmd_unshare.  This is done today in:
> >>>>> - try_to_migrate_one and try_to_unmap_ for page migration and memory
> >>>>>   error handling.  In these routines we 'try' to obtain the vma lock and
> >>>>>   fail to unmap if unsuccessful.  Calling routines already deal with the
> >>>>>   failure of unmapping.
> >>>>> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
> >>>>>   also tries to acquire the vma lock.  If it fails, it skips the
> >>>>>   unmapping.  However, we can not have file truncation or hole punch
> >>>>>   fail because of contention.  After hugetlb_vmdelete_list, truncation
> >>>>>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
> >>>>>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
> >>>>>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
> >>>>>   correct order to guarantee unmap success.
> >>>>>
> >>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> >>>>> ---
> >>>>>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
> >>>>>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
> >>>>>  mm/memory.c          |   2 +
> >>>>>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
> >>>>>  mm/userfaultfd.c     |   9 +++-
> >>>>>  5 files changed, 214 insertions(+), 45 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> >>>>> index b93d131b0cb5..52d9b390389b 100644
> >>>>> --- a/fs/hugetlbfs/inode.c
> >>>>> +++ b/fs/hugetlbfs/inode.c
> >>>>> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>>>  					struct folio *folio, pgoff_t index)
> >>>>>  {
> >>>>>  	struct rb_root_cached *root = &mapping->i_mmap;
> >>>>> +	unsigned long skipped_vm_start;
> >>>>> +	struct mm_struct *skipped_mm;
> >>>>>  	struct page *page = &folio->page;
> >>>>>  	struct vm_area_struct *vma;
> >>>>>  	unsigned long v_start;
> >>>>> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>>>  	end = ((index + 1) * pages_per_huge_page(h));
> >>>>>  
> >>>>>  	i_mmap_lock_write(mapping);
> >>>>> +retry:
> >>>>> +	skipped_mm = NULL;
> >>>>>  
> >>>>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
> >>>>>  		v_start = vma_offset_start(vma, start);
> >>>>> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
> >>>>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
> >>>>>  			continue;
> >>>>>  
> >>>>> +		if (!hugetlb_vma_trylock_write(vma)) {
> >>>>> +			/*
> >>>>> +			 * If we can not get vma lock, we need to drop
> >>>>> +			 * immap_sema and take locks in order.
> >>>>> +			 */
> >>>>> +			skipped_vm_start = vma->vm_start;
> >>>>> +			skipped_mm = vma->vm_mm;
> >>>>> +			/* grab mm-struct as we will be dropping i_mmap_sema */
> >>>>> +			mmgrab(skipped_mm);
> >>>>> +			break;
> >>>>> +		}
> >>>>> +
> >>>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
> >>>>>  				NULL, ZAP_FLAG_DROP_MARKER);
> >>>>> +		hugetlb_vma_unlock_write(vma);
> >>>>>  	}
> >>>>>  
> >>>>>  	i_mmap_unlock_write(mapping);
> >>>>> +
> >>>>> +	if (skipped_mm) {
> >>>>> +		mmap_read_lock(skipped_mm);
> >>>>> +		vma = find_vma(skipped_mm, skipped_vm_start);
> >>>>> +		if (!vma || !is_vm_hugetlb_page(vma) ||
> >>>>> +					vma->vm_file->f_mapping != mapping ||
> >>>>> +					vma->vm_start != skipped_vm_start) {
> >>>>
> >>>> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
> >>>>
> >>>
> >>> Yes, that is missing.  I will add here.
> >>>
> >>>>> +			mmap_read_unlock(skipped_mm);
> >>>>> +			mmdrop(skipped_mm);
> >>>>> +			goto retry;
> >>>>> +		}
> >>>>> +
> >>>>
> >>>> IMHO, above check is not enough. Think about the below scene:
> >>>>
> >>>> CPU 1					CPU 2
> >>>> hugetlb_unmap_file_folio		exit_mmap
> >>>>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
> >>>>   check vma is wanted.
> >>>>   					  unmap_vmas
> >>>>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
> >>>>   					  mmap_write_lock(mm);
> >>>>   					  free_pgtables
> >>>>   					  remove_vma
> >>>> 					    hugetlb_vma_lock_free
> >>>>   vma, hugetlb_vma_lock is still *used after free*
> >>>>   					  mmap_write_unlock(mm);
> >>>> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?
> >>>
> >>> In the retry case, we are OK because go back and look up the vma again.  Right?
> >>>
> >>> After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
> >>> Before that, we do the following:
> >>>
> >>>>> +		hugetlb_vma_lock_write(vma);
> >>>>> +		i_mmap_lock_write(mapping);
> >>>
> >>> IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we
> >>
> >> I think you're right. free_pgtables() can't complete its work as unlink_file_vma() will be
> >> blocked on i_mmap_rwsem of mapping. Sorry for reporting such nonexistent race.
> >>
> >>> can.
> >>>
> >>>>> +		mmap_read_unlock(skipped_mm);
> >>>>> +		mmdrop(skipped_mm);
> >>>
> >>> We continue to hold i_mmap_lock_write as we goto retry.
> >>>
> >>> I could be missing something as well.  This was how I intended to keep
> >>> vma valid while dropping and acquiring locks.
> >>
> >> Thanks for your clarifying.
> >>
> > 
> > Well, that was all correct 'in theory' but not in practice.  I did not take
> > into account the inode lock that is taken at the beginning of truncate (or
> > hole punch).  In other code paths, we take inode lock after mmap_lock.  So,
> > taking mmap_lock here is not allowed.
> 
> Considering the Lock ordering in mm/filemap.c:
> 
>  *  ->i_rwsem
>  *    ->invalidate_lock		(acquired by fs in truncate path)
>  *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
> 
>  *  ->i_rwsem			(generic_perform_write)
>  *    ->mmap_lock		(fault_in_readable->do_page_fault)
> 
> It seems inode_lock is taken before the mmap_lock?

Hmmmm?  I can't find a sequence where inode_lock is taken after mmap_lock.
lockdep was complaining about taking mmap_lock after i_rwsem in the above code.
I assumed there was such a sequence somewhere.  Might need to go back and get
another trace/warning.

In any case, I think the scheme below is much cleaner.  Doing another round of
benchmarking before sending.

> > I came up with another way to make this work.  As discussed above, we need to
> > drop the i_mmap lock before acquiring the vma_lock.  However, once we drop
> > i_mmap, the vma could go away.  My solution is to make the 'vma_lock' be a
> > ref counted structure that can live on after the vma is freed.  Therefore,
> > this code can take a reference while under i_mmap then drop i_mmap and wait
> > on the vma_lock.  Of course, once it acquires the vma_lock it needs to check
> > and make sure the vma still exists.  It may sound complicated, but I think
> > it is a bit simpler than the code here.  A new series will be out soon.
> >
Miaohe Lin Sept. 14, 2022, 2:08 a.m. UTC | #7
On 2022/9/14 8:50, Mike Kravetz wrote:
> On 09/13/22 10:14, Miaohe Lin wrote:
>> On 2022/9/13 7:02, Mike Kravetz wrote:
>>> On 09/05/22 11:08, Miaohe Lin wrote:
>>>> On 2022/9/3 7:07, Mike Kravetz wrote:
>>>>> On 08/30/22 10:02, Miaohe Lin wrote:
>>>>>> On 2022/8/25 1:57, Mike Kravetz wrote:
>>>>>>> The new hugetlb vma lock (rw semaphore) is used to address this race:
>>>>>>>
>>>>>>> Faulting thread                                 Unsharing thread
>>>>>>> ...                                                  ...
>>>>>>> ptep = huge_pte_offset()
>>>>>>>       or
>>>>>>> ptep = huge_pte_alloc()
>>>>>>> ...
>>>>>>>                                                 i_mmap_lock_write
>>>>>>>                                                 lock page table
>>>>>>> ptep invalid   <------------------------        huge_pmd_unshare()
>>>>>>> Could be in a previously                        unlock_page_table
>>>>>>> sharing process or worse                        i_mmap_unlock_write
>>>>>>> ...
>>>>>>>
>>>>>>> The vma_lock is used as follows:
>>>>>>> - During fault processing. the lock is acquired in read mode before
>>>>>>>   doing a page table lock and allocation (huge_pte_alloc).  The lock is
>>>>>>>   held until code is finished with the page table entry (ptep).
>>>>>>> - The lock must be held in write mode whenever huge_pmd_unshare is
>>>>>>>   called.
>>>>>>>
>>>>>>> Lock ordering issues come into play when unmapping a page from all
>>>>>>> vmas mapping the page.  The i_mmap_rwsem must be held to search for the
>>>>>>> vmas, and the vma lock must be held before calling unmap which will
>>>>>>> call huge_pmd_unshare.  This is done today in:
>>>>>>> - try_to_migrate_one and try_to_unmap_ for page migration and memory
>>>>>>>   error handling.  In these routines we 'try' to obtain the vma lock and
>>>>>>>   fail to unmap if unsuccessful.  Calling routines already deal with the
>>>>>>>   failure of unmapping.
>>>>>>> - hugetlb_vmdelete_list for truncation and hole punch.  This routine
>>>>>>>   also tries to acquire the vma lock.  If it fails, it skips the
>>>>>>>   unmapping.  However, we can not have file truncation or hole punch
>>>>>>>   fail because of contention.  After hugetlb_vmdelete_list, truncation
>>>>>>>   and hole punch call remove_inode_hugepages.  remove_inode_hugepages
>>>>>>>   check for mapped pages and call hugetlb_unmap_file_page to unmap them.
>>>>>>>   hugetlb_unmap_file_page is designed to drop locks and reacquire in the
>>>>>>>   correct order to guarantee unmap success.
>>>>>>>
>>>>>>> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
>>>>>>> ---
>>>>>>>  fs/hugetlbfs/inode.c |  46 +++++++++++++++++++
>>>>>>>  mm/hugetlb.c         | 102 +++++++++++++++++++++++++++++++++++++++----
>>>>>>>  mm/memory.c          |   2 +
>>>>>>>  mm/rmap.c            | 100 +++++++++++++++++++++++++++---------------
>>>>>>>  mm/userfaultfd.c     |   9 +++-
>>>>>>>  5 files changed, 214 insertions(+), 45 deletions(-)
>>>>>>>
>>>>>>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>>>>>>> index b93d131b0cb5..52d9b390389b 100644
>>>>>>> --- a/fs/hugetlbfs/inode.c
>>>>>>> +++ b/fs/hugetlbfs/inode.c
>>>>>>> @@ -434,6 +434,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>>>  					struct folio *folio, pgoff_t index)
>>>>>>>  {
>>>>>>>  	struct rb_root_cached *root = &mapping->i_mmap;
>>>>>>> +	unsigned long skipped_vm_start;
>>>>>>> +	struct mm_struct *skipped_mm;
>>>>>>>  	struct page *page = &folio->page;
>>>>>>>  	struct vm_area_struct *vma;
>>>>>>>  	unsigned long v_start;
>>>>>>> @@ -444,6 +446,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>>>  	end = ((index + 1) * pages_per_huge_page(h));
>>>>>>>  
>>>>>>>  	i_mmap_lock_write(mapping);
>>>>>>> +retry:
>>>>>>> +	skipped_mm = NULL;
>>>>>>>  
>>>>>>>  	vma_interval_tree_foreach(vma, root, start, end - 1) {
>>>>>>>  		v_start = vma_offset_start(vma, start);
>>>>>>> @@ -452,11 +456,49 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
>>>>>>>  		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
>>>>>>>  			continue;
>>>>>>>  
>>>>>>> +		if (!hugetlb_vma_trylock_write(vma)) {
>>>>>>> +			/*
>>>>>>> +			 * If we can not get vma lock, we need to drop
>>>>>>> +			 * immap_sema and take locks in order.
>>>>>>> +			 */
>>>>>>> +			skipped_vm_start = vma->vm_start;
>>>>>>> +			skipped_mm = vma->vm_mm;
>>>>>>> +			/* grab mm-struct as we will be dropping i_mmap_sema */
>>>>>>> +			mmgrab(skipped_mm);
>>>>>>> +			break;
>>>>>>> +		}
>>>>>>> +
>>>>>>>  		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
>>>>>>>  				NULL, ZAP_FLAG_DROP_MARKER);
>>>>>>> +		hugetlb_vma_unlock_write(vma);
>>>>>>>  	}
>>>>>>>  
>>>>>>>  	i_mmap_unlock_write(mapping);
>>>>>>> +
>>>>>>> +	if (skipped_mm) {
>>>>>>> +		mmap_read_lock(skipped_mm);
>>>>>>> +		vma = find_vma(skipped_mm, skipped_vm_start);
>>>>>>> +		if (!vma || !is_vm_hugetlb_page(vma) ||
>>>>>>> +					vma->vm_file->f_mapping != mapping ||
>>>>>>> +					vma->vm_start != skipped_vm_start) {
>>>>>>
>>>>>> i_mmap_lock_write(mapping) is missing here? Retry logic will do i_mmap_unlock_write(mapping) anyway.
>>>>>>
>>>>>
>>>>> Yes, that is missing.  I will add here.
>>>>>
>>>>>>> +			mmap_read_unlock(skipped_mm);
>>>>>>> +			mmdrop(skipped_mm);
>>>>>>> +			goto retry;
>>>>>>> +		}
>>>>>>> +
>>>>>>
>>>>>> IMHO, above check is not enough. Think about the below scene:
>>>>>>
>>>>>> CPU 1					CPU 2
>>>>>> hugetlb_unmap_file_folio		exit_mmap
>>>>>>   mmap_read_lock(skipped_mm);		  mmap_read_lock(mm);
>>>>>>   check vma is wanted.
>>>>>>   					  unmap_vmas
>>>>>>   mmap_read_unlock(skipped_mm);		  mmap_read_unlock
>>>>>>   					  mmap_write_lock(mm);
>>>>>>   					  free_pgtables
>>>>>>   					  remove_vma
>>>>>> 					    hugetlb_vma_lock_free
>>>>>>   vma, hugetlb_vma_lock is still *used after free*
>>>>>>   					  mmap_write_unlock(mm);
>>>>>> So we should check mm->mm_users == 0 to fix the above issue. Or am I miss something?
>>>>>
>>>>> In the retry case, we are OK because go back and look up the vma again.  Right?
>>>>>
>>>>> After taking mmap_read_lock, vma can not go away until we mmap_read_unlock.
>>>>> Before that, we do the following:
>>>>>
>>>>>>> +		hugetlb_vma_lock_write(vma);
>>>>>>> +		i_mmap_lock_write(mapping);
>>>>>
>>>>> IIUC, vma can not go away while we hold i_mmap_lock_write.  So, after this we
>>>>
>>>> I think you're right. free_pgtables() can't complete its work as unlink_file_vma() will be
>>>> blocked on i_mmap_rwsem of mapping. Sorry for reporting such nonexistent race.
>>>>
>>>>> can.
>>>>>
>>>>>>> +		mmap_read_unlock(skipped_mm);
>>>>>>> +		mmdrop(skipped_mm);
>>>>>
>>>>> We continue to hold i_mmap_lock_write as we goto retry.
>>>>>
>>>>> I could be missing something as well.  This was how I intended to keep
>>>>> vma valid while dropping and acquiring locks.
>>>>
>>>> Thanks for your clarifying.
>>>>
>>>
>>> Well, that was all correct 'in theory' but not in practice.  I did not take
>>> into account the inode lock that is taken at the beginning of truncate (or
>>> hole punch).  In other code paths, we take inode lock after mmap_lock.  So,
>>> taking mmap_lock here is not allowed.
>>
>> Considering the Lock ordering in mm/filemap.c:
>>
>>  *  ->i_rwsem
>>  *    ->invalidate_lock		(acquired by fs in truncate path)
>>  *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
>>
>>  *  ->i_rwsem			(generic_perform_write)
>>  *    ->mmap_lock		(fault_in_readable->do_page_fault)
>>
>> It seems inode_lock is taken before the mmap_lock?
> 
> Hmmmm?  I can't find a sequence where inode_lock is taken after mmap_lock.
> lockdep was complaining about taking mmap_lock after i_rwsem in the above code.
> I assumed there was such a sequence somewhere.  Might need to go back and get
> another trace/warning.

Sorry, I'm somewhat confused. Take generic_file_write_iter() as an example:

generic_file_write_iter
  inode_lock(inode); -- *inode lock is held here*
  __generic_file_write_iter
    generic_perform_write
      fault_in_iov_iter_readable -- *may cause page fault and thus take mmap_lock*
  inode_unlock(inode);

This is the documented example in mm/filemap.c. So we should take inode_lock before
taking mmap_lock. Or is this out-dated? Does the above example need a fix?

> 
> In any case, I think the scheme below is much cleaner.  Doing another round of
> benchmarking before sending.

That should be a good alternative. Thanks for your work. :)

Thanks,
Miaohe Lin

> 
>>> I came up with another way to make this work.  As discussed above, we need to
>>> drop the i_mmap lock before acquiring the vma_lock.  However, once we drop
>>> i_mmap, the vma could go away.  My solution is to make the 'vma_lock' a
>>> ref-counted structure that can live on after the vma is freed.  Therefore,
>>> this code can take a reference while under i_mmap, then drop i_mmap and wait
>>> on the vma_lock.  Of course, once it acquires the vma_lock it needs to check
>>> and make sure the vma still exists.  It may sound complicated, but I think
>>> it is a bit simpler than the code here.  A new series will be out soon.
>>>
>
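
For reference, a minimal sketch of the ref-counted vma_lock idea described just above, assuming a kref-based structure (all names and the layout here are illustrative, not necessarily what the upcoming series uses):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/mm_types.h>
#include <linux/rwsem.h>
#include <linux/slab.h>

/* Illustrative only: a lock structure that can outlive its vma. */
struct hugetlb_vma_lock {
	struct kref refs;
	struct rw_semaphore rw_sema;
	struct vm_area_struct *vma;	/* valid only while the vma exists */
};

static void hugetlb_vma_lock_release(struct kref *kref)
{
	struct hugetlb_vma_lock *vma_lock =
		container_of(kref, struct hugetlb_vma_lock, refs);

	kfree(vma_lock);
}

static void hugetlb_vma_lock_get(struct hugetlb_vma_lock *vma_lock)
{
	kref_get(&vma_lock->refs);
}

static void hugetlb_vma_lock_put(struct hugetlb_vma_lock *vma_lock)
{
	kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}

With something like this, hugetlb_unmap_file_folio() could take a reference while still under i_mmap_rwsem, drop i_mmap_rwsem, sleep on rw_sema, revalidate that the vma still exists and still maps the page, and finally drop its reference.  Freeing the vma would only put the vma's reference, so the semaphore memory stays valid for any waiter.
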
Patch

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b93d131b0cb5..52d9b390389b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -434,6 +434,8 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 					struct folio *folio, pgoff_t index)
 {
 	struct rb_root_cached *root = &mapping->i_mmap;
+	unsigned long skipped_vm_start;
+	struct mm_struct *skipped_mm;
 	struct page *page = &folio->page;
 	struct vm_area_struct *vma;
 	unsigned long v_start;
@@ -444,6 +446,8 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 	end = ((index + 1) * pages_per_huge_page(h));
 
 	i_mmap_lock_write(mapping);
+retry:
+	skipped_mm = NULL;
 
 	vma_interval_tree_foreach(vma, root, start, end - 1) {
 		v_start = vma_offset_start(vma, start);
@@ -452,11 +456,49 @@  static void hugetlb_unmap_file_folio(struct hstate *h,
 		if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
 			continue;
 
+		if (!hugetlb_vma_trylock_write(vma)) {
+			/*
+			 * If we can not get vma lock, we need to drop
+			 * i_mmap_sema and take locks in order.
+			 */
+			skipped_vm_start = vma->vm_start;
+			skipped_mm = vma->vm_mm;
+			/* grab mm-struct as we will be dropping i_mmap_sema */
+			mmgrab(skipped_mm);
+			break;
+		}
+
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				NULL, ZAP_FLAG_DROP_MARKER);
+		hugetlb_vma_unlock_write(vma);
 	}
 
 	i_mmap_unlock_write(mapping);
+
+	if (skipped_mm) {
+		mmap_read_lock(skipped_mm);
+		vma = find_vma(skipped_mm, skipped_vm_start);
+		if (!vma || !is_vm_hugetlb_page(vma) ||
+					vma->vm_file->f_mapping != mapping ||
+					vma->vm_start != skipped_vm_start) {
+			mmap_read_unlock(skipped_mm);
+			mmdrop(skipped_mm);
+			goto retry;
+		}
+
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(mapping);
+		mmap_read_unlock(skipped_mm);
+		mmdrop(skipped_mm);
+
+		v_start = vma_offset_start(vma, start);
+		v_end = vma_offset_end(vma, end);
+		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+				NULL, ZAP_FLAG_DROP_MARKER);
+		hugetlb_vma_unlock_write(vma);
+
+		goto retry;
+	}
 }
 
 static void
@@ -474,11 +516,15 @@  hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
 		unsigned long v_start;
 		unsigned long v_end;
 
+		if (!hugetlb_vma_trylock_write(vma))
+			continue;
+
 		v_start = vma_offset_start(vma, start);
 		v_end = vma_offset_end(vma, end);
 
 		unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
 				     NULL, zap_flags);
+		hugetlb_vma_unlock_write(vma);
 	}
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6fb0bff2c7ee..5912c2b97ddf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4801,6 +4801,14 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		mmu_notifier_invalidate_range_start(&range);
 		mmap_assert_write_locked(src);
 		raw_write_seqcount_begin(&src->write_protect_seq);
+	} else {
+		/*
+		 * For shared mappings the vma lock must be held before
+		 * calling huge_pte_offset in the src vma. Otherwise, the
+		 * returned ptep could go away if part of a shared pmd and
+		 * another thread calls huge_pmd_unshare.
+		 */
+		hugetlb_vma_lock_read(src_vma);
 	}
 
 	last_addr_mask = hugetlb_mask_last_page(h);
@@ -4948,6 +4956,8 @@  int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	if (cow) {
 		raw_write_seqcount_end(&src->write_protect_seq);
 		mmu_notifier_invalidate_range_end(&range);
+	} else {
+		hugetlb_vma_unlock_read(src_vma);
 	}
 
 	return ret;
@@ -5006,6 +5016,7 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 	mmu_notifier_invalidate_range_start(&range);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	/* Prevent race with file truncation */
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(mapping);
 	for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
 		src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -5037,6 +5048,7 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 		flush_tlb_range(vma, old_end - len, old_end);
 	mmu_notifier_invalidate_range_end(&range);
 	i_mmap_unlock_write(mapping);
+	hugetlb_vma_unlock_write(vma);
 
 	return len + old_addr - old_end;
 }
@@ -5356,9 +5368,30 @@  static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * may get SIGKILLed if it later faults.
 		 */
 		if (outside_reserve) {
+			struct address_space *mapping = vma->vm_file->f_mapping;
+			pgoff_t idx;
+			u32 hash;
+
 			put_page(old_page);
 			BUG_ON(huge_pte_none(pte));
+			/*
+			 * Drop hugetlb_fault_mutex and vma_lock before
+			 * unmapping.  unmapping needs to hold vma_lock
+			 * in write mode.  Dropping vma_lock in read mode
+			 * here is OK as COW mappings do not interact with
+			 * PMD sharing.
+			 *
+			 * Reacquire both after unmap operation.
+			 */
+			idx = vma_hugecache_offset(h, vma, haddr);
+			hash = hugetlb_fault_mutex_hash(mapping, idx);
+			hugetlb_vma_unlock_read(vma);
+			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
 			unmap_ref_private(mm, vma, old_page, haddr);
+
+			mutex_lock(&hugetlb_fault_mutex_table[hash]);
+			hugetlb_vma_lock_read(vma);
 			spin_lock(ptl);
 			ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 			if (likely(ptep &&
@@ -5520,14 +5553,16 @@  static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
 	};
 
 	/*
-	 * hugetlb_fault_mutex and i_mmap_rwsem must be
+	 * vma_lock and hugetlb_fault_mutex must be
 	 * dropped before handling userfault.  Reacquire
 	 * after handling fault to make calling code simpler.
 	 */
+	hugetlb_vma_unlock_read(vma);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	ret = handle_userfault(&vmf, reason);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
+	hugetlb_vma_lock_read(vma);
 
 	return ret;
 }
@@ -5767,6 +5802,11 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
 	if (ptep) {
+		/*
+		 * Since we hold no locks, ptep could be stale.  That is
+		 * OK as we are only making decisions based on content and
+		 * not actually modifying content here.
+		 */
 		entry = huge_ptep_get(ptep);
 		if (unlikely(is_hugetlb_entry_migration(entry))) {
 			migration_entry_wait_huge(vma, ptep);
@@ -5774,23 +5814,35 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
 			return VM_FAULT_HWPOISON_LARGE |
 				VM_FAULT_SET_HINDEX(hstate_index(h));
-	} else {
-		ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
-		if (!ptep)
-			return VM_FAULT_OOM;
 	}
 
-	mapping = vma->vm_file->f_mapping;
-	idx = vma_hugecache_offset(h, vma, haddr);
-
 	/*
 	 * Serialize hugepage allocation and instantiation, so that we don't
 	 * get spurious allocation failures if two CPUs race to instantiate
 	 * the same page in the page cache.
 	 */
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, haddr);
 	hash = hugetlb_fault_mutex_hash(mapping, idx);
 	mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
+	/*
+	 * Acquire vma lock before calling huge_pte_alloc and hold
+	 * until finished with ptep.  This prevents huge_pmd_unshare from
+	 * being called elsewhere and making the ptep no longer valid.
+	 *
+	 * ptep could have already been assigned via huge_pte_offset.  That
+	 * is OK, as huge_pte_alloc will return the same value unless
+	 * something has changed.
+	 */
+	hugetlb_vma_lock_read(vma);
+	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
+	if (!ptep) {
+		hugetlb_vma_unlock_read(vma);
+		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		return VM_FAULT_OOM;
+	}
+
 	entry = huge_ptep_get(ptep);
 	/* PTE markers should be handled the same way as none pte */
 	if (huge_pte_none_mostly(entry)) {
@@ -5851,6 +5903,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unlock_page(pagecache_page);
 			put_page(pagecache_page);
 		}
+		hugetlb_vma_unlock_read(vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 		return handle_userfault(&vmf, VM_UFFD_WP);
 	}
@@ -5894,6 +5947,7 @@  vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 out_mutex:
+	hugetlb_vma_unlock_read(vma);
 	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 	/*
 	 * Generally it's safe to hold refcount during waiting page lock. But
@@ -6343,8 +6397,9 @@  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	flush_cache_range(vma, range.start, range.end);
 
 	mmu_notifier_invalidate_range_start(&range);
-	last_addr_mask = hugetlb_mask_last_page(h);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
+	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
 		spinlock_t *ptl;
 		ptep = huge_pte_offset(mm, address, psize);
@@ -6443,6 +6498,7 @@  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	mmu_notifier_invalidate_range_end(&range);
 
 	return pages << h->order;
@@ -6909,6 +6965,7 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud = pud_offset(p4d, addr);
 
 	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	hugetlb_vma_assert_locked(vma);
 	BUG_ON(page_count(virt_to_page(ptep)) == 0);
 	if (page_count(virt_to_page(ptep)) == 1)
 		return 0;
@@ -6920,6 +6977,31 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+	return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 {
 }
@@ -7298,6 +7380,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
+	hugetlb_vma_lock_write(vma);
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = huge_pte_offset(mm, address, sz);
@@ -7309,6 +7392,7 @@  void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
+	hugetlb_vma_unlock_write(vma);
 	/*
 	 * No need to call mmu_notifier_invalidate_range(), see
 	 * Documentation/mm/mmu_notifier.rst.
diff --git a/mm/memory.c b/mm/memory.c
index 2f3cc57a5a11..55166045ab55 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1675,10 +1675,12 @@  static void unmap_single_vma(struct mmu_gather *tlb,
 			if (vma->vm_file) {
 				zap_flags_t zap_flags = details ?
 				    details->zap_flags : 0;
+				hugetlb_vma_lock_write(vma);
 				i_mmap_lock_write(vma->vm_file->f_mapping);
 				__unmap_hugepage_range_final(tlb, vma, start, end,
 							     NULL, zap_flags);
 				i_mmap_unlock_write(vma->vm_file->f_mapping);
+				hugetlb_vma_unlock_write(vma);
 			}
 		} else
 			unmap_page_range(tlb, vma, start, end, details);
diff --git a/mm/rmap.c b/mm/rmap.c
index 55209e029847..60d7db60428e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1558,24 +1558,39 @@  static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and fail
+			 * if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
@@ -1934,26 +1949,41 @@  static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
 			 * held in write mode.  Caller needs to explicitly
 			 * do this outside rmap routines.
+			 *
+			 * We also must hold hugetlb vma_lock in write mode.
+			 * Lock order dictates acquiring vma_lock BEFORE
+			 * i_mmap_rwsem.  We can only try lock here and
+			 * fail if unsuccessful.
 			 */
-			VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
-			if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
-				flush_tlb_range(vma, range.start, range.end);
-				mmu_notifier_invalidate_range(mm, range.start,
-							      range.end);
-
-				/*
-				 * The ref count of the PMD page was dropped
-				 * which is part of the way map counting
-				 * is done for shared PMDs.  Return 'true'
-				 * here.  When there is no other sharing,
-				 * huge_pmd_unshare returns false and we will
-				 * unmap the actual page and drop map count
-				 * to zero.
-				 */
-				page_vma_mapped_walk_done(&pvmw);
-				break;
+			if (!anon) {
+				VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+				if (!hugetlb_vma_trylock_write(vma)) {
+					page_vma_mapped_walk_done(&pvmw);
+					ret = false;
+					break;
+				}
+				if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+					hugetlb_vma_unlock_write(vma);
+					flush_tlb_range(vma,
+						range.start, range.end);
+					mmu_notifier_invalidate_range(mm,
+						range.start, range.end);
+
+					/*
+					 * The ref count of the PMD page was
+					 * dropped which is part of the way map
+					 * counting is done for shared PMDs.
+					 * Return 'true' here.  When there is
+					 * no other sharing, huge_pmd_unshare
+					 * returns false and we will unmap the
+					 * actual page and drop map count
+					 * to zero.
+					 */
+					page_vma_mapped_walk_done(&pvmw);
+					break;
+				}
+				hugetlb_vma_unlock_write(vma);
 			}
-
 			/* Nuke the hugetlb page table entry */
 			pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
 		} else {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7707f2664adb..2b0502710ea1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -377,16 +377,21 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		BUG_ON(dst_addr >= dst_start + len);
 
 		/*
-		 * Serialize via hugetlb_fault_mutex.
+		 * Serialize via vma_lock and hugetlb_fault_mutex.
+		 * vma_lock ensures the dst_pte remains valid even
+		 * in the case of shared pmds.  fault mutex prevents
+		 * races with other faulting threads.
 		 */
 		idx = linear_page_index(dst_vma, dst_addr);
 		mapping = dst_vma->vm_file->f_mapping;
 		hash = hugetlb_fault_mutex_hash(mapping, idx);
 		mutex_lock(&hugetlb_fault_mutex_table[hash]);
+		hugetlb_vma_lock_read(dst_vma);
 
 		err = -ENOMEM;
 		dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
 		if (!dst_pte) {
+			hugetlb_vma_unlock_read(dst_vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out_unlock;
 		}
@@ -394,6 +399,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		if (mode != MCOPY_ATOMIC_CONTINUE &&
 		    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
 			err = -EEXIST;
+			hugetlb_vma_unlock_read(dst_vma);
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			goto out_unlock;
 		}
@@ -402,6 +408,7 @@  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					       dst_addr, src_addr, mode, &page,
 					       wp_copy);
 
+		hugetlb_vma_unlock_read(dst_vma);
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 
 		cond_resched();
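
Taken together, the hunks above follow one locking discipline: readers of a possibly-shared ptep hold the vma lock in read mode, and anything that may call huge_pmd_unshare() holds it in write mode, acquired before i_mmap_rwsem.  A condensed sketch follows; the two wrapper functions are hypothetical and exist only to show the ordering, with error handling omitted:

#include <linux/hugetlb.h>
#include <linux/mm.h>

/* Read side: fault and UFFD copy paths hold the vma lock around ptep use.
 * 'hash' is the value returned by hugetlb_fault_mutex_hash(). */
static vm_fault_t fault_side_sketch(struct mm_struct *mm,
				    struct vm_area_struct *vma,
				    unsigned long haddr, u32 hash)
{
	struct hstate *h = hstate_vma(vma);
	pte_t *ptep;

	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	hugetlb_vma_lock_read(vma);		/* before huge_pte_alloc() */
	ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
	if (!ptep) {
		hugetlb_vma_unlock_read(vma);
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		return VM_FAULT_OOM;
	}
	/* ... the fault path uses ptep only while the vma lock is held ... */
	hugetlb_vma_unlock_read(vma);
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	return 0;
}

/*
 * Write side: callers of huge_pmd_unshare() take the vma lock first,
 * then i_mmap_rwsem, then the page table lock, matching the order used
 * in the rmap and unmap hunks above.
 */
static void unshare_side_sketch(struct mm_struct *mm,
				struct vm_area_struct *vma,
				unsigned long addr, pte_t *ptep)
{
	spinlock_t *ptl;

	hugetlb_vma_lock_write(vma);
	i_mmap_lock_write(vma->vm_file->f_mapping);
	ptl = huge_pte_lock(hstate_vma(vma), mm, ptep);
	huge_pmd_unshare(mm, vma, addr, ptep);
	spin_unlock(ptl);
	i_mmap_unlock_write(vma->vm_file->f_mapping);
	hugetlb_vma_unlock_write(vma);
}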