Message ID | 20231006040020.3677377-3-riel@surriel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | hugetlbfs: close race between MADV_DONTNEED and page fault | expand |
On Thu, Oct 05, 2023 at 11:59:07PM -0400, riel@surriel.com wrote: > From: Rik van Riel <riel@surriel.com> > > Extend the locking scheme used to protect shared hugetlb mappings > from truncate vs page fault races, in order to protect private > hugetlb mappings (with resv_map) against MADV_DONTNEED. > > Add a read-write semaphore to the resv_map data structure, and > use that from the hugetlb_vma_(un)lock_* functions, in preparation > for closing the race between MADV_DONTNEED and page faults. > > Signed-off-by: Rik van Riel <riel@surriel.com> > Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> > Cc: stable@kernel.org > Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing") > --- > include/linux/hugetlb.h | 6 ++++++ > mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++---- > 2 files changed, 43 insertions(+), 4 deletions(-) > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 5b2626063f4f..694928fa06a3 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -60,6 +60,7 @@ struct resv_map { > long adds_in_progress; > struct list_head region_cache; > long region_cache_count; > + struct rw_semaphore rw_sema; > #ifdef CONFIG_CGROUP_HUGETLB > /* > * On private mappings, the counter to uncharge reservations is stored > @@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) > return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; > } > > +static inline bool __vma_private_lock(struct vm_area_struct *vma) > +{ > + return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; > +} > + > /* > * Safe version of huge_pte_offset() to check the locks. See comments > * above huge_pte_offset(). 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index a86e070d735b..dd3de6ec8f1a 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); > static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); > static void hugetlb_unshare_pmds(struct vm_area_struct *vma, > unsigned long start, unsigned long end); > +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); > > static inline bool subpool_is_free(struct hugepage_subpool *spool) > { > @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > down_read(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + down_read(&resv_map->rw_sema); > } > } +Ackerley +Oscar I'm reading the resv code recently and just stumbled upon this. So want to raise this question. IIUC __vma_private_lock() will return false for MAP_PRIVATE hugetlb vma if the vma is dup()ed from a fork(), with/without commit 187da0f8250a ("hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write") which fixed a slightly different issue. The problem is the current vma lock for private mmap() is based on the resv map, and the resv map only belongs to the process that mmap()ed this private vma. E.g. dup_mmap() has: if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); Which does: if (vma->vm_flags & VM_MAYSHARE) { ... } else vma->vm_private_data = NULL; <--------------------- So even if I don't know how many of us are even using hugetlb PRIVATE + fork(), assuming that's the most controversial use case that I'm aware of on hugetlb that people complains about.. with some tricky changes like 04f2cbe35699.. Just still want to raise this pure question, that after a fork() on private vma, and if I read it alright, lock/unlock operations may become noop.. 
Thanks, > > @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > up_read(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + up_read(&resv_map->rw_sema); > } > } > > @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > down_write(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + down_write(&resv_map->rw_sema); > } > } > > @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > up_write(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + up_write(&resv_map->rw_sema); > } > } > > int hugetlb_vma_trylock_write(struct vm_area_struct *vma) > { > - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > - if (!__vma_shareable_lock(vma)) > - return 1; > + if (__vma_shareable_lock(vma)) { > + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > - return down_write_trylock(&vma_lock->rw_sema); > + return down_write_trylock(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + return down_write_trylock(&resv_map->rw_sema); > + } > + > + return 1; > } > > void hugetlb_vma_assert_locked(struct vm_area_struct *vma) > @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > lockdep_assert_held(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + lockdep_assert_held(&resv_map->rw_sema); > } > } > > @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct 
vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > __hugetlb_vma_unlock_write_put(vma_lock); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + /* no free for anon vmas, but still need to unlock */ > + up_write(&resv_map->rw_sema); > } > } > > @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void) > kref_init(&resv_map->refs); > spin_lock_init(&resv_map->lock); > INIT_LIST_HEAD(&resv_map->regions); > + init_rwsem(&resv_map->rw_sema); > > resv_map->adds_in_progress = 0; > /* > -- > 2.41.0 > >
On Thu, Nov 07, 2024 at 03:18:51PM -0500, Peter Xu wrote: > +Ackerley +Oscar > > I'm reading the resv code recently and just stumbled upon this. So want to > raise this question. > > IIUC __vma_private_lock() will return false for MAP_PRIVATE hugetlb vma if > the vma is dup()ed from a fork(), with/without commit 187da0f8250a > ("hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write") which fixed a > slightly different issue. > > The problem is the current vma lock for private mmap() is based on the resv > map, and the resv map only belongs to the process that mmap()ed this > private vma. E.g. dup_mmap() has: > > if (is_vm_hugetlb_page(tmp)) > hugetlb_dup_vma_private(tmp); > > Which does: > > if (vma->vm_flags & VM_MAYSHARE) { > ... > } else > vma->vm_private_data = NULL; <--------------------- > > So even if I don't know how many of us are even using hugetlb PRIVATE + > fork(), assuming that's the most controversial use case that I'm aware of > on hugetlb that people complains about.. with some tricky changes like > 04f2cbe35699.. Just still want to raise this pure question, that after a > fork() on private vma, and if I read it alright, lock/unlock operations may > become noop.. I have been taking a look at this, and yes, __vma_private_lock will return false for private hugetlb mappings that were forked . I quickly checked what protects what and we currently have: hugetlb_vma_lock_read - copy_hugetlb_page_range (only sharing) hugetlb_vma_lock_read - hugetlb_wp (only for HPAGE_RESV_OWNER) hugetlb_vma_lock_read - hugetlb_fault , protects huge_pmd_unshare? 
hugetlb_vma_lock_read - pagewalks hugetlb_vma_lock_write - hugetlb_change_protection hugetlb_vma_lock_write - hugetlb_unshare_pmds hugetlb_vma_lock_write - move_hugetlb_page_tables hugetlb_vma_lock_write - _hugetlb_zap_begin (unmap_vmas) the ones taking the hugetlb_vma_lock in write (so, the last four) also take the i_mmap_lock_write (vma->vm_file->f_mapping), and AFAIK, hugetlb mappings, private or not, should have vma->vm_file->f_mapping set. Which means that technically we cannot race between hugetlb_change_protection and move_hugetlb_page_tables etc. But, checking commit bf4916922c60f43efaa329744b3eef539aa6a2b2 Author: Rik van Riel <riel@surriel.com> Date: Thu Oct 5 23:59:07 2023 -0400 hugetlbfs: extend hugetlb_vma_lock to private VMAs whose motivation was to protect MADV_DONTNEED vs page_faults, I do not see how it gets protected with private hugetlb mappings that were dupped (forked). madvise_dontneed_single_vma zap_page_range_single _hugetlb_zap_begin hugetlb_vma_lock_write - noop for mappings that do not own the reservation i_mmap_lock_write But the hugetlb_fault path only takes hugetlb_vma_lock_*, so theoretically we still could race between page_fault vs madvise_dontneed_single_vma? A quick way to prove it would be to map a hugetlb private mapping, fork it, and have two threads, one trying to madvise(MADV_DONTNEED) and the other trying to write to it? I do not know, maybe we are protected in some other way I cannot see right now. I will have a look.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b2626063f4f..694928fa06a3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -60,6 +60,7 @@ struct resv_map { long adds_in_progress; struct list_head region_cache; long region_cache_count; + struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored @@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } +static inline bool __vma_private_lock(struct vm_area_struct *vma) +{ + return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; +} + /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a86e070d735b..dd3de6ec8f1a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_read(&resv_map->rw_sema); } } @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_read(&resv_map->rw_sema); } } @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma) struct 
hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_write(&resv_map->rw_sema); } } @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - if (!__vma_shareable_lock(vma)) - return 1; + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - return down_write_trylock(&vma_lock->rw_sema); + return down_write_trylock(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + return down_write_trylock(&resv_map->rw_sema); + } + + return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + lockdep_assert_held(&resv_map->rw_sema); } } @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + /* no free for anon vmas, but still need to unlock */ + up_write(&resv_map->rw_sema); } } @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void) kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 
0; /*