[v3,1/3] mm: enable MADV_DONTNEED for hugetlb mappings

Message ID 20220215002348.128823-2-mike.kravetz@oracle.com (mailing list archive)
State New
Series Add hugetlb MADV_DONTNEED support

Commit Message

Mike Kravetz Feb. 15, 2022, 12:23 a.m. UTC
MADV_DONTNEED is currently disabled for hugetlb mappings.  This
certainly makes sense in shared file mappings as the pagecache maintains
a reference to the page and it will never be freed.  However, it could
be useful to unmap and free pages in private mappings.  In addition,
userfaultfd minor fault users may be able to simplify code by using
MADV_DONTNEED.

The primary thing preventing MADV_DONTNEED from working on hugetlb
mappings is a check in can_madv_lru_vma().  To allow support for hugetlb
mappings create and use a new routine madvise_dontneed_free_valid_vma()
that allows hugetlb mappings in this specific case.

For normal mappings, madvise requires the start address be PAGE aligned
and rounds up length to the next multiple of PAGE_SIZE.  Do similarly
for hugetlb mappings: require start address be huge page size aligned and
round up length to the next multiple of huge page size.  Use the new
madvise_dontneed_free_valid_vma routine to check alignment and round up
length/end.  zap_page_range requires this alignment for hugetlb vmas
otherwise we will hit BUGs.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 mm/madvise.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)
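
To illustrate what this enables from userspace, below is a minimal sketch.
It assumes a kernel with this series applied, a 2 MB default huge page size
and at least two free huge pages in the pool; the huge page size, pool size
and minimal error handling are assumptions for the example, not part of the
patch.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE      (2UL * 1024 * 1024)     /* assumed default huge page size */

int main(void)
{
        /* Private anonymous hugetlb mapping covering two huge pages. */
        char *p = mmap(NULL, 2 * HPAGE_SIZE, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Fault in both huge pages. */
        memset(p, 0x5a, 2 * HPAGE_SIZE);

        /*
         * The start must be huge page size aligned; a length that is not a
         * multiple of the huge page size is rounded up, mirroring the
         * PAGE_SIZE rounding done for normal mappings.  Here the range is
         * rounded up to cover both huge pages, which are unmapped and freed.
         */
        if (madvise(p, HPAGE_SIZE + 1, MADV_DONTNEED))
                perror("madvise(MADV_DONTNEED)");

        /* The next touch refaults a fresh, zero-filled huge page. */
        p[0] = 1;

        munmap(p, 2 * HPAGE_SIZE);
        return 0;
}

A start address that is not huge page size aligned is simply rejected with
EINVAL by the new madvise_dontneed_free_valid_vma() check.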

Comments

Yin Fengwei Feb. 17, 2022, 8:32 a.m. UTC | #1
On 2022/2/15 08:23, Mike Kravetz wrote:
> MADV_DONTNEED is currently disabled for hugetlb mappings.  This
> certainly makes sense in shared file mappings as the pagecache maintains
> a reference to the page and it will never be freed.  However, it could
> be useful to unmap and free pages in private mappings.  In addition,
> userfaultfd minor fault users may be able to simplify code by using
> MADV_DONTNEED.
> 
> The primary thing preventing MADV_DONTNEED from working on hugetlb
> mappings is a check in can_madv_lru_vma().  To allow support for hugetlb
> mappings create and use a new routine madvise_dontneed_free_valid_vma()
> that allows hugetlb mappings in this specific case.
> 
> For normal mappings, madvise requires the start address be PAGE aligned
> and rounds up length to the next multiple of PAGE_SIZE.  Do similarly

In the mmap() man page, the NOTE for "Huge page (Huge TLB) mappings"
says:

"For munmap(), addr, and length must both be a multiple of the
underlying huge page size."

Should we apply the same rule to MADV_DONTNEED? Thanks.


Regards
Yin, Fengwei
David Hildenbrand Feb. 17, 2022, 8:58 a.m. UTC | #2
On 17.02.22 09:32, Yin Fengwei wrote:
> In the mmap() man page, the NOTE for "Huge page (Huge TLB) mappings"
> says:
> 
> "For munmap(), addr, and length must both be a multiple of the
> underlying huge page size."
> 
> Should we apply the same rule to MADV_DONTNEED? Thanks.

madvise() already has different rules than mmap() for ordinary pages, so
we'd much rather try staying consistent with madvise() rules.
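
To make the difference in conventions concrete, a hedged sketch follows:
a 2 MB huge page size is assumed and 'base' is a hypothetical pointer to
the start of a two-huge-page MAP_HUGETLB mapping.  The madvise() behavior
is as proposed in this patch; the munmap() behavior is as documented in
the mmap(2) NOTES quoted above.

#include <stdio.h>
#include <sys/mman.h>

#define MB      (1024UL * 1024)

static void contrast_rules(char *base)
{
        /*
         * munmap(): addr and length must both be multiples of the
         * underlying huge page size, so an unaligned length is rejected.
         */
        if (munmap(base, 3 * MB))
                perror("munmap, 3 MB is not a multiple of 2 MB");

        /*
         * madvise(MADV_DONTNEED) with this patch follows the existing
         * madvise() convention instead: the start must be huge page
         * aligned, but the length is rounded up to the next huge page
         * boundary, so 3 MB is treated as 4 MB here.
         */
        if (madvise(base, 3 * MB, MADV_DONTNEED))
                perror("madvise(MADV_DONTNEED)");
}
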
Yin Fengwei Feb. 18, 2022, 12:39 a.m. UTC | #3
On 2/17/2022 4:58 PM, David Hildenbrand wrote:
>> In the mmap() man page, the NOTE for "Huge page (Huge TLB) mappings"
>> says:
>>
>> "For munmap(), addr, and length must both be a multiple of the
>> underlying huge page size."
>>
>> Should we apply the same rule to MADV_DONTNEED? Thanks.
> madvise() already has different rules than mmap() for ordinary pages, so
> we'd much rather try staying consistent with madvise() rules.

Thanks for the clarification.

Regards
Yin, Fengwei
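
For reference, a worked example of the alignment check and rounding done by
the new madvise_dontneed_free_valid_vma() in the patch below, assuming a
2 MB huge page size (the addresses are purely illustrative):

  huge_page_size(h) = 0x200000
  huge_page_mask(h) = ~0x1fffff, so start & ~huge_page_mask(h) is the
                      offset of start within its huge page

  start = 0x7f0000400000 -> start & 0x1fffff = 0       -> aligned, allowed
  start = 0x7f0000401000 -> start & 0x1fffff = 0x1000  -> misaligned, -EINVAL

  end   = 0x7f0000500000 -> ALIGN(end, 0x200000) = 0x7f0000600000, i.e. a
          1 MB request starting at 0x7f0000400000 is rounded up to 2 MB
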

Patch

diff --git a/mm/madvise.c b/mm/madvise.c
index bed872a2ad5f..ede6affa1350 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -554,9 +554,14 @@  static void madvise_cold_page_range(struct mmu_gather *tlb,
 	tlb_end_vma(tlb, vma);
 }
 
+static inline bool can_madv_lru_non_huge_vma(struct vm_area_struct *vma)
+{
+	return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP));
+}
+
 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
 {
-	return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
+	return can_madv_lru_non_huge_vma(vma) && !is_vm_hugetlb_page(vma);
 }
 
 static long madvise_cold(struct vm_area_struct *vma,
@@ -829,6 +834,23 @@  static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
 	return 0;
 }
 
+static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
+					    unsigned long start,
+					    unsigned long *end,
+					    int behavior)
+{
+	if (!is_vm_hugetlb_page(vma))
+		return can_madv_lru_non_huge_vma(vma);
+
+	if (behavior != MADV_DONTNEED)
+		return false;
+	if (start & ~huge_page_mask(hstate_vma(vma)))
+		return false;
+
+	*end = ALIGN(*end, huge_page_size(hstate_vma(vma)));
+	return true;
+}
+
 static long madvise_dontneed_free(struct vm_area_struct *vma,
 				  struct vm_area_struct **prev,
 				  unsigned long start, unsigned long end,
@@ -837,7 +859,7 @@  static long madvise_dontneed_free(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 
 	*prev = vma;
-	if (!can_madv_lru_vma(vma))
+	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
 		return -EINVAL;
 
 	if (!userfaultfd_remove(vma, start, end)) {
@@ -859,7 +881,12 @@  static long madvise_dontneed_free(struct vm_area_struct *vma,
 			 */
 			return -ENOMEM;
 		}
-		if (!can_madv_lru_vma(vma))
+		/*
+		 * Potential end adjustment for hugetlb vma is OK as
+		 * the check below keeps end within vma.
+		 */
+		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
+						     behavior))
 			return -EINVAL;
 		if (end > vma->vm_end) {
 			/*