Message ID | 20230925203030.703439-3-riel@surriel.com (mailing list archive) |
---|---|
State | New |
Series | hugetlbfs: close race between MADV_DONTNEED and page fault |
On 09/25/23 16:28, riel@surriel.com wrote:
> From: Rik van Riel <riel@surriel.com>
>
> Malloc libraries, like jemalloc and tcmalloc, take decisions on when
> to call madvise independently from the code in the main application.
>
> This sometimes results in the application page faulting on an address,
> right after the malloc library has shot down the backing memory with
> MADV_DONTNEED.
>
> Usually this is harmless, because we always have some 4kB pages
> sitting around to satisfy a page fault. However, with hugetlbfs
> systems often allocate only the exact number of huge pages that
> the application wants.
>
> Due to TLB batching, hugetlbfs MADV_DONTNEED will free pages outside of
> any lock taken on the page fault path, which can open up the following
> race condition:
>
>        CPU 1                            CPU 2
>
>        MADV_DONTNEED
>        unmap page
>        shoot down TLB entry
>
>                                         page fault
>                                         fail to allocate a huge page
>                                         killed with SIGBUS
>
>        free page
>
> Fix that race by pulling the locking from __unmap_hugepage_range_final
> into helper functions called from zap_page_range_single. This ensures
> page faults stay locked out of the MADV_DONTNEED VMA until the
> huge pages have actually been freed.
>
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>  include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
>  mm/hugetlb.c            | 20 +++++++++++---------
>  mm/memory.c             |  7 +++----
>  3 files changed, 47 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 694928fa06a3..d9ec500cfef9 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>  void unmap_hugepage_range(struct vm_area_struct *,
>                            unsigned long, unsigned long, struct page *,
>                            zap_flags_t);
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> +void __unmap_hugepage_range(struct mmu_gather *tlb,
>                            struct vm_area_struct *vma,
>                            unsigned long start, unsigned long end,
>                            struct page *ref_page, zap_flags_t zap_flags);
> @@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>  void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>                                  unsigned long *start, unsigned long *end);
>
> +extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
> +                                unsigned long *begin, unsigned long *end);
> +extern void __hugetlb_zap_end(struct vm_area_struct *vma,
> +                                struct zap_details *details);
> +
> +static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
> +                                     unsigned long *start, unsigned long *end)
> +{
> +        if (is_vm_hugetlb_page(vma))
> +                __hugetlb_zap_begin(vma, start, end);
> +}
> +
> +static inline void hugetlb_zap_end(struct vm_area_struct *vma,
> +                                   struct zap_details *details)
> +{
> +        if (is_vm_hugetlb_page(vma))
> +                __hugetlb_zap_end(vma, details);
> +}
> +
>  void hugetlb_vma_lock_read(struct vm_area_struct *vma);
>  void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
>  void hugetlb_vma_lock_write(struct vm_area_struct *vma);
> @@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible(
>  {
>  }
>
> +static inline void hugetlb_zap_begin(
> +                                struct vm_area_struct *vma,
> +                                unsigned long *start, unsigned long *end)
> +{
> +}
> +
> +static inline void hugetlb_zap_end(
> +                                struct vm_area_struct *vma,
> +                                struct zap_details *details)
> +{
> +}
> +
>  static inline struct page *hugetlb_follow_page_mask(
>          struct vm_area_struct *vma, unsigned long address, unsigned int flags,
>          unsigned int *page_mask)
> @@ -442,7 +473,7 @@ static inline long hugetlb_change_protection(
>          return 0;
>  }
>
> -static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> +static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
>                          struct vm_area_struct *vma, unsigned long start,
>                          unsigned long end, struct page *ref_page,
>                          zap_flags_t zap_flags)
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e859fba5bc7d..5f8b82e902a8 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>          return len + old_addr - old_end;
>  }
>
> -static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> -                                   unsigned long start, unsigned long end,
> -                                   struct page *ref_page, zap_flags_t zap_flags)
> +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
> +                            unsigned long start, unsigned long end,
> +                            struct page *ref_page, zap_flags_t zap_flags)
>  {
>          struct mm_struct *mm = vma->vm_mm;
>          unsigned long address;
> @@ -5435,16 +5435,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
>          tlb_flush_mmu_tlbonly(tlb);
>  }
>
> -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> -                          struct vm_area_struct *vma, unsigned long start,
> -                          unsigned long end, struct page *ref_page,
> -                          zap_flags_t zap_flags)
> +void __hugetlb_zap_begin(struct vm_area_struct *vma,
> +                         unsigned long *start, unsigned long *end)
>  {
> +        adjust_range_if_pmd_sharing_possible(vma, start, end);
>          hugetlb_vma_lock_write(vma);
>          i_mmap_lock_write(vma->vm_file->f_mapping);
> +}

__unmap_hugepage_range_final() was called from unmap_single_vma.
unmap_single_vma has two callers, zap_page_range_single and unmap_vmas.

When the locking was moved into hugetlb_zap_begin, it was only added to
the zap_page_range_single call path. Calls from unmap_vmas are missing
the locking.
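[Editorial note: the following is not part of the thread. It is a minimal userspace sketch of the race the quoted commit message describes, assuming 2MB huge pages and a hugetlb pool sized to exactly what the program maps (e.g. nr_hugepages=1); the names `zapper` and `buf` are ours. Build with `-pthread`. On an unpatched kernel the faulting thread can hit the window between "unmap page" and "free page" and be killed with SIGBUS.]

```c
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL * 1024 * 1024)  /* assumes 2MB huge pages */

static char *buf;

/* Plays the role of the malloc library: shoot down the backing memory. */
static void *zapper(void *arg)
{
        (void)arg;
        for (;;)
                madvise(buf, HPAGE_SIZE, MADV_DONTNEED);
        return NULL;
}

int main(void)
{
        pthread_t t;

        buf = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap(MAP_HUGETLB)");
                return 1;
        }

        pthread_create(&t, NULL, zapper, NULL);

        /*
         * Each write faults the huge page back in.  If the pool holds only
         * this one page and the fault lands after the unmap but before the
         * batched free, the allocation fails and the process gets SIGBUS.
         */
        for (;;)
                buf[0] = 1;
}
```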
On Mon, 2023-09-25 at 15:25 -0700, Mike Kravetz wrote:
> On 09/25/23 16:28, riel@surriel.com wrote:
> >
> > -void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> > -                          struct vm_area_struct *vma, unsigned long start,
> > -                          unsigned long end, struct page *ref_page,
> > -                          zap_flags_t zap_flags)
> > +void __hugetlb_zap_begin(struct vm_area_struct *vma,
> > +                         unsigned long *start, unsigned long *end)
> >  {
> > +        adjust_range_if_pmd_sharing_possible(vma, start, end);
> >          hugetlb_vma_lock_write(vma);
> >          i_mmap_lock_write(vma->vm_file->f_mapping);
> > +}
>
> __unmap_hugepage_range_final() was called from unmap_single_vma.
> unmap_single_vma has two callers, zap_page_range_single and unmap_vmas.
>
> When the locking was moved into hugetlb_zap_begin, it was only added to
> the zap_page_range_single call path. Calls from unmap_vmas are missing
> the locking.

Oh, that's a fun one.

I suppose taking the f_mapping lock, and calling
adjust_range_if_pmd_sharing_possible, matters for the call from
unmap_vmas, while the call to hugetlb_vma_lock_write really doesn't
matter, since unmap_vmas is called with the mmap_sem held for write,
which already excludes page faults.

I'll add the call there for v4. Good catch.
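[Editorial note: the v4 change is not shown in this thread. The sketch below only illustrates the direction Rik describes, paraphrasing the unmap_vmas() loop in mm/memory.c of that era (the `mas`, `tree_end` and `mm_wr_locked` names come from that code); the actual v4 patch may differ.]

```c
/*
 * Paraphrased sketch of the unmap_vmas() loop, bracketing each hugetlb VMA
 * with the new helpers so this path also gets the shared-PMD range
 * adjustment and the i_mmap lock.  Illustrative only, not the real v4.
 */
do {
        unsigned long start = start_addr;
        unsigned long end = end_addr;

        /* May widen start/end for shared PMDs and takes the hugetlb locks. */
        hugetlb_zap_begin(vma, &start, &end);
        /* Still unmap only the caller's range, as zap_page_range_single() does. */
        unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked);
        hugetlb_zap_end(vma, &details);
} while ((vma = mas_find(mas, tree_end - 1)) != NULL);
```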
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 694928fa06a3..d9ec500cfef9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -139,7 +139,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
 void unmap_hugepage_range(struct vm_area_struct *,
                           unsigned long, unsigned long, struct page *,
                           zap_flags_t);
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+void __unmap_hugepage_range(struct mmu_gather *tlb,
                           struct vm_area_struct *vma,
                           unsigned long start, unsigned long end,
                           struct page *ref_page, zap_flags_t zap_flags);
@@ -246,6 +246,25 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                 unsigned long *start, unsigned long *end);
 
+extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
+                                unsigned long *begin, unsigned long *end);
+extern void __hugetlb_zap_end(struct vm_area_struct *vma,
+                                struct zap_details *details);
+
+static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
+                                     unsigned long *start, unsigned long *end)
+{
+        if (is_vm_hugetlb_page(vma))
+                __hugetlb_zap_begin(vma, start, end);
+}
+
+static inline void hugetlb_zap_end(struct vm_area_struct *vma,
+                                   struct zap_details *details)
+{
+        if (is_vm_hugetlb_page(vma))
+                __hugetlb_zap_end(vma, details);
+}
+
 void hugetlb_vma_lock_read(struct vm_area_struct *vma);
 void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
 void hugetlb_vma_lock_write(struct vm_area_struct *vma);
@@ -297,6 +316,18 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static inline void hugetlb_zap_begin(
+                                struct vm_area_struct *vma,
+                                unsigned long *start, unsigned long *end)
+{
+}
+
+static inline void hugetlb_zap_end(
+                                struct vm_area_struct *vma,
+                                struct zap_details *details)
+{
+}
+
 static inline struct page *hugetlb_follow_page_mask(
         struct vm_area_struct *vma, unsigned long address, unsigned int flags,
         unsigned int *page_mask)
@@ -442,7 +473,7 @@ static inline long hugetlb_change_protection(
         return 0;
 }
 
-static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
+static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
                         struct vm_area_struct *vma, unsigned long start,
                         unsigned long end, struct page *ref_page,
                         zap_flags_t zap_flags)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e859fba5bc7d..5f8b82e902a8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5306,9 +5306,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
         return len + old_addr - old_end;
 }
 
-static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
-                                   unsigned long start, unsigned long end,
-                                   struct page *ref_page, zap_flags_t zap_flags)
+void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                            unsigned long start, unsigned long end,
+                            struct page *ref_page, zap_flags_t zap_flags)
 {
         struct mm_struct *mm = vma->vm_mm;
         unsigned long address;
@@ -5435,16 +5435,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
         tlb_flush_mmu_tlbonly(tlb);
 }
 
-void __unmap_hugepage_range_final(struct mmu_gather *tlb,
-                          struct vm_area_struct *vma, unsigned long start,
-                          unsigned long end, struct page *ref_page,
-                          zap_flags_t zap_flags)
+void __hugetlb_zap_begin(struct vm_area_struct *vma,
+                         unsigned long *start, unsigned long *end)
 {
+        adjust_range_if_pmd_sharing_possible(vma, start, end);
         hugetlb_vma_lock_write(vma);
         i_mmap_lock_write(vma->vm_file->f_mapping);
+}
 
-        /* mmu notification performed in caller */
-        __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
+void __hugetlb_zap_end(struct vm_area_struct *vma,
+                       struct zap_details *details)
+{
+        zap_flags_t zap_flags = details ? details->zap_flags : 0;
 
         if (zap_flags & ZAP_FLAG_UNMAP) {       /* final unmap */
                 /*
diff --git a/mm/memory.c b/mm/memory.c
index 6c264d2f969c..a07ae3b60530 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1683,7 +1683,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                 if (vma->vm_file) {
                         zap_flags_t zap_flags = details ?
                             details->zap_flags : 0;
-                        __unmap_hugepage_range_final(tlb, vma, start, end,
+                        __unmap_hugepage_range(tlb, vma, start, end,
                                                      NULL, zap_flags);
                 }
         } else
@@ -1753,9 +1753,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
         lru_add_drain();
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                 address, end);
-        if (is_vm_hugetlb_page(vma))
-                adjust_range_if_pmd_sharing_possible(vma, &range.start,
-                                                     &range.end);
+        hugetlb_zap_begin(vma, &range.start, &range.end);
         tlb_gather_mmu(&tlb, vma->vm_mm);
         update_hiwater_rss(vma->vm_mm);
         mmu_notifier_invalidate_range_start(&range);
@@ -1766,6 +1764,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
         unmap_single_vma(&tlb, vma, address, end, details, false);
         mmu_notifier_invalidate_range_end(&range);
         tlb_finish_mmu(&tlb);
+        hugetlb_zap_end(vma, details);
 }
 
 /**
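[Editorial note: to make the ordering explicit, here is a condensed, annotated paraphrase of zap_page_range_single() as it looks with this patch applied. It is not a literal copy of mm/memory.c: lru_add_drain(), update_hiwater_rss() and error handling are omitted, and the unlock calls in __hugetlb_zap_end() live in the unchanged tail of the old function, so they are not visible in the hunks above.]

```c
/* Condensed paraphrase of zap_page_range_single() after this patch. */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                           unsigned long size, struct zap_details *details)
{
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
        unsigned long end = address + size;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
        /* Takes hugetlb_vma_lock_write() + i_mmap_lock_write(); hugetlb
         * page faults on this VMA now block. */
        hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        /* Batched TLB flush; the huge pages are actually freed here. */
        tlb_finish_mmu(&tlb);
        /* Locks are dropped only after the free, closing the SIGBUS window. */
        hugetlb_zap_end(vma, details);
}
```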