Message ID | 20231006040020.3677377-3-riel@surriel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | hugetlbfs: close race between MADV_DONTNEED and page fault | expand |
On Thu, Oct 05, 2023 at 11:59:07PM -0400, riel@surriel.com wrote: > From: Rik van Riel <riel@surriel.com> > > Extend the locking scheme used to protect shared hugetlb mappings > from truncate vs page fault races, in order to protect private > hugetlb mappings (with resv_map) against MADV_DONTNEED. > > Add a read-write semaphore to the resv_map data structure, and > use that from the hugetlb_vma_(un)lock_* functions, in preparation > for closing the race between MADV_DONTNEED and page faults. > > Signed-off-by: Rik van Riel <riel@surriel.com> > Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> > Cc: stable@kernel.org > Fixes: 04ada095dcfc ("hugetlb: don't delete vma_lock in hugetlb MADV_DONTNEED processing") > --- > include/linux/hugetlb.h | 6 ++++++ > mm/hugetlb.c | 41 +++++++++++++++++++++++++++++++++++++---- > 2 files changed, 43 insertions(+), 4 deletions(-) > > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 5b2626063f4f..694928fa06a3 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -60,6 +60,7 @@ struct resv_map { > long adds_in_progress; > struct list_head region_cache; > long region_cache_count; > + struct rw_semaphore rw_sema; > #ifdef CONFIG_CGROUP_HUGETLB > /* > * On private mappings, the counter to uncharge reservations is stored > @@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) > return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; > } > > +static inline bool __vma_private_lock(struct vm_area_struct *vma) > +{ > + return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; > +} > + > /* > * Safe version of huge_pte_offset() to check the locks. See comments > * above huge_pte_offset(). 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index a86e070d735b..dd3de6ec8f1a 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); > static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); > static void hugetlb_unshare_pmds(struct vm_area_struct *vma, > unsigned long start, unsigned long end); > +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); > > static inline bool subpool_is_free(struct hugepage_subpool *spool) > { > @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > down_read(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + down_read(&resv_map->rw_sema); > } > } +Ackerley +Oscar I'm reading the resv code recently and just stumbled upon this. So want to raise this question. IIUC __vma_private_lock() will return false for MAP_PRIVATE hugetlb vma if the vma is dup()ed from a fork(), with/without commit 187da0f8250a ("hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write") which fixed a slightly different issue. The problem is the current vma lock for private mmap() is based on the resv map, and the resv map only belongs to the process that mmap()ed this private vma. E.g. dup_mmap() has: if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); Which does: if (vma->vm_flags & VM_MAYSHARE) { ... } else vma->vm_private_data = NULL; <--------------------- So even if I don't know how many of us are even using hugetlb PRIVATE + fork(), assuming that's the most controversial use case that I'm aware of on hugetlb that people complains about.. with some tricky changes like 04f2cbe35699.. Just still want to raise this pure question, that after a fork() on private vma, and if I read it alright, lock/unlock operations may become noop.. 
Thanks, > > @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > up_read(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + up_read(&resv_map->rw_sema); > } > } > > @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > down_write(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + down_write(&resv_map->rw_sema); > } > } > > @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > up_write(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + up_write(&resv_map->rw_sema); > } > } > > int hugetlb_vma_trylock_write(struct vm_area_struct *vma) > { > - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > - if (!__vma_shareable_lock(vma)) > - return 1; > + if (__vma_shareable_lock(vma)) { > + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > - return down_write_trylock(&vma_lock->rw_sema); > + return down_write_trylock(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + return down_write_trylock(&resv_map->rw_sema); > + } > + > + return 1; > } > > void hugetlb_vma_assert_locked(struct vm_area_struct *vma) > @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > lockdep_assert_held(&vma_lock->rw_sema); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + lockdep_assert_held(&resv_map->rw_sema); > } > } > > @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct 
vm_area_struct *vma) > struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; > > __hugetlb_vma_unlock_write_put(vma_lock); > + } else if (__vma_private_lock(vma)) { > + struct resv_map *resv_map = vma_resv_map(vma); > + > + /* no free for anon vmas, but still need to unlock */ > + up_write(&resv_map->rw_sema); > } > } > > @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void) > kref_init(&resv_map->refs); > spin_lock_init(&resv_map->lock); > INIT_LIST_HEAD(&resv_map->regions); > + init_rwsem(&resv_map->rw_sema); > > resv_map->adds_in_progress = 0; > /* > -- > 2.41.0 > >
On Thu, Nov 07, 2024 at 03:18:51PM -0500, Peter Xu wrote: > +Ackerley +Oscar > > I'm reading the resv code recently and just stumbled upon this. So want to > raise this question. > > IIUC __vma_private_lock() will return false for MAP_PRIVATE hugetlb vma if > the vma is dup()ed from a fork(), with/without commit 187da0f8250a > ("hugetlb: fix null-ptr-deref in hugetlb_vma_lock_write") which fixed a > slightly different issue. > > The problem is the current vma lock for private mmap() is based on the resv > map, and the resv map only belongs to the process that mmap()ed this > private vma. E.g. dup_mmap() has: > > if (is_vm_hugetlb_page(tmp)) > hugetlb_dup_vma_private(tmp); > > Which does: > > if (vma->vm_flags & VM_MAYSHARE) { > ... > } else > vma->vm_private_data = NULL; <--------------------- > > So even if I don't know how many of us are even using hugetlb PRIVATE + > fork(), assuming that's the most controversial use case that I'm aware of > on hugetlb that people complains about.. with some tricky changes like > 04f2cbe35699.. Just still want to raise this pure question, that after a > fork() on private vma, and if I read it alright, lock/unlock operations may > become noop.. I have been taking a look at this, and yes, __vma_private_lock will return false for private hugetlb mappings that were forked . I quickly checked what protects what and we currently have: hugetlb_vma_lock_read - copy_hugetlb_page_range (only sharing) hugetlb_vma_lock_read - hugetlb_wp (only for HPAGE_RESV_OWNER) hugetlb_vma_lock_read - hugetlb_fault , protects huge_pmd_unshare? 
hugetlb_vma_lock_read - pagewalks hugetlb_vma_lock_write - hugetlb_change_protection hugetlb_vma_lock_write - hugetlb_unshare_pmds hugetlb_vma_lock_write - move_hugetlb_page_tables hugetlb_vma_lock_write - _hugetlb_zap_begin (unmap_vmas) the ones taking the hugetlb_vma_lock in write (so, the last four) also take the i_mmap_lock_write (vma->vm_file->f_mapping), and AFAIK, hugetlb mappings, private or not, should have vma->vm_file->f_mapping set. Which means that technically we cannot race between hugetlb_change_protection and move_hugetlb_page_tables etc. But, checking commit bf4916922c60f43efaa329744b3eef539aa6a2b2 Author: Rik van Riel <riel@surriel.com> Date: Thu Oct 5 23:59:07 2023 -0400 hugetlbfs: extend hugetlb_vma_lock to private VMAs whose motivation was to protect MADV_DONTNEED vs page_faults, I do not see how it gets protected with private hugetlb mappings that were dupped (forked). madvise_dontneed_single_vma zap_page_range_single _hugetlb_zap_begin hugetlb_vma_lock_write - noop for mappings that do not own the reservation i_mmap_lock_write But the hugetlb_fault path only takes hugetlb_vma_lock_*, so theoretically we still could race between page_fault vs madvise_dontneed_single_vma? A quick way to prove it would be to map a hugetlb private mapping, fork it, and have two threads, one trying to madvise(MADV_DONTNEED) and the other trying to write to it? I do not know, maybe we are protected in some other way I cannot see right now. I will have a look.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b2626063f4f..694928fa06a3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -60,6 +60,7 @@ struct resv_map { long adds_in_progress; struct list_head region_cache; long region_cache_count; + struct rw_semaphore rw_sema; #ifdef CONFIG_CGROUP_HUGETLB /* * On private mappings, the counter to uncharge reservations is stored @@ -1231,6 +1232,11 @@ static inline bool __vma_shareable_lock(struct vm_area_struct *vma) return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; } +static inline bool __vma_private_lock(struct vm_area_struct *vma) +{ + return (!(vma->vm_flags & VM_MAYSHARE)) && vma->vm_private_data; +} + /* * Safe version of huge_pte_offset() to check the locks. See comments * above huge_pte_offset(). diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a86e070d735b..dd3de6ec8f1a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,6 +97,7 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); +static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -267,6 +268,10 @@ void hugetlb_vma_lock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_read(&resv_map->rw_sema); } } @@ -276,6 +281,10 @@ void hugetlb_vma_unlock_read(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_read(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_read(&resv_map->rw_sema); } } @@ -285,6 +294,10 @@ void hugetlb_vma_lock_write(struct vm_area_struct *vma) struct 
hugetlb_vma_lock *vma_lock = vma->vm_private_data; down_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + down_write(&resv_map->rw_sema); } } @@ -294,17 +307,27 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; up_write(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + up_write(&resv_map->rw_sema); } } int hugetlb_vma_trylock_write(struct vm_area_struct *vma) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - if (!__vma_shareable_lock(vma)) - return 1; + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - return down_write_trylock(&vma_lock->rw_sema); + return down_write_trylock(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + return down_write_trylock(&resv_map->rw_sema); + } + + return 1; } void hugetlb_vma_assert_locked(struct vm_area_struct *vma) @@ -313,6 +336,10 @@ void hugetlb_vma_assert_locked(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; lockdep_assert_held(&vma_lock->rw_sema); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + lockdep_assert_held(&resv_map->rw_sema); } } @@ -345,6 +372,11 @@ static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; __hugetlb_vma_unlock_write_put(vma_lock); + } else if (__vma_private_lock(vma)) { + struct resv_map *resv_map = vma_resv_map(vma); + + /* no free for anon vmas, but still need to unlock */ + up_write(&resv_map->rw_sema); } } @@ -1068,6 +1100,7 @@ struct resv_map *resv_map_alloc(void) kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + init_rwsem(&resv_map->rw_sema); resv_map->adds_in_progress = 
0; /*