Message ID | 5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | mm/vma: make more mmap logic userland testable | expand |
On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote: >Right now fs/exec.c invokes expand_downwards(), an otherwise internal >implementation detail of the VMA logic in order to ensure that an arg page >can be obtained by get_user_pages_remote(). > >In order to be able to move the stack expansion logic into mm/vma.c in >order to make it available to userland testing we need to find an Looks the second "in order" is not necessary. Not a native speaker, just my personal feeling. >alternative approach here. > >We do so by providing the mmap_read_lock_maybe_expand() function which also >helpfully documents what get_arg_page() is doing here and adds an >additional check against VM_GROWSDOWN to make explicit that the stack >expansion logic is only invoked when the VMA is indeed a downward-growing >stack. > >This allows expand_downwards() to become a static function. > >Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be >currently user-visible in any way, that is place within an rmap or VMA >tree. It must be a newly allocated VMA. > >This is the case when exec invokes this function. > >Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> >--- > fs/exec.c | 14 +++--------- > include/linux/mm.h | 5 ++--- > mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 58 insertions(+), 15 deletions(-) > >diff --git a/fs/exec.c b/fs/exec.c >index 98cb7ba9983c..1e1f79c514de 100644 >--- a/fs/exec.c >+++ b/fs/exec.c >@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, > /* > * Avoid relying on expanding the stack down in GUP (which > * does not work for STACK_GROWSUP anyway), and just do it >- * by hand ahead of time. >+ * ahead of time. > */ >- if (write && pos < vma->vm_start) { >- mmap_write_lock(mm); >- ret = expand_downwards(vma, pos); >- if (unlikely(ret < 0)) { >- mmap_write_unlock(mm); >- return NULL; >- } >- mmap_write_downgrade(mm); >- } else >- mmap_read_lock(mm); >+ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) >+ return NULL; > > /* > * We are doing an exec(). 'current' is the process >diff --git a/include/linux/mm.h b/include/linux/mm.h >index 4eb8e62d5c67..48312a934454 100644 >--- a/include/linux/mm.h >+++ b/include/linux/mm.h >@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); > extern void exit_mmap(struct mm_struct *); > int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, >+ unsigned long addr, bool write); > > static inline int check_data_rlimit(unsigned long rlim, > unsigned long new, >@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap; > int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); > struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); > >-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ >-int expand_downwards(struct vm_area_struct *vma, unsigned long address); >- > /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ > extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); > extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, >diff --git a/mm/mmap.c b/mm/mmap.c >index f053de1d6fae..4df38d3717ff 100644 >--- a/mm/mmap.c >+++ b/mm/mmap.c >@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) > * vma is the first one with address < vma->vm_start. Have to extend vma. > * mmap_lock held for writing. > */ >-int expand_downwards(struct vm_area_struct *vma, unsigned long address) >+static int expand_downwards(struct vm_area_struct *vma, unsigned long address) > { > struct mm_struct *mm = vma->vm_mm; > struct vm_area_struct *prev; >@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) > /* Shrink the vma to just the new range */ > return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); > } >+ >+#ifdef CONFIG_MMU >+/* >+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the >+ * start of the VMA, the intent is to perform a write, and it is a >+ * downward-growing stack, then attempt to expand the stack to contain it. >+ * >+ * This function is intended only for obtaining an argument page from an ELF >+ * image, and is almost certainly NOT what you want to use for any other >+ * purpose. >+ * >+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the >+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a >+ * new VMA being mapped. >+ * >+ * The function assumes that addr is either contained within the VMA or below >+ * it, and makes no attempt to validate this value beyond that. >+ * >+ * Returns true if the read lock was obtained and a stack was perhaps expanded, >+ * false if the stack expansion failed. >+ * >+ * On stack expansion the function temporarily acquires an mmap write lock >+ * before downgrading it. >+ */ >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, >+ struct vm_area_struct *new_vma, >+ unsigned long addr, bool write) >+{ >+ if (!write || addr >= new_vma->vm_start) { >+ mmap_read_lock(mm); >+ return true; >+ } >+ >+ if (!(new_vma->vm_flags & VM_GROWSDOWN)) >+ return false; >+ In expand_downwards() we have this checked. Maybe we just leave this done in one place is enough? >+ mmap_write_lock(mm); >+ if (expand_downwards(new_vma, addr)) { >+ mmap_write_unlock(mm); >+ return false; >+ } >+ >+ mmap_write_downgrade(mm); >+ return true; >+} >+#else >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, >+ unsigned long addr, bool write) >+{ >+ return false; >+} >+#endif >-- >2.47.1 >
On Thu, Dec 05, 2024 at 12:18:19AM +0000, Wei Yang wrote: > On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote: > >Right now fs/exec.c invokes expand_downwards(), an otherwise internal > >implementation detail of the VMA logic in order to ensure that an arg page > >can be obtained by get_user_pages_remote(). > > > >In order to be able to move the stack expansion logic into mm/vma.c in > >order to make it available to userland testing we need to find an > > Looks the second "in order" is not necessary. > > Not a native speaker, just my personal feeling. > > >alternative approach here. > > > >We do so by providing the mmap_read_lock_maybe_expand() function which also > >helpfully documents what get_arg_page() is doing here and adds an > >additional check against VM_GROWSDOWN to make explicit that the stack > >expansion logic is only invoked when the VMA is indeed a downward-growing > >stack. > > > >This allows expand_downwards() to become a static function. > > > >Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be > >currently user-visible in any way, that is place within an rmap or VMA > >tree. It must be a newly allocated VMA. > > > >This is the case when exec invokes this function. > > > >Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > >--- > > fs/exec.c | 14 +++--------- > > include/linux/mm.h | 5 ++--- > > mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- > > 3 files changed, 58 insertions(+), 15 deletions(-) > > > >diff --git a/fs/exec.c b/fs/exec.c > >index 98cb7ba9983c..1e1f79c514de 100644 > >--- a/fs/exec.c > >+++ b/fs/exec.c > >@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, > > /* > > * Avoid relying on expanding the stack down in GUP (which > > * does not work for STACK_GROWSUP anyway), and just do it > >- * by hand ahead of time. > >+ * ahead of time. > > */ > >- if (write && pos < vma->vm_start) { > >- mmap_write_lock(mm); > >- ret = expand_downwards(vma, pos); > >- if (unlikely(ret < 0)) { > >- mmap_write_unlock(mm); > >- return NULL; > >- } > >- mmap_write_downgrade(mm); > >- } else > >- mmap_read_lock(mm); > >+ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) > >+ return NULL; > > > > /* > > * We are doing an exec(). 'current' is the process > >diff --git a/include/linux/mm.h b/include/linux/mm.h > >index 4eb8e62d5c67..48312a934454 100644 > >--- a/include/linux/mm.h > >+++ b/include/linux/mm.h > >@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi > > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); > > extern void exit_mmap(struct mm_struct *); > > int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, > >+ unsigned long addr, bool write); > > > > static inline int check_data_rlimit(unsigned long rlim, > > unsigned long new, > >@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap; > > int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); > > struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); > > > >-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ > >-int expand_downwards(struct vm_area_struct *vma, unsigned long address); > >- > > /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ > > extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); > > extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, > >diff --git a/mm/mmap.c b/mm/mmap.c > >index f053de1d6fae..4df38d3717ff 100644 > >--- a/mm/mmap.c > >+++ b/mm/mmap.c > >@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) > > * vma is the first one with address < vma->vm_start. Have to extend vma. > > * mmap_lock held for writing. > > */ > >-int expand_downwards(struct vm_area_struct *vma, unsigned long address) > >+static int expand_downwards(struct vm_area_struct *vma, unsigned long address) > > { > > struct mm_struct *mm = vma->vm_mm; > > struct vm_area_struct *prev; > >@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) > > /* Shrink the vma to just the new range */ > > return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); > > } > >+ > >+#ifdef CONFIG_MMU > >+/* > >+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the > >+ * start of the VMA, the intent is to perform a write, and it is a > >+ * downward-growing stack, then attempt to expand the stack to contain it. > >+ * > >+ * This function is intended only for obtaining an argument page from an ELF > >+ * image, and is almost certainly NOT what you want to use for any other > >+ * purpose. > >+ * > >+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the > >+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a > >+ * new VMA being mapped. > >+ * > >+ * The function assumes that addr is either contained within the VMA or below > >+ * it, and makes no attempt to validate this value beyond that. > >+ * > >+ * Returns true if the read lock was obtained and a stack was perhaps expanded, > >+ * false if the stack expansion failed. > >+ * > >+ * On stack expansion the function temporarily acquires an mmap write lock > >+ * before downgrading it. > >+ */ > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, > >+ struct vm_area_struct *new_vma, > >+ unsigned long addr, bool write) > >+{ > >+ if (!write || addr >= new_vma->vm_start) { > >+ mmap_read_lock(mm); > >+ return true; > >+ } > >+ > >+ if (!(new_vma->vm_flags & VM_GROWSDOWN)) > >+ return false; > >+ > > In expand_downwards() we have this checked. > > Maybe we just leave this done in one place is enough? Wei, I feel like I have repeated myself about 'mathematically smallest code' rather too many times at this stage. Doing an unsolicited drive-by review applying this concept, which I have roundly and clearly rejected, is not appreciated. At any rate, we are checking this _before the mmap lock is acquired_. It is also self-documenting. Please try to take on board the point that there are many factors when it comes to writing kernel code, aversion to possibly generated branches being only one of them. > > >+ mmap_write_lock(mm); > >+ if (expand_downwards(new_vma, addr)) { > >+ mmap_write_unlock(mm); > >+ return false; > >+ } > >+ > >+ mmap_write_downgrade(mm); > >+ return true; > >+} > >+#else > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, > >+ unsigned long addr, bool write) > >+{ > >+ return false; > >+} > >+#endif > >-- > >2.47.1 > > > > -- > Wei Yang > Help you, Help me
On Thu, Dec 05, 2024 at 12:18:19AM +0000, Wei Yang wrote: > On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote: > >Right now fs/exec.c invokes expand_downwards(), an otherwise internal > >implementation detail of the VMA logic in order to ensure that an arg page > >can be obtained by get_user_pages_remote(). > > > >In order to be able to move the stack expansion logic into mm/vma.c in > >order to make it available to userland testing we need to find an > > Looks the second "in order" is not necessary. > > Not a native speaker, just my personal feeling. > > >alternative approach here. Sorry missed this one. You're right this is clunky (non-native speakers often find this better than native speakers to whom clunky turn of phrase can be more easily overlooked I imagine). Second 'in order to' should be 'to' really, but I'm not sure this is important enough to take pains to address, will fix if a respin is otherwise needed. > > > >We do so by providing the mmap_read_lock_maybe_expand() function which also > >helpfully documents what get_arg_page() is doing here and adds an > >additional check against VM_GROWSDOWN to make explicit that the stack > >expansion logic is only invoked when the VMA is indeed a downward-growing > >stack. > > > >This allows expand_downwards() to become a static function. > > > >Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be > >currently user-visible in any way, that is place within an rmap or VMA > >tree. It must be a newly allocated VMA. > > > >This is the case when exec invokes this function. > > > >Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > >--- > > fs/exec.c | 14 +++--------- > > include/linux/mm.h | 5 ++--- > > mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- > > 3 files changed, 58 insertions(+), 15 deletions(-) > > > >diff --git a/fs/exec.c b/fs/exec.c > >index 98cb7ba9983c..1e1f79c514de 100644 > >--- a/fs/exec.c > >+++ b/fs/exec.c > >@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, > > /* > > * Avoid relying on expanding the stack down in GUP (which > > * does not work for STACK_GROWSUP anyway), and just do it > >- * by hand ahead of time. > >+ * ahead of time. > > */ > >- if (write && pos < vma->vm_start) { > >- mmap_write_lock(mm); > >- ret = expand_downwards(vma, pos); > >- if (unlikely(ret < 0)) { > >- mmap_write_unlock(mm); > >- return NULL; > >- } > >- mmap_write_downgrade(mm); > >- } else > >- mmap_read_lock(mm); > >+ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) > >+ return NULL; > > > > /* > > * We are doing an exec(). 'current' is the process > >diff --git a/include/linux/mm.h b/include/linux/mm.h > >index 4eb8e62d5c67..48312a934454 100644 > >--- a/include/linux/mm.h > >+++ b/include/linux/mm.h > >@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi > > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); > > extern void exit_mmap(struct mm_struct *); > > int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, > >+ unsigned long addr, bool write); > > > > static inline int check_data_rlimit(unsigned long rlim, > > unsigned long new, > >@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap; > > int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); > > struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); > > > >-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ > >-int expand_downwards(struct vm_area_struct *vma, unsigned long address); > >- > > /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ > > extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); > > extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, > >diff --git a/mm/mmap.c b/mm/mmap.c > >index f053de1d6fae..4df38d3717ff 100644 > >--- a/mm/mmap.c > >+++ b/mm/mmap.c > >@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) > > * vma is the first one with address < vma->vm_start. Have to extend vma. > > * mmap_lock held for writing. > > */ > >-int expand_downwards(struct vm_area_struct *vma, unsigned long address) > >+static int expand_downwards(struct vm_area_struct *vma, unsigned long address) > > { > > struct mm_struct *mm = vma->vm_mm; > > struct vm_area_struct *prev; > >@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) > > /* Shrink the vma to just the new range */ > > return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); > > } > >+ > >+#ifdef CONFIG_MMU > >+/* > >+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the > >+ * start of the VMA, the intent is to perform a write, and it is a > >+ * downward-growing stack, then attempt to expand the stack to contain it. > >+ * > >+ * This function is intended only for obtaining an argument page from an ELF > >+ * image, and is almost certainly NOT what you want to use for any other > >+ * purpose. > >+ * > >+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the > >+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a > >+ * new VMA being mapped. > >+ * > >+ * The function assumes that addr is either contained within the VMA or below > >+ * it, and makes no attempt to validate this value beyond that. > >+ * > >+ * Returns true if the read lock was obtained and a stack was perhaps expanded, > >+ * false if the stack expansion failed. > >+ * > >+ * On stack expansion the function temporarily acquires an mmap write lock > >+ * before downgrading it. > >+ */ > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, > >+ struct vm_area_struct *new_vma, > >+ unsigned long addr, bool write) > >+{ > >+ if (!write || addr >= new_vma->vm_start) { > >+ mmap_read_lock(mm); > >+ return true; > >+ } > >+ > >+ if (!(new_vma->vm_flags & VM_GROWSDOWN)) > >+ return false; > >+ > > In expand_downwards() we have this checked. > > Maybe we just leave this done in one place is enough? > > >+ mmap_write_lock(mm); > >+ if (expand_downwards(new_vma, addr)) { > >+ mmap_write_unlock(mm); > >+ return false; > >+ } > >+ > >+ mmap_write_downgrade(mm); > >+ return true; > >+} > >+#else > >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, > >+ unsigned long addr, bool write) > >+{ > >+ return false; > >+} > >+#endif > >-- > >2.47.1 > > > > -- > Wei Yang > Help you, Help me
On Thu, Dec 05, 2024 at 07:01:14AM +0000, Lorenzo Stoakes wrote: [...] >> >> Maybe we just leave this done in one place is enough? > >Wei, I feel like I have repeated myself about 'mathematically smallest >code' rather too many times at this stage. Doing an unsolicited drive-by >review applying this concept, which I have roundly and clearly rejected, is >not appreciated. > Hi, Lorenzo I would apologize for introducing this un-pleasant mail. Would be more thoughtful next time. >At any rate, we are checking this _before the mmap lock is acquired_. It is >also self-documenting. > >Please try to take on board the point that there are many factors when it >comes to writing kernel code, aversion to possibly generated branches being >only one of them. > Thanks for this suggestion. I am trying to be as professional as you are. In case you have other suggestions, they are welcome.
On Sun, Dec 08, 2024 at 11:27:06AM +0000, Wei Yang wrote: > On Thu, Dec 05, 2024 at 07:01:14AM +0000, Lorenzo Stoakes wrote: > [...] > >> > >> Maybe we just leave this done in one place is enough? > > > >Wei, I feel like I have repeated myself about 'mathematically smallest > >code' rather too many times at this stage. Doing an unsolicited drive-by > >review applying this concept, which I have roundly and clearly rejected, is > >not appreciated. > > > > Hi, Lorenzo > > I would apologize for introducing this un-pleasant mail. Would be more > thoughtful next time. It wasn't unpleasant, don't worry! :) I'm just trying to be as clear as I can about this so as to avoid you spending time on things that aren't useful. On occasion I think, for clarity, it's important to be firm - otherwise if I were receptive even on things that I thought not worthwhile - you'd end up wasting your time doing work that might end up not being used. > > >At any rate, we are checking this _before the mmap lock is acquired_. It is > >also self-documenting. > > > >Please try to take on board the point that there are many factors when it > >comes to writing kernel code, aversion to possibly generated branches being > >only one of them. > > > > Thanks for this suggestion. > > I am trying to be as professional as you are. In case you have other > suggestions, they are welcome. Thanks, what I would say is that simply observing that we might duplicate some logic in a non-harmful way does not necessarily indicate that this should be changed. Obviously if you can evidence a _statistically significant_ performance impact then OF COURSE you should report something like this and send a patch for it (along side the evidence of the perf regression). But in general, if you feel the need to refactor just for the sake of eliminating branches or grouping code like this, it isn't helpful or useful. Refactorings can be very useful _in general_ (I have done a lot of work on things like this myself obviously), but it's important to assess the RoI - is the churn worth the benefit - does it make significant enough of an impact - and is it 'tasteful'? These things are at least somewhat subjective obviously. What I would suggest you focus on instead is in finding bugs - your help in finding the bug where I (ugh) set a pointer to an error value in a case where, if you were unlucky, it might be dereferenced - was a really helpful contribution, as you can tell from how quickly we approved it and arranged for backporting. I'd say this ought to be your focus. For instance [0] was a horrid mistake on my part, and ripe for being discovered. Having a second pair of eyes checking for this kind of thing would be very useful, and discovering this kind of bug as early as possible would be hugely valued. So yeah TL;DR my advice is at the moment - focus on finding bugs above all else :) Cheers, Lorenzo [0]:https://lore.kernel.org/linux-mm/20241206215229.244413-1-lorenzo.stoakes@oracle.com/ > > > -- > Wei Yang > Help you, Help me
On Tue, Dec 3, 2024 at 7:05 PM Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote: > Right now fs/exec.c invokes expand_downwards(), an otherwise internal > implementation detail of the VMA logic in order to ensure that an arg page > can be obtained by get_user_pages_remote(). > > In order to be able to move the stack expansion logic into mm/vma.c in > order to make it available to userland testing we need to find an > alternative approach here. > > We do so by providing the mmap_read_lock_maybe_expand() function which also > helpfully documents what get_arg_page() is doing here and adds an > additional check against VM_GROWSDOWN to make explicit that the stack > expansion logic is only invoked when the VMA is indeed a downward-growing > stack. > > This allows expand_downwards() to become a static function. > > Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be > currently user-visible in any way, that is place within an rmap or VMA > tree. It must be a newly allocated VMA. > > This is the case when exec invokes this function. > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > --- > fs/exec.c | 14 +++--------- > include/linux/mm.h | 5 ++--- > mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- > 3 files changed, 58 insertions(+), 15 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index 98cb7ba9983c..1e1f79c514de 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, > /* > * Avoid relying on expanding the stack down in GUP (which > * does not work for STACK_GROWSUP anyway), and just do it > - * by hand ahead of time. > + * ahead of time. > */ > - if (write && pos < vma->vm_start) { > - mmap_write_lock(mm); > - ret = expand_downwards(vma, pos); > - if (unlikely(ret < 0)) { > - mmap_write_unlock(mm); > - return NULL; > - } > - mmap_write_downgrade(mm); > - } else > - mmap_read_lock(mm); > + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) > + return NULL; [...] > +/* > + * Obtain a read lock on mm->mmap_lock, if the specified address is below the > + * start of the VMA, the intent is to perform a write, and it is a > + * downward-growing stack, then attempt to expand the stack to contain it. > + * > + * This function is intended only for obtaining an argument page from an ELF > + * image, and is almost certainly NOT what you want to use for any other > + * purpose. > + * > + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the > + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a > + * new VMA being mapped. > + * > + * The function assumes that addr is either contained within the VMA or below > + * it, and makes no attempt to validate this value beyond that. > + * > + * Returns true if the read lock was obtained and a stack was perhaps expanded, > + * false if the stack expansion failed. > + * > + * On stack expansion the function temporarily acquires an mmap write lock > + * before downgrading it. > + */ > +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, > + struct vm_area_struct *new_vma, > + unsigned long addr, bool write) > +{ > + if (!write || addr >= new_vma->vm_start) { > + mmap_read_lock(mm); > + return true; > + } > + > + if (!(new_vma->vm_flags & VM_GROWSDOWN)) > + return false; > + > + mmap_write_lock(mm); > + if (expand_downwards(new_vma, addr)) { > + mmap_write_unlock(mm); > + return false; > + } > + > + mmap_write_downgrade(mm); > + return true; > +} Random thought: For write==1, this looks a bit like lock_mm_and_find_vma(mm, addr, NULL), which needs similar stack expansion logic for handling userspace faults. But it's for a sufficiently different situation that maybe it makes sense to keep it like you did it, as a separate function...
On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote: > Right now fs/exec.c invokes expand_downwards(), an otherwise internal > implementation detail of the VMA logic in order to ensure that an arg page > can be obtained by get_user_pages_remote(). I think this is already in -next, but yeah, this looks good. It does mix some logic changes, but it's pretty minor. > - if (write && pos < vma->vm_start) { > - mmap_write_lock(mm); > - ret = expand_downwards(vma, pos); > - if (unlikely(ret < 0)) { > - mmap_write_unlock(mm); > - return NULL; > - } > - mmap_write_downgrade(mm); > - } else > - mmap_read_lock(mm); > + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) > + return NULL; > > [...] > + if (!write || addr >= new_vma->vm_start) { > + mmap_read_lock(mm); > + return true; > + } > + > + if (!(new_vma->vm_flags & VM_GROWSDOWN)) > + return false; The VM_GROWSDOWN check is moved around a bit. It seems okay, though. > + > + mmap_write_lock(mm); > + if (expand_downwards(new_vma, addr)) { > + mmap_write_unlock(mm); > + return false; > + } > + > + mmap_write_downgrade(mm); > + return true; I actually find this much more readable now since it follows the more common "bail out early" coding style. Acked-by: Kees Cook <kees@kernel.org> Thanks! -Kees
diff --git a/fs/exec.c b/fs/exec.c index 98cb7ba9983c..1e1f79c514de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it - * by hand ahead of time. + * ahead of time. */ - if (write && pos < vma->vm_start) { - mmap_write_lock(mm); - ret = expand_downwards(vma, pos); - if (unlikely(ret < 0)) { - mmap_write_unlock(mm); - return NULL; - } - mmap_write_downgrade(mm); - } else - mmap_read_lock(mm); + if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) + return NULL; /* * We are doing an exec(). 'current' is the process diff --git a/include/linux/mm.h b/include/linux/mm.h index 4eb8e62d5c67..48312a934454 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, @@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap; int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); -/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address); - /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, diff --git a/mm/mmap.c b/mm/mmap.c index f053de1d6fae..4df38d3717ff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ -int expand_downwards(struct vm_area_struct *vma, unsigned long address) +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; @@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) /* Shrink the vma to just the new range */ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff); } + +#ifdef CONFIG_MMU +/* + * Obtain a read lock on mm->mmap_lock, if the specified address is below the + * start of the VMA, the intent is to perform a write, and it is a + * downward-growing stack, then attempt to expand the stack to contain it. + * + * This function is intended only for obtaining an argument page from an ELF + * image, and is almost certainly NOT what you want to use for any other + * purpose. + * + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a + * new VMA being mapped. + * + * The function assumes that addr is either contained within the VMA or below + * it, and makes no attempt to validate this value beyond that. + * + * Returns true if the read lock was obtained and a stack was perhaps expanded, + * false if the stack expansion failed. + * + * On stack expansion the function temporarily acquires an mmap write lock + * before downgrading it. + */ +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, + struct vm_area_struct *new_vma, + unsigned long addr, bool write) +{ + if (!write || addr >= new_vma->vm_start) { + mmap_read_lock(mm); + return true; + } + + if (!(new_vma->vm_flags & VM_GROWSDOWN)) + return false; + + mmap_write_lock(mm); + if (expand_downwards(new_vma, addr)) { + mmap_write_unlock(mm); + return false; + } + + mmap_write_downgrade(mm); + return true; +} +#else +bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, bool write) +{ + return false; +} +#endif
Right now fs/exec.c invokes expand_downwards(), an otherwise internal implementation detail of the VMA logic in order to ensure that an arg page can be obtained by get_user_pages_remote(). In order to be able to move the stack expansion logic into mm/vma.c in order to make it available to userland testing we need to find an alternative approach here. We do so by providing the mmap_read_lock_maybe_expand() function which also helpfully documents what get_arg_page() is doing here and adds an additional check against VM_GROWSDOWN to make explicit that the stack expansion logic is only invoked when the VMA is indeed a downward-growing stack. This allows expand_downwards() to become a static function. Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be currently user-visible in any way, that is place within an rmap or VMA tree. It must be a newly allocated VMA. This is the case when exec invokes this function. Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> --- fs/exec.c | 14 +++--------- include/linux/mm.h | 5 ++--- mm/mmap.c | 54 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 15 deletions(-)