diff mbox series

[3/5] mm: abstract get_arg_page() stack expansion and mmap read lock

Message ID 5295d1c70c58e6aa63d14be68d4e1de9fa1c8e6d.1733248985.git.lorenzo.stoakes@oracle.com (mailing list archive)
State New
Headers show
Series mm/vma: make more mmap logic userland testable | expand

Commit Message

Lorenzo Stoakes Dec. 3, 2024, 6:05 p.m. UTC
Right now fs/exec.c invokes expand_downwards(), an otherwise internal
implementation detail of the VMA logic in order to ensure that an arg page
can be obtained by get_user_pages_remote().

In order to be able to move the stack expansion logic into mm/vma.c in
order to make it available to userland testing we need to find an
alternative approach here.

We do so by providing the mmap_read_lock_maybe_expand() function which also
helpfully documents what get_arg_page() is doing here and adds an
additional check against VM_GROWSDOWN to make explicit that the stack
expansion logic is only invoked when the VMA is indeed a downward-growing
stack.

This allows expand_downwards() to become a static function.

Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be
currently user-visible in any way, that is place within an rmap or VMA
tree. It must be a newly allocated VMA.

This is the case when exec invokes this function.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 fs/exec.c          | 14 +++---------
 include/linux/mm.h |  5 ++---
 mm/mmap.c          | 54 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 58 insertions(+), 15 deletions(-)

Comments

Wei Yang Dec. 5, 2024, 12:18 a.m. UTC | #1
On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote:
>Right now fs/exec.c invokes expand_downwards(), an otherwise internal
>implementation detail of the VMA logic in order to ensure that an arg page
>can be obtained by get_user_pages_remote().
>
>In order to be able to move the stack expansion logic into mm/vma.c in
>order to make it available to userland testing we need to find an

Looks the second "in order" is not necessary.

Not a native speaker, just my personal feeling.

>alternative approach here.
>
>We do so by providing the mmap_read_lock_maybe_expand() function which also
>helpfully documents what get_arg_page() is doing here and adds an
>additional check against VM_GROWSDOWN to make explicit that the stack
>expansion logic is only invoked when the VMA is indeed a downward-growing
>stack.
>
>This allows expand_downwards() to become a static function.
>
>Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be
>currently user-visible in any way, that is place within an rmap or VMA
>tree. It must be a newly allocated VMA.
>
>This is the case when exec invokes this function.
>
>Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>---
> fs/exec.c          | 14 +++---------
> include/linux/mm.h |  5 ++---
> mm/mmap.c          | 54 +++++++++++++++++++++++++++++++++++++++++++++-
> 3 files changed, 58 insertions(+), 15 deletions(-)
>
>diff --git a/fs/exec.c b/fs/exec.c
>index 98cb7ba9983c..1e1f79c514de 100644
>--- a/fs/exec.c
>+++ b/fs/exec.c
>@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
> 	/*
> 	 * Avoid relying on expanding the stack down in GUP (which
> 	 * does not work for STACK_GROWSUP anyway), and just do it
>-	 * by hand ahead of time.
>+	 * ahead of time.
> 	 */
>-	if (write && pos < vma->vm_start) {
>-		mmap_write_lock(mm);
>-		ret = expand_downwards(vma, pos);
>-		if (unlikely(ret < 0)) {
>-			mmap_write_unlock(mm);
>-			return NULL;
>-		}
>-		mmap_write_downgrade(mm);
>-	} else
>-		mmap_read_lock(mm);
>+	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
>+		return NULL;
> 
> 	/*
> 	 * We are doing an exec().  'current' is the process
>diff --git a/include/linux/mm.h b/include/linux/mm.h
>index 4eb8e62d5c67..48312a934454 100644
>--- a/include/linux/mm.h
>+++ b/include/linux/mm.h
>@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi
> extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> extern void exit_mmap(struct mm_struct *);
> int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
>+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
>+				 unsigned long addr, bool write);
> 
> static inline int check_data_rlimit(unsigned long rlim,
> 				    unsigned long new,
>@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap;
> int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
> struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
> 
>-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
>-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
>-
> /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
> extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
>diff --git a/mm/mmap.c b/mm/mmap.c
>index f053de1d6fae..4df38d3717ff 100644
>--- a/mm/mmap.c
>+++ b/mm/mmap.c
>@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  * vma is the first one with address < vma->vm_start.  Have to extend vma.
>  * mmap_lock held for writing.
>  */
>-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>+static int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> {
> 	struct mm_struct *mm = vma->vm_mm;
> 	struct vm_area_struct *prev;
>@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> 	/* Shrink the vma to just the new range */
> 	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> }
>+
>+#ifdef CONFIG_MMU
>+/*
>+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
>+ * start of the VMA, the intent is to perform a write, and it is a
>+ * downward-growing stack, then attempt to expand the stack to contain it.
>+ *
>+ * This function is intended only for obtaining an argument page from an ELF
>+ * image, and is almost certainly NOT what you want to use for any other
>+ * purpose.
>+ *
>+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
>+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
>+ * new VMA being mapped.
>+ *
>+ * The function assumes that addr is either contained within the VMA or below
>+ * it, and makes no attempt to validate this value beyond that.
>+ *
>+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
>+ * false if the stack expansion failed.
>+ *
>+ * On stack expansion the function temporarily acquires an mmap write lock
>+ * before downgrading it.
>+ */
>+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
>+				 struct vm_area_struct *new_vma,
>+				 unsigned long addr, bool write)
>+{
>+	if (!write || addr >= new_vma->vm_start) {
>+		mmap_read_lock(mm);
>+		return true;
>+	}
>+
>+	if (!(new_vma->vm_flags & VM_GROWSDOWN))
>+		return false;
>+

In expand_downwards() we have this checked.

Maybe we just leave this done in one place is enough?

>+	mmap_write_lock(mm);
>+	if (expand_downwards(new_vma, addr)) {
>+		mmap_write_unlock(mm);
>+		return false;
>+	}
>+
>+	mmap_write_downgrade(mm);
>+	return true;
>+}
>+#else
>+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
>+				 unsigned long addr, bool write)
>+{
>+	return false;
>+}
>+#endif
>-- 
>2.47.1
>
Lorenzo Stoakes Dec. 5, 2024, 7:01 a.m. UTC | #2
On Thu, Dec 05, 2024 at 12:18:19AM +0000, Wei Yang wrote:
> On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote:
> >Right now fs/exec.c invokes expand_downwards(), an otherwise internal
> >implementation detail of the VMA logic in order to ensure that an arg page
> >can be obtained by get_user_pages_remote().
> >
> >In order to be able to move the stack expansion logic into mm/vma.c in
> >order to make it available to userland testing we need to find an
>
> Looks the second "in order" is not necessary.
>
> Not a native speaker, just my personal feeling.
>
> >alternative approach here.
> >
> >We do so by providing the mmap_read_lock_maybe_expand() function which also
> >helpfully documents what get_arg_page() is doing here and adds an
> >additional check against VM_GROWSDOWN to make explicit that the stack
> >expansion logic is only invoked when the VMA is indeed a downward-growing
> >stack.
> >
> >This allows expand_downwards() to become a static function.
> >
> >Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be
> >currently user-visible in any way, that is place within an rmap or VMA
> >tree. It must be a newly allocated VMA.
> >
> >This is the case when exec invokes this function.
> >
> >Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> >---
> > fs/exec.c          | 14 +++---------
> > include/linux/mm.h |  5 ++---
> > mm/mmap.c          | 54 +++++++++++++++++++++++++++++++++++++++++++++-
> > 3 files changed, 58 insertions(+), 15 deletions(-)
> >
> >diff --git a/fs/exec.c b/fs/exec.c
> >index 98cb7ba9983c..1e1f79c514de 100644
> >--- a/fs/exec.c
> >+++ b/fs/exec.c
> >@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
> > 	/*
> > 	 * Avoid relying on expanding the stack down in GUP (which
> > 	 * does not work for STACK_GROWSUP anyway), and just do it
> >-	 * by hand ahead of time.
> >+	 * ahead of time.
> > 	 */
> >-	if (write && pos < vma->vm_start) {
> >-		mmap_write_lock(mm);
> >-		ret = expand_downwards(vma, pos);
> >-		if (unlikely(ret < 0)) {
> >-			mmap_write_unlock(mm);
> >-			return NULL;
> >-		}
> >-		mmap_write_downgrade(mm);
> >-	} else
> >-		mmap_read_lock(mm);
> >+	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
> >+		return NULL;
> >
> > 	/*
> > 	 * We are doing an exec().  'current' is the process
> >diff --git a/include/linux/mm.h b/include/linux/mm.h
> >index 4eb8e62d5c67..48312a934454 100644
> >--- a/include/linux/mm.h
> >+++ b/include/linux/mm.h
> >@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi
> > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> > extern void exit_mmap(struct mm_struct *);
> > int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> >+				 unsigned long addr, bool write);
> >
> > static inline int check_data_rlimit(unsigned long rlim,
> > 				    unsigned long new,
> >@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap;
> > int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
> > struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
> >
> >-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
> >-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
> >-
> > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
> > extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
> >diff --git a/mm/mmap.c b/mm/mmap.c
> >index f053de1d6fae..4df38d3717ff 100644
> >--- a/mm/mmap.c
> >+++ b/mm/mmap.c
> >@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> >  * vma is the first one with address < vma->vm_start.  Have to extend vma.
> >  * mmap_lock held for writing.
> >  */
> >-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> >+static int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > {
> > 	struct mm_struct *mm = vma->vm_mm;
> > 	struct vm_area_struct *prev;
> >@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > 	/* Shrink the vma to just the new range */
> > 	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > }
> >+
> >+#ifdef CONFIG_MMU
> >+/*
> >+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> >+ * start of the VMA, the intent is to perform a write, and it is a
> >+ * downward-growing stack, then attempt to expand the stack to contain it.
> >+ *
> >+ * This function is intended only for obtaining an argument page from an ELF
> >+ * image, and is almost certainly NOT what you want to use for any other
> >+ * purpose.
> >+ *
> >+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
> >+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
> >+ * new VMA being mapped.
> >+ *
> >+ * The function assumes that addr is either contained within the VMA or below
> >+ * it, and makes no attempt to validate this value beyond that.
> >+ *
> >+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
> >+ * false if the stack expansion failed.
> >+ *
> >+ * On stack expansion the function temporarily acquires an mmap write lock
> >+ * before downgrading it.
> >+ */
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> >+				 struct vm_area_struct *new_vma,
> >+				 unsigned long addr, bool write)
> >+{
> >+	if (!write || addr >= new_vma->vm_start) {
> >+		mmap_read_lock(mm);
> >+		return true;
> >+	}
> >+
> >+	if (!(new_vma->vm_flags & VM_GROWSDOWN))
> >+		return false;
> >+
>
> In expand_downwards() we have this checked.
>
> Maybe we just leave this done in one place is enough?

Wei, I feel like I have repeated myself about 'mathematically smallest
code' rather too many times at this stage. Doing an unsolicited drive-by
review applying this concept, which I have roundly and clearly rejected, is
not appreciated.

At any rate, we are checking this _before the mmap lock is acquired_. It is
also self-documenting.

Please try to take on board the point that there are many factors when it
comes to writing kernel code, aversion to possibly generated branches being
only one of them.

>
> >+	mmap_write_lock(mm);
> >+	if (expand_downwards(new_vma, addr)) {
> >+		mmap_write_unlock(mm);
> >+		return false;
> >+	}
> >+
> >+	mmap_write_downgrade(mm);
> >+	return true;
> >+}
> >+#else
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> >+				 unsigned long addr, bool write)
> >+{
> >+	return false;
> >+}
> >+#endif
> >--
> >2.47.1
> >
>
> --
> Wei Yang
> Help you, Help me
Lorenzo Stoakes Dec. 5, 2024, 7:06 a.m. UTC | #3
On Thu, Dec 05, 2024 at 12:18:19AM +0000, Wei Yang wrote:
> On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote:
> >Right now fs/exec.c invokes expand_downwards(), an otherwise internal
> >implementation detail of the VMA logic in order to ensure that an arg page
> >can be obtained by get_user_pages_remote().
> >
> >In order to be able to move the stack expansion logic into mm/vma.c in
> >order to make it available to userland testing we need to find an
>
> Looks the second "in order" is not necessary.
>
> Not a native speaker, just my personal feeling.
>
> >alternative approach here.

Sorry missed this one.

You're right this is clunky (non-native speakers often find this better
than native speakers to whom clunky turn of phrase can be more easily
overlooked I imagine).

Second 'in order to' should be 'to' really, but I'm not sure this is
important enough to take pains to address, will fix if a respin is
otherwise needed.

> >
> >We do so by providing the mmap_read_lock_maybe_expand() function which also
> >helpfully documents what get_arg_page() is doing here and adds an
> >additional check against VM_GROWSDOWN to make explicit that the stack
> >expansion logic is only invoked when the VMA is indeed a downward-growing
> >stack.
> >
> >This allows expand_downwards() to become a static function.
> >
> >Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be
> >currently user-visible in any way, that is place within an rmap or VMA
> >tree. It must be a newly allocated VMA.
> >
> >This is the case when exec invokes this function.
> >
> >Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> >---
> > fs/exec.c          | 14 +++---------
> > include/linux/mm.h |  5 ++---
> > mm/mmap.c          | 54 +++++++++++++++++++++++++++++++++++++++++++++-
> > 3 files changed, 58 insertions(+), 15 deletions(-)
> >
> >diff --git a/fs/exec.c b/fs/exec.c
> >index 98cb7ba9983c..1e1f79c514de 100644
> >--- a/fs/exec.c
> >+++ b/fs/exec.c
> >@@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
> > 	/*
> > 	 * Avoid relying on expanding the stack down in GUP (which
> > 	 * does not work for STACK_GROWSUP anyway), and just do it
> >-	 * by hand ahead of time.
> >+	 * ahead of time.
> > 	 */
> >-	if (write && pos < vma->vm_start) {
> >-		mmap_write_lock(mm);
> >-		ret = expand_downwards(vma, pos);
> >-		if (unlikely(ret < 0)) {
> >-			mmap_write_unlock(mm);
> >-			return NULL;
> >-		}
> >-		mmap_write_downgrade(mm);
> >-	} else
> >-		mmap_read_lock(mm);
> >+	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
> >+		return NULL;
> >
> > 	/*
> > 	 * We are doing an exec().  'current' is the process
> >diff --git a/include/linux/mm.h b/include/linux/mm.h
> >index 4eb8e62d5c67..48312a934454 100644
> >--- a/include/linux/mm.h
> >+++ b/include/linux/mm.h
> >@@ -3313,6 +3313,8 @@ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi
> > extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
> > extern void exit_mmap(struct mm_struct *);
> > int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> >+				 unsigned long addr, bool write);
> >
> > static inline int check_data_rlimit(unsigned long rlim,
> > 				    unsigned long new,
> >@@ -3426,9 +3428,6 @@ extern unsigned long stack_guard_gap;
> > int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
> > struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
> >
> >-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
> >-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
> >-
> > /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
> > extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
> > extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
> >diff --git a/mm/mmap.c b/mm/mmap.c
> >index f053de1d6fae..4df38d3717ff 100644
> >--- a/mm/mmap.c
> >+++ b/mm/mmap.c
> >@@ -1009,7 +1009,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> >  * vma is the first one with address < vma->vm_start.  Have to extend vma.
> >  * mmap_lock held for writing.
> >  */
> >-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> >+static int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > {
> > 	struct mm_struct *mm = vma->vm_mm;
> > 	struct vm_area_struct *prev;
> >@@ -1940,3 +1940,55 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
> > 	/* Shrink the vma to just the new range */
> > 	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
> > }
> >+
> >+#ifdef CONFIG_MMU
> >+/*
> >+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> >+ * start of the VMA, the intent is to perform a write, and it is a
> >+ * downward-growing stack, then attempt to expand the stack to contain it.
> >+ *
> >+ * This function is intended only for obtaining an argument page from an ELF
> >+ * image, and is almost certainly NOT what you want to use for any other
> >+ * purpose.
> >+ *
> >+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
> >+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
> >+ * new VMA being mapped.
> >+ *
> >+ * The function assumes that addr is either contained within the VMA or below
> >+ * it, and makes no attempt to validate this value beyond that.
> >+ *
> >+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
> >+ * false if the stack expansion failed.
> >+ *
> >+ * On stack expansion the function temporarily acquires an mmap write lock
> >+ * before downgrading it.
> >+ */
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> >+				 struct vm_area_struct *new_vma,
> >+				 unsigned long addr, bool write)
> >+{
> >+	if (!write || addr >= new_vma->vm_start) {
> >+		mmap_read_lock(mm);
> >+		return true;
> >+	}
> >+
> >+	if (!(new_vma->vm_flags & VM_GROWSDOWN))
> >+		return false;
> >+
>
> In expand_downwards() we have this checked.
>
> Maybe we just leave this done in one place is enough?
>
> >+	mmap_write_lock(mm);
> >+	if (expand_downwards(new_vma, addr)) {
> >+		mmap_write_unlock(mm);
> >+		return false;
> >+	}
> >+
> >+	mmap_write_downgrade(mm);
> >+	return true;
> >+}
> >+#else
> >+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
> >+				 unsigned long addr, bool write)
> >+{
> >+	return false;
> >+}
> >+#endif
> >--
> >2.47.1
> >
>
> --
> Wei Yang
> Help you, Help me
Wei Yang Dec. 8, 2024, 11:27 a.m. UTC | #4
On Thu, Dec 05, 2024 at 07:01:14AM +0000, Lorenzo Stoakes wrote:
[...]
>>
>> Maybe we just leave this done in one place is enough?
>
>Wei, I feel like I have repeated myself about 'mathematically smallest
>code' rather too many times at this stage. Doing an unsolicited drive-by
>review applying this concept, which I have roundly and clearly rejected, is
>not appreciated.
>

Hi, Lorenzo

I would apologize for introducing this un-pleasant mail. Would be more
thoughtful next time.

>At any rate, we are checking this _before the mmap lock is acquired_. It is
>also self-documenting.
>
>Please try to take on board the point that there are many factors when it
>comes to writing kernel code, aversion to possibly generated branches being
>only one of them.
>

Thanks for this suggestion.

I am trying to be as professional as you are. In case you have other
suggestions, they are welcome.
Lorenzo Stoakes Dec. 9, 2024, 10:47 a.m. UTC | #5
On Sun, Dec 08, 2024 at 11:27:06AM +0000, Wei Yang wrote:
> On Thu, Dec 05, 2024 at 07:01:14AM +0000, Lorenzo Stoakes wrote:
> [...]
> >>
> >> Maybe we just leave this done in one place is enough?
> >
> >Wei, I feel like I have repeated myself about 'mathematically smallest
> >code' rather too many times at this stage. Doing an unsolicited drive-by
> >review applying this concept, which I have roundly and clearly rejected, is
> >not appreciated.
> >
>
> Hi, Lorenzo
>
> I would apologize for introducing this un-pleasant mail. Would be more
> thoughtful next time.

It wasn't unpleasant, don't worry! :) I'm just trying to be as clear as I
can about this so as to avoid you spending time on things that aren't
useful.

On occasion I think, for clarity, it's important to be firm - otherwise if
I were receptive even on things that I thought not worthwhile - you'd end
up wasting your time doing work that might end up not being used.

>
> >At any rate, we are checking this _before the mmap lock is acquired_. It is
> >also self-documenting.
> >
> >Please try to take on board the point that there are many factors when it
> >comes to writing kernel code, aversion to possibly generated branches being
> >only one of them.
> >
>
> Thanks for this suggestion.
>
> I am trying to be as professional as you are. In case you have other
> suggestions, they are welcome.

Thanks, what I would say is that simply observing that we might duplicate
some logic in a non-harmful way does not necessarily indicate that this
should be changed.

Obviously if you can evidence a _statistically significant_ performance
impact then OF COURSE you should report something like this and send a
patch for it (along side the evidence of the perf regression).

But in general, if you feel the need to refactor just for the sake of
eliminating branches or grouping code like this, it isn't helpful or
useful.

Refactorings can be very useful _in general_ (I have done a lot of work on
things like this myself obviously), but it's important to assess the RoI -
is the churn worth the benefit - does it make significant enough of an
impact - and is it 'tasteful'?

These things are at least somewhat subjective obviously.

What I would suggest you focus on instead is in finding bugs - your help in
finding the bug where I (ugh) set a pointer to an error value in a case
where, if you were unlucky, it might be dereferenced - was a really helpful
contribution, as you can tell from how quickly we approved it and arranged
for backporting.

I'd say this ought to be your focus. For instance [0] was a horrid mistake
on my part, and ripe for being discovered. Having a second pair of eyes
checking for this kind of thing would be very useful, and discovering this
kind of bug as early as possible would be hugely valued.

So yeah TL;DR my advice is at the moment - focus on finding bugs above all
else :)

Cheers, Lorenzo

[0]:https://lore.kernel.org/linux-mm/20241206215229.244413-1-lorenzo.stoakes@oracle.com/

>
>
> --
> Wei Yang
> Help you, Help me
Jann Horn Dec. 10, 2024, 5:14 p.m. UTC | #6
On Tue, Dec 3, 2024 at 7:05 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
> Right now fs/exec.c invokes expand_downwards(), an otherwise internal
> implementation detail of the VMA logic in order to ensure that an arg page
> can be obtained by get_user_pages_remote().
>
> In order to be able to move the stack expansion logic into mm/vma.c in
> order to make it available to userland testing we need to find an
> alternative approach here.
>
> We do so by providing the mmap_read_lock_maybe_expand() function which also
> helpfully documents what get_arg_page() is doing here and adds an
> additional check against VM_GROWSDOWN to make explicit that the stack
> expansion logic is only invoked when the VMA is indeed a downward-growing
> stack.
>
> This allows expand_downwards() to become a static function.
>
> Importantly, the VMA referenced by mmap_read_maybe_expand() must NOT be
> currently user-visible in any way, that is place within an rmap or VMA
> tree. It must be a newly allocated VMA.
>
> This is the case when exec invokes this function.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
>  fs/exec.c          | 14 +++---------
>  include/linux/mm.h |  5 ++---
>  mm/mmap.c          | 54 +++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 58 insertions(+), 15 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 98cb7ba9983c..1e1f79c514de 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -205,18 +205,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
>         /*
>          * Avoid relying on expanding the stack down in GUP (which
>          * does not work for STACK_GROWSUP anyway), and just do it
> -        * by hand ahead of time.
> +        * ahead of time.
>          */
> -       if (write && pos < vma->vm_start) {
> -               mmap_write_lock(mm);
> -               ret = expand_downwards(vma, pos);
> -               if (unlikely(ret < 0)) {
> -                       mmap_write_unlock(mm);
> -                       return NULL;
> -               }
> -               mmap_write_downgrade(mm);
> -       } else
> -               mmap_read_lock(mm);
> +       if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
> +               return NULL;
[...]
> +/*
> + * Obtain a read lock on mm->mmap_lock, if the specified address is below the
> + * start of the VMA, the intent is to perform a write, and it is a
> + * downward-growing stack, then attempt to expand the stack to contain it.
> + *
> + * This function is intended only for obtaining an argument page from an ELF
> + * image, and is almost certainly NOT what you want to use for any other
> + * purpose.
> + *
> + * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
> + * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
> + * new VMA being mapped.
> + *
> + * The function assumes that addr is either contained within the VMA or below
> + * it, and makes no attempt to validate this value beyond that.
> + *
> + * Returns true if the read lock was obtained and a stack was perhaps expanded,
> + * false if the stack expansion failed.
> + *
> + * On stack expansion the function temporarily acquires an mmap write lock
> + * before downgrading it.
> + */
> +bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
> +                                struct vm_area_struct *new_vma,
> +                                unsigned long addr, bool write)
> +{
> +       if (!write || addr >= new_vma->vm_start) {
> +               mmap_read_lock(mm);
> +               return true;
> +       }
> +
> +       if (!(new_vma->vm_flags & VM_GROWSDOWN))
> +               return false;
> +
> +       mmap_write_lock(mm);
> +       if (expand_downwards(new_vma, addr)) {
> +               mmap_write_unlock(mm);
> +               return false;
> +       }
> +
> +       mmap_write_downgrade(mm);
> +       return true;
> +}

Random thought: For write==1, this looks a bit like
lock_mm_and_find_vma(mm, addr, NULL), which needs similar stack
expansion logic for handling userspace faults. But it's for a
sufficiently different situation that maybe it makes sense to keep it
like you did it, as a separate function...
Kees Cook Dec. 14, 2024, 1:05 a.m. UTC | #7
On Tue, Dec 03, 2024 at 06:05:10PM +0000, Lorenzo Stoakes wrote:
> Right now fs/exec.c invokes expand_downwards(), an otherwise internal
> implementation detail of the VMA logic in order to ensure that an arg page
> can be obtained by get_user_pages_remote().

I think this is already in -next, but yeah, this looks good. It does mix
some logic changes, but it's pretty minor.

> -	if (write && pos < vma->vm_start) {
> -		mmap_write_lock(mm);
> -		ret = expand_downwards(vma, pos);
> -		if (unlikely(ret < 0)) {
> -			mmap_write_unlock(mm);
> -			return NULL;
> -		}
> -		mmap_write_downgrade(mm);
> -	} else
> -		mmap_read_lock(mm);
> +	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
> +		return NULL;
>  
> [...]
> +	if (!write || addr >= new_vma->vm_start) {
> +		mmap_read_lock(mm);
> +		return true;
> +	}
> +
> +	if (!(new_vma->vm_flags & VM_GROWSDOWN))
> +		return false;

The VM_GROWSDOWN check is moved around a bit. It seems okay, though.

> +
> +	mmap_write_lock(mm);
> +	if (expand_downwards(new_vma, addr)) {
> +		mmap_write_unlock(mm);
> +		return false;
> +	}
> +
> +	mmap_write_downgrade(mm);
> +	return true;

I actually find this much more readable now since it follows the more
common "bail out early" coding style.

Acked-by: Kees Cook <kees@kernel.org>

Thanks!

-Kees
diff mbox series

Patch

diff --git a/fs/exec.c b/fs/exec.c
index 98cb7ba9983c..1e1f79c514de 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -205,18 +205,10 @@  static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
 	/*
 	 * Avoid relying on expanding the stack down in GUP (which
 	 * does not work for STACK_GROWSUP anyway), and just do it
-	 * by hand ahead of time.
+	 * ahead of time.
 	 */
-	if (write && pos < vma->vm_start) {
-		mmap_write_lock(mm);
-		ret = expand_downwards(vma, pos);
-		if (unlikely(ret < 0)) {
-			mmap_write_unlock(mm);
-			return NULL;
-		}
-		mmap_write_downgrade(mm);
-	} else
-		mmap_read_lock(mm);
+	if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
+		return NULL;
 
 	/*
 	 * We are doing an exec().  'current' is the process
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4eb8e62d5c67..48312a934454 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3313,6 +3313,8 @@  extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admi
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void exit_mmap(struct mm_struct *);
 int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+				 unsigned long addr, bool write);
 
 static inline int check_data_rlimit(unsigned long rlim,
 				    unsigned long new,
@@ -3426,9 +3428,6 @@  extern unsigned long stack_guard_gap;
 int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
 struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);
 
-/* CONFIG_STACK_GROWSUP still needs to grow downwards at some places */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address);
-
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
diff --git a/mm/mmap.c b/mm/mmap.c
index f053de1d6fae..4df38d3717ff 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1009,7 +1009,7 @@  static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
  * vma is the first one with address < vma->vm_start.  Have to extend vma.
  * mmap_lock held for writing.
  */
-int expand_downwards(struct vm_area_struct *vma, unsigned long address)
+static int expand_downwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct vm_area_struct *prev;
@@ -1940,3 +1940,55 @@  int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
 	/* Shrink the vma to just the new range */
 	return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
 }
+
+#ifdef CONFIG_MMU
+/*
+ * Obtain a read lock on mm->mmap_lock, if the specified address is below the
+ * start of the VMA, the intent is to perform a write, and it is a
+ * downward-growing stack, then attempt to expand the stack to contain it.
+ *
+ * This function is intended only for obtaining an argument page from an ELF
+ * image, and is almost certainly NOT what you want to use for any other
+ * purpose.
+ *
+ * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
+ * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
+ * new VMA being mapped.
+ *
+ * The function assumes that addr is either contained within the VMA or below
+ * it, and makes no attempt to validate this value beyond that.
+ *
+ * Returns true if the read lock was obtained and a stack was perhaps expanded,
+ * false if the stack expansion failed.
+ *
+ * On stack expansion the function temporarily acquires an mmap write lock
+ * before downgrading it.
+ */
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
+				 struct vm_area_struct *new_vma,
+				 unsigned long addr, bool write)
+{
+	if (!write || addr >= new_vma->vm_start) {
+		mmap_read_lock(mm);
+		return true;
+	}
+
+	if (!(new_vma->vm_flags & VM_GROWSDOWN))
+		return false;
+
+	mmap_write_lock(mm);
+	if (expand_downwards(new_vma, addr)) {
+		mmap_write_unlock(mm);
+		return false;
+	}
+
+	mmap_write_downgrade(mm);
+	return true;
+}
+#else
+bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
+				 unsigned long addr, bool write)
+{
+	return false;
+}
+#endif