Message ID | 20230822015501.791637-3-joel@joelfernandes.org (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Optimize mremap during mutual alignment within PMD | expand |
On Tue, Aug 22, 2023 at 01:54:55AM +0000, Joel Fernandes (Google) wrote: > For the stack move happening in shift_arg_pages(), the move is happening > within the same VMA which spans the old and new ranges. > > In case the aligned address happens to fall within that VMA, allow such > moves and don't abort the optimization. > > In the mremap case, we cannot allow any such moves as will end up > destroying some part of the mapping (either the source of the move, or > part of the existing mapping). So just avoid it for mremap. > > Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org> > --- > fs/exec.c | 2 +- > include/linux/mm.h | 2 +- > mm/mremap.c | 29 +++++++++++++++-------------- > 3 files changed, 17 insertions(+), 16 deletions(-) > > diff --git a/fs/exec.c b/fs/exec.c > index 1a827d55ba94..244925307958 100644 > --- a/fs/exec.c > +++ b/fs/exec.c > @@ -712,7 +712,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) > * process cleanup to remove whatever mess we made. > */ > if (length != move_page_tables(vma, old_start, > - vma, new_start, length, false)) > + vma, new_start, length, false, true)) > return -ENOMEM; > > lru_add_drain(); > diff --git a/include/linux/mm.h b/include/linux/mm.h > index 406ab9ea818f..e635d1fc73b6 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -2458,7 +2458,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); > extern unsigned long move_page_tables(struct vm_area_struct *vma, > unsigned long old_addr, struct vm_area_struct *new_vma, > unsigned long new_addr, unsigned long len, > - bool need_rmap_locks); > + bool need_rmap_locks, bool for_stack); It's a nit, but it'd be nice to not have 'mystery meat' booleans foo(bar, baz, true, false, true); always ends up being a pain to track down. However I think probably something better than that (flags or wrapper functions) might be too much noise here so perhaps we can live with this! > > /* > * Flags used by change_protection(). For now we make it a bitmap so > diff --git a/mm/mremap.c b/mm/mremap.c > index 035fbf542a8f..06baa13bd2c8 100644 > --- a/mm/mremap.c > +++ b/mm/mremap.c > @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, > } > > /* > - * A helper to check if a previous mapping exists. Required for > - * move_page_tables() and realign_addr() to determine if a previous mapping > - * exists before we can do realignment optimizations. > + * A helper to check if aligning down is OK. The aligned address should fall > + * on *no mapping*. For the stack moving down, that's a special move within > + * the VMA that is created to span the source and destination of the move, > + * so we make an exception for it. > */ > static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, > - unsigned long mask) > + unsigned long mask, bool for_stack) > { > unsigned long addr_masked = addr_to_align & mask; > > @@ -504,7 +505,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali > * of the corresponding VMA, we can't align down or we will destroy part > * of the current mapping. > */ > - if (vma->vm_start != addr_to_align) > + if (!for_stack && vma->vm_start != addr_to_align) > return false; I'm a little confused by this exception, is it very specifically for the shift_arg_pages() case where can assume we are safe to just discard the lower portion of the stack? Wouldn't the find_vma_intersection() line below fail in this case? I may be missing something here :) > > /* > @@ -517,7 +518,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali > /* Opportunistically realign to specified boundary for faster copy. */ > static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, > unsigned long *new_addr, struct vm_area_struct *new_vma, > - unsigned long mask) > + unsigned long mask, bool for_stack) > { > /* Skip if the addresses are already aligned. */ > if ((*old_addr & ~mask) == 0) > @@ -528,8 +529,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old > return; > > /* Ensure realignment doesn't cause overlap with existing mappings. */ > - if (!can_align_down(old_vma, *old_addr, mask) || > - !can_align_down(new_vma, *new_addr, mask)) > + if (!can_align_down(old_vma, *old_addr, mask, for_stack) || > + !can_align_down(new_vma, *new_addr, mask, for_stack)) > return; > > *old_addr = *old_addr & mask; > @@ -539,7 +540,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old > unsigned long move_page_tables(struct vm_area_struct *vma, > unsigned long old_addr, struct vm_area_struct *new_vma, > unsigned long new_addr, unsigned long len, > - bool need_rmap_locks) > + bool need_rmap_locks, bool for_stack) > { > unsigned long extent, old_end; > struct mmu_notifier_range range; > @@ -559,9 +560,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, > * If possible, realign addresses to PMD boundary for faster copy. > * Only realign if the mremap copying hits a PMD boundary. > */ > - if ((vma != new_vma) > - && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) > - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); > + if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) > + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, > + for_stack); > > flush_cache_range(vma, old_addr, old_end); > mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, > @@ -708,7 +709,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, > } > > moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, > - need_rmap_locks); > + need_rmap_locks, false); > if (moved_len < old_len) { > err = -ENOMEM; > } else if (vma->vm_ops && vma->vm_ops->mremap) { > @@ -722,7 +723,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, > * and then proceed to unmap new area instead of old. > */ > move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, > - true); > + true, false); > vma = new_vma; > old_len = new_len; > old_addr = new_addr; > -- > 2.42.0.rc1.204.g551eb34607-goog >
On Sun, Aug 27, 2023 at 10:21:14AM +0100, Lorenzo Stoakes wrote: [..] > > > > /* > > * Flags used by change_protection(). For now we make it a bitmap so > > diff --git a/mm/mremap.c b/mm/mremap.c > > index 035fbf542a8f..06baa13bd2c8 100644 > > --- a/mm/mremap.c > > +++ b/mm/mremap.c > > @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, > > } > > > > /* > > - * A helper to check if a previous mapping exists. Required for > > - * move_page_tables() and realign_addr() to determine if a previous mapping > > - * exists before we can do realignment optimizations. > > + * A helper to check if aligning down is OK. The aligned address should fall > > + * on *no mapping*. For the stack moving down, that's a special move within > > + * the VMA that is created to span the source and destination of the move, > > + * so we make an exception for it. > > */ > > static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, > > - unsigned long mask) > > + unsigned long mask, bool for_stack) > > { > > unsigned long addr_masked = addr_to_align & mask; > > > > @@ -504,7 +505,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali > > * of the corresponding VMA, we can't align down or we will destroy part > > * of the current mapping. > > */ > > - if (vma->vm_start != addr_to_align) > > + if (!for_stack && vma->vm_start != addr_to_align) > > return false; > > I'm a little confused by this exception, is it very specifically for the > shift_arg_pages() case where can assume we are safe to just discard the > lower portion of the stack? > > Wouldn't the find_vma_intersection() line below fail in this case? I may be > missing something here :) I think you are right. In v4, this was not an issue as we did this: + if (!for_stack && vma->vm_start != addr_to_align) + return false; + + cur = find_vma_prev(vma->vm_mm, vma->vm_start, &prev); + if (WARN_ON_ONCE(cur != vma)) + return false; Which essentially means this patch is a NOOP in v5 for the stack case. So what we really want is the VMA previous to @vma and whether than subsumes the masked address. Should I just change it back to the v4 version then as above for both patch 1 and 2 and carry your review tags? This is also hard to test as it requires triggering the execve stack move case. Though it is not a bug (as it is essentially a NOOP), it still would be nice to test it. This is complicated by also the fact that mremap(2) itself does not allow overlapping moves. I could try to hardcode the unfavorable situation as I have done in the past to force that mremap warning. thanks, - Joel
On Mon, Aug 28, 2023 at 06:32:40PM +0000, Joel Fernandes wrote: > On Sun, Aug 27, 2023 at 10:21:14AM +0100, Lorenzo Stoakes wrote: > [..] > > > > > > /* > > > * Flags used by change_protection(). For now we make it a bitmap so > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > index 035fbf542a8f..06baa13bd2c8 100644 > > > --- a/mm/mremap.c > > > +++ b/mm/mremap.c > > > @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, > > > } > > > > > > /* > > > - * A helper to check if a previous mapping exists. Required for > > > - * move_page_tables() and realign_addr() to determine if a previous mapping > > > - * exists before we can do realignment optimizations. > > > + * A helper to check if aligning down is OK. The aligned address should fall > > > + * on *no mapping*. For the stack moving down, that's a special move within > > > + * the VMA that is created to span the source and destination of the move, > > > + * so we make an exception for it. > > > */ > > > static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, > > > - unsigned long mask) > > > + unsigned long mask, bool for_stack) > > > { > > > unsigned long addr_masked = addr_to_align & mask; > > > > > > @@ -504,7 +505,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali > > > * of the corresponding VMA, we can't align down or we will destroy part > > > * of the current mapping. > > > */ > > > - if (vma->vm_start != addr_to_align) > > > + if (!for_stack && vma->vm_start != addr_to_align) > > > return false; > > > > I'm a little confused by this exception, is it very specifically for the > > shift_arg_pages() case where can assume we are safe to just discard the > > lower portion of the stack? > > > > Wouldn't the find_vma_intersection() line below fail in this case? I may be > > missing something here :) > > I think you are right. In v4, this was not an issue as we did this: > > > + if (!for_stack && vma->vm_start != addr_to_align) > + return false; > + > + cur = find_vma_prev(vma->vm_mm, vma->vm_start, &prev); > + if (WARN_ON_ONCE(cur != vma)) > + return false; > > Which essentially means this patch is a NOOP in v5 for the stack case. > > So what we really want is the VMA previous to @vma and whether than subsumes > the masked address. > > Should I just change it back to the v4 version then as above for both patch 1 > and 2 and carry your review tags? You will not be surprised to hear that I'd rather not :) I think if we did revert to that approach it'd need rework anyway, so I'd ask for a respin w/o tag if we were to go down that road. HOWEVER let's first clarify what we want to check. My understand (please correct me if mistaken) is that there are two acceptable cases:- 1. !for_stack addr_masked addr_to_align | | v v . |-----| . <-must be empty-> | vma | . |-----| 2. for_stack addr_masked addr_to_align | | v v |----.-------------------.-----| | . vma . | |----.-------------------.-----| Meaning that there are only two cases that we should care about:- 1. !for_stack: addr_to_align == vma->vm_start and no other VMA exists between this and addr_masked 2. for_stack: addr_masked is in the same VMA as addr_to_align. In this case, the check can surely be:- return find_vma_intersection(vma->vm_mm, addr_masked, addr_to_align) == (for_stack ? vma : NULL); (maybe would be less ugly to actually assign the intersection value to a local var and check that) > > This is also hard to test as it requires triggering the execve stack move > case. Though it is not a bug (as it is essentially a NOOP), it still would be > nice to test it. This is complicated by also the fact that mremap(2) itself > does not allow overlapping moves. I could try to hardcode the unfavorable > situation as I have done in the past to force that mremap warning. I find this exception a bit confusing, why are we so adamant on performing the optimisation in this case when it makes the code uglier and is rather hard to understand? Does it really matter that much? I wonder whether it wouldn't be better to just drop that (unless you really felt strongly about it) for the patch set and then perhaps address it in a follow up? This may entirely be a product of my simply not entirely understanding this case so do forgive the probing, I just want to make sure we handle it correctly! > > thanks, > > - Joel >
On Mon, Aug 28, 2023 at 08:00:18PM +0100, Lorenzo Stoakes wrote: > On Mon, Aug 28, 2023 at 06:32:40PM +0000, Joel Fernandes wrote: > > On Sun, Aug 27, 2023 at 10:21:14AM +0100, Lorenzo Stoakes wrote: > > [..] > > > > > > > > /* > > > > * Flags used by change_protection(). For now we make it a bitmap so > > > > diff --git a/mm/mremap.c b/mm/mremap.c > > > > index 035fbf542a8f..06baa13bd2c8 100644 > > > > --- a/mm/mremap.c > > > > +++ b/mm/mremap.c > > > > @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, > > > > } > > > > > > > > /* > > > > - * A helper to check if a previous mapping exists. Required for > > > > - * move_page_tables() and realign_addr() to determine if a previous mapping > > > > - * exists before we can do realignment optimizations. > > > > + * A helper to check if aligning down is OK. The aligned address should fall > > > > + * on *no mapping*. For the stack moving down, that's a special move within > > > > + * the VMA that is created to span the source and destination of the move, > > > > + * so we make an exception for it. > > > > */ > > > > static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, > > > > - unsigned long mask) > > > > + unsigned long mask, bool for_stack) > > > > { > > > > unsigned long addr_masked = addr_to_align & mask; > > > > > > > > @@ -504,7 +505,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali > > > > * of the corresponding VMA, we can't align down or we will destroy part > > > > * of the current mapping. > > > > */ > > > > - if (vma->vm_start != addr_to_align) > > > > + if (!for_stack && vma->vm_start != addr_to_align) > > > > return false; > > > > > > I'm a little confused by this exception, is it very specifically for the > > > shift_arg_pages() case where can assume we are safe to just discard the > > > lower portion of the stack? > > > > > > Wouldn't the find_vma_intersection() line below fail in this case? I may be > > > missing something here :) > > > > I think you are right. In v4, this was not an issue as we did this: > > > > > > + if (!for_stack && vma->vm_start != addr_to_align) > > + return false; > > + > > + cur = find_vma_prev(vma->vm_mm, vma->vm_start, &prev); > > + if (WARN_ON_ONCE(cur != vma)) > > + return false; > > > > Which essentially means this patch is a NOOP in v5 for the stack case. > > > > > So what we really want is the VMA previous to @vma and whether than subsumes > > the masked address. > > > > Should I just change it back to the v4 version then as above for both patch 1 > > and 2 and carry your review tags? > > You will not be surprised to hear that I'd rather not :) I think if we did > revert to that approach it'd need rework anyway, so I'd ask for a respin w/o > tag if we were to go down that road. > > HOWEVER let's first clarify what we want to check. > > My understand (please correct me if mistaken) is that there are two > acceptable cases:- > > 1. !for_stack > > addr_masked addr_to_align > | | > v v > . |-----| > . <-must be empty-> | vma | > . |-----| > > 2. for_stack > > addr_masked addr_to_align > | | > v v > |----.-------------------.-----| > | . vma . | > |----.-------------------.-----| > > Meaning that there are only two cases that we should care about:- > > 1. !for_stack: addr_to_align == vma->vm_start and no other VMA exists > between this and addr_masked > > 2. for_stack: addr_masked is in the same VMA as addr_to_align. > > In this case, the check can surely be:- > > return find_vma_intersection(vma->vm_mm, addr_masked, addr_to_align) == > (for_stack ? vma : NULL); > > (maybe would be less ugly to actually assign the intersection value to a > local var and check that) For completness: Lorenzo made some valid points on IRC and we'll do this patch (2/7) like this for v6 after sufficient testing. static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, unsigned long mask, bool for_stack) { unsigned long addr_masked = addr_to_align & mask; /* * If @addr_to_align of either source or destination is not the beginning * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping for cases other than the stack. */ if (!for_stack && vma->vm_start != addr_to_align) return false; /* In the stack case we explicitly permit in-VMA alignment. */ if (for_stack && addr_masked >= vma->vm_start) return true; /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. */ return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; } Thanks Lorenzo for the suggestion! > > > > This is also hard to test as it requires triggering the execve stack move > > case. Though it is not a bug (as it is essentially a NOOP), it still would be > > nice to test it. This is complicated by also the fact that mremap(2) itself > > does not allow overlapping moves. I could try to hardcode the unfavorable > > situation as I have done in the past to force that mremap warning. > > I find this exception a bit confusing, why are we so adamant on performing > the optimisation in this case when it makes the code uglier and is rather > hard to understand? Does it really matter that much? Let me know if you still felt it made the code uglier, but it looks like just one more if() condition. And who knows may be in the future we want to do such overlapping moves for other cases? ;) > I wonder whether it wouldn't be better to just drop that (unless you really > felt strongly about it) for the patch set and then perhaps address it in a > follow up? > This may entirely be a product of my simply not entirely understanding this > case so do forgive the probing, I just want to make sure we handle it > correctly! It was just to avoid that false-positive warning where we can align down the stack move to avoid warnings about zero'd PMDs. We could certainly do it in this series or as a follow-up but since we came up with the above snippet, I will keep it in this series for now and hopefully you are ok with that. thanks, - Joel
diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba94..244925307958 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -712,7 +712,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length, false)) + vma, new_start, length, false, true)) return -ENOMEM; lru_add_drain(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 406ab9ea818f..e635d1fc73b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2458,7 +2458,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks); + bool need_rmap_locks, bool for_stack); /* * Flags used by change_protection(). For now we make it a bitmap so diff --git a/mm/mremap.c b/mm/mremap.c index 035fbf542a8f..06baa13bd2c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, } /* - * A helper to check if a previous mapping exists. Required for - * move_page_tables() and realign_addr() to determine if a previous mapping - * exists before we can do realignment optimizations. + * A helper to check if aligning down is OK. The aligned address should fall + * on *no mapping*. For the stack moving down, that's a special move within + * the VMA that is created to span the source and destination of the move, + * so we make an exception for it. */ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, - unsigned long mask) + unsigned long mask, bool for_stack) { unsigned long addr_masked = addr_to_align & mask; @@ -504,7 +505,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ - if (vma->vm_start != addr_to_align) + if (!for_stack && vma->vm_start != addr_to_align) return false; /* @@ -517,7 +518,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali /* Opportunistically realign to specified boundary for faster copy. */ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, unsigned long *new_addr, struct vm_area_struct *new_vma, - unsigned long mask) + unsigned long mask, bool for_stack) { /* Skip if the addresses are already aligned. */ if ((*old_addr & ~mask) == 0) @@ -528,8 +529,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old return; /* Ensure realignment doesn't cause overlap with existing mappings. */ - if (!can_align_down(old_vma, *old_addr, mask) || - !can_align_down(new_vma, *new_addr, mask)) + if (!can_align_down(old_vma, *old_addr, mask, for_stack) || + !can_align_down(new_vma, *new_addr, mask, for_stack)) return; *old_addr = *old_addr & mask; @@ -539,7 +540,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks) + bool need_rmap_locks, bool for_stack) { unsigned long extent, old_end; struct mmu_notifier_range range; @@ -559,9 +560,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ - if ((vma != new_vma) - && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, + for_stack); flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, @@ -708,7 +709,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks); + need_rmap_locks, false); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { @@ -722,7 +723,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * and then proceed to unmap new area instead of old. */ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true); + true, false); vma = new_vma; old_len = new_len; old_addr = new_addr;
For the stack move happening in shift_arg_pages(), the move is happening within the same VMA which spans the old and new ranges. In case the aligned address happens to fall within that VMA, allow such moves and don't abort the optimization. In the mremap case, we cannot allow any such moves as will end up destroying some part of the mapping (either the source of the move, or part of the existing mapping). So just avoid it for mremap. Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org> --- fs/exec.c | 2 +- include/linux/mm.h | 2 +- mm/mremap.c | 29 +++++++++++++++-------------- 3 files changed, 17 insertions(+), 16 deletions(-)