
[4/6] mm: proc: Invalidate TLB after clearing soft-dirty page state

Message ID 20201120143557.6715-5-will@kernel.org (mailing list archive)
State New, archived
Headers show
Series tlb: Fix access and (soft-)dirty bit management

Commit Message

Will Deacon Nov. 20, 2020, 2:35 p.m. UTC
Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
via the tlb_remove_*() functions. Consequently, the page-table modifications
performed by clear_refs_write() in response to a write to
/proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
fine when simply aging the ptes, in the case of clearing the "soft-dirty"
state we can end up with entries where pte_write() is false, yet a
writable mapping remains in the TLB.

Fix this by calling tlb_remove_tlb_entry() for each entry being
write-protected when clearing soft-dirty.

Signed-off-by: Will Deacon <will@kernel.org>
---
 fs/proc/task_mmu.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)
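
[ For context: the elided flush comes from the generic mmu_gather code. The
  snippet below is a simplified paraphrase of include/asm-generic/tlb.h around
  v5.10, not a verbatim copy: because clear_refs_write() never adjusted the
  gather range or set any of the cleared_* bits, the early return fired and no
  invalidation was issued; tlb_remove_tlb_entry() sets cleared_ptes and grows
  the range, which is what makes tlb_finish_mmu() actually flush. ]

static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
	/*
	 * Anything calling __tlb_adjust_range() also sets at least one of
	 * these bits.
	 */
	if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
	      tlb->cleared_puds || tlb->cleared_p4ds))
		return;		/* nothing batched: the flush is elided */

	tlb_flush(tlb);
	__tlb_reset_range(tlb);
}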

Comments

Peter Zijlstra Nov. 20, 2020, 3 p.m. UTC | #1
On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> via the tlb_remove_*() functions. Consequently, the page-table modifications
> performed by clear_refs_write() in response to a write to
> /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> state we can end up with entries where pte_write() is false, yet a
> writable mapping remains in the TLB.
> 
> Fix this by calling tlb_remove_tlb_entry() for each entry being
> write-protected when clearing soft-dirty.
> 

> @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
>  		ptent = pte_wrprotect(old_pte);
>  		ptent = pte_clear_soft_dirty(ptent);
>  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> +		tlb_remove_tlb_entry(tlb, pte, addr);
>  	} else if (is_swap_pte(ptent)) {
>  		ptent = pte_swp_clear_soft_dirty(ptent);
>  		set_pte_at(vma->vm_mm, addr, pte, ptent);

Oh!

Yesterday when you had me look at this code, I figured the sane thing
to do was to make it look more like mprotect().

Why did you choose to make it work with mmu_gather instead? I'll grant
you that it's probably the smaller patch, but I still think it's weird
to use mmu_gather here.

Also, is tlb_remove_tlb_entry() actually correct? If you look at
__tlb_remove_tlb_entry() you'll find that Power-Hash-32 will clear the
entry, which might not be what we want here; we want to update the
entry.
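
(For reference, and from memory so treat it as approximate: the powerpc
override of __tlb_remove_tlb_entry() for 32-bit hash MMUs looked roughly
like the below around v5.10, and flush_hash_entry() evicts the hash PTE
rather than updating it.)

static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
					  unsigned long address)
{
#ifdef CONFIG_PPC_BOOK3S_32
	if (pte_val(*ptep) & _PAGE_HASHPTE)
		flush_hash_entry(tlb->mm, ptep, address);
#endif
}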
Peter Zijlstra Nov. 20, 2020, 3:09 p.m. UTC | #2
On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:

> If you look at __tlb_remove_tlb_entry()

... you'll also find we can probably do this ... :-)

diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index e841cae544c2..779a5a0f0608 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -24,7 +24,6 @@ void flush_tlb_pending(void);
 
 #define tlb_start_vma(tlb, vma) do { } while (0)
 #define tlb_end_vma(tlb, vma)	do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #define tlb_flush(tlb)	flush_tlb_pending()
 
 /*
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 820082bd6880..1bfe979bb9bc 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -4,7 +4,6 @@
 
 #define tlb_start_vma(tlb, vma) do { } while (0)
 #define tlb_end_vma(tlb, vma) do { } while (0)
-#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
 #define tlb_flush tlb_flush
 static inline void tlb_flush(struct mmu_gather *tlb);
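
(The removals above lean on the generic fallback; if memory serves,
asm-generic/tlb.h already provides an empty default when an architecture
does not supply its own hook, along the lines of:)

#ifndef __tlb_remove_tlb_entry
#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
#endif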
Will Deacon Nov. 20, 2020, 3:15 p.m. UTC | #3
On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > performed by clear_refs_write() in response to a write to
> > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > state we can end up with entries where pte_write() is false, yet a
> > writable mapping remains in the TLB.
> > 
> > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > write-protected when clearing soft-dirty.
> > 
> 
> > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> >  		ptent = pte_wrprotect(old_pte);
> >  		ptent = pte_clear_soft_dirty(ptent);
> >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > +		tlb_remove_tlb_entry(tlb, pte, addr);
> >  	} else if (is_swap_pte(ptent)) {
> >  		ptent = pte_swp_clear_soft_dirty(ptent);
> >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> 
> Oh!
> 
> Yesterday when you had me look at this code, I figured the sane thing
> to do was to make it look more like mprotect().

Ah, so you mean ditch the mmu_gather altogether?

> Why did you choose to make it work with mmu_gather instead? I'll grant
> you that it's probably the smaller patch, but I still think it's weird
> to use mmu_gather here.
> 
> Also, is tlb_remove_tlb_entry() actually correct? If you look at
> __tlb_remove_tlb_entry() you'll find that Power-Hash-32 will clear the
> entry, which might not be what we want here; we want to update the
> entry.

Hmm, I didn't spot that, although ptep_modify_prot_start() does actually
clear the pte so we could just move this up a few lines.

Will

Peter Zijlstra Nov. 20, 2020, 3:27 p.m. UTC | #4
On Fri, Nov 20, 2020 at 03:15:24PM +0000, Will Deacon wrote:
> On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > performed by clear_refs_write() in response to a write to
> > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > state we can end up with entries where pte_write() is false, yet a
> > > writable mapping remains in the TLB.
> > > 
> > > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > > write-protected when clearing soft-dirty.
> > > 
> > 
> > > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> > >  		ptent = pte_wrprotect(old_pte);
> > >  		ptent = pte_clear_soft_dirty(ptent);
> > >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > > +		tlb_remove_tlb_entry(tlb, pte, addr);
> > >  	} else if (is_swap_pte(ptent)) {
> > >  		ptent = pte_swp_clear_soft_dirty(ptent);
> > >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> > 
> > Oh!
> > 
> > Yesterday when you had me look at this code, I figured the sane thing
> > to do was to make it look more like mprotect().
> 
> Ah, so you mean ditch the mmu_gather altogether?

Yes. Alternatively, if we decide mmu_gather is 'right', then we should
probably look at converting mprotect().

That is, I see no reason why this and mprotect should differ on this
point.

> > Why did you choose to make it work with mmu_gather instead? I'll grant
> > you that it's probably the smaller patch, but I still think it's weird
> > to use mmu_gather here.
> > 
> > Also, is tlb_remove_tlb_entry() actually correct? If you look at
> > __tlb_remove_tlb_entry() you'll find that Power-Hash-32 will clear the
> > entry, which might not be what we want here; we want to update the
> > entry.
> 
> Hmm, I didn't spot that, although ptep_modify_prot_start() does actually
> clear the pte so we could just move this up a few lines.

Yes, but hash-entry != pte. If I'm not mistaken (and I could very well
be, it's Friday and Power-MMUs being the maze they are), the end result
here is an updated PTE but an empty hash-entry.
Minchan Kim Nov. 20, 2020, 3:55 p.m. UTC | #5
On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > performed by clear_refs_write() in response to a write to
> > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > state we can end up with entries where pte_write() is false, yet a
> > writable mapping remains in the TLB.
> > 
> > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > > write-protected when clearing soft-dirty.
> > 
> 
> > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> >  		ptent = pte_wrprotect(old_pte);
> >  		ptent = pte_clear_soft_dirty(ptent);
> >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > +		tlb_remove_tlb_entry(tlb, pte, addr);
> >  	} else if (is_swap_pte(ptent)) {
> >  		ptent = pte_swp_clear_soft_dirty(ptent);
> >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> 
> Oh!
> 
> Yesterday when you had me look at this code, I figured the sane thing
> to do was to make it look more like mprotect().
> 
> Why did you choose to make it work with mmu_gather instead? I'll grant
> you that it's probably the smaller patch, but I still think it's weird
> to use mmu_gather here.

I agree. The reason why clear_refs_write() used the gather API was [1], and
it seems like overkill to me.

We could just use [inc|dec]_tlb_flush_pending() with a flush_tlb_mm() right
before dec_tlb_flush_pending() instead of the gather.

Thoughts?

[1] b3a81d0841a95, mm: fix KSM data corruption
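
Something like the below, as a rough, untested sketch of that idea (the
walk setup is abbreviated and the names follow the existing code):

	inc_tlb_flush_pending(mm);
	walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, &cp);
	if (type == CLEAR_REFS_SOFT_DIRTY)
		flush_tlb_mm(mm);		/* one flush, no gather */
	dec_tlb_flush_pending(mm);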
Yu Zhao Nov. 20, 2020, 8:22 p.m. UTC | #6
On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> via the tlb_remove_*() functions. Consequently, the page-table modifications
> performed by clear_refs_write() in response to a write to
> /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> state we can end up with entries where pte_write() is false, yet a
> writable mapping remains in the TLB.

I don't think we need a TLB flush in this context, same reason as we
don't have one in copy_present_pte() which uses ptep_set_wrprotect()
to write-protect a src PTE.

ptep_modify_prot_start/commit() and ptep_set_wrprotect() guarantee
either the dirty bit is set (when a PTE is still writable) or a PF
happens (when a PTE has become r/o) when h/w page table walker races
with kernel that modifies a PTE using the two APIs.

> Fix this by calling tlb_remove_tlb_entry() for each entry being
> write-protected when clearing soft-dirty.
> 
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
>  fs/proc/task_mmu.c | 18 +++++++++++-------
>  1 file changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index cd03ab9087b0..3308292ee5c5 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1032,11 +1032,12 @@ enum clear_refs_types {
>  
>  struct clear_refs_private {
>  	enum clear_refs_types type;
> +	struct mmu_gather *tlb;
>  };
>  
>  #ifdef CONFIG_MEM_SOFT_DIRTY
>  static inline void clear_soft_dirty(struct vm_area_struct *vma,
> -		unsigned long addr, pte_t *pte)
> +		unsigned long addr, pte_t *pte, struct mmu_gather *tlb)
>  {
>  	/*
>  	 * The soft-dirty tracker uses #PF-s to catch writes
> @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
>  		ptent = pte_wrprotect(old_pte);
>  		ptent = pte_clear_soft_dirty(ptent);
>  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> +		tlb_remove_tlb_entry(tlb, pte, addr);
>  	} else if (is_swap_pte(ptent)) {
>  		ptent = pte_swp_clear_soft_dirty(ptent);
>  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> @@ -1060,14 +1062,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
>  }
>  #else
>  static inline void clear_soft_dirty(struct vm_area_struct *vma,
> -		unsigned long addr, pte_t *pte)
> +		unsigned long addr, pte_t *pte, struct mmu_gather *tlb)
>  {
>  }
>  #endif
>  
>  #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
> -		unsigned long addr, pmd_t *pmdp)
> +		unsigned long addr, pmd_t *pmdp, struct mmu_gather *tlb)
>  {
>  	pmd_t old, pmd = *pmdp;
>  
> @@ -1081,6 +1083,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
>  
>  		pmd = pmd_wrprotect(pmd);
>  		pmd = pmd_clear_soft_dirty(pmd);
> +		tlb_remove_pmd_tlb_entry(tlb, pmdp, addr);
>  
>  		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
>  	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
> @@ -1090,7 +1093,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
>  }
>  #else
>  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
> -		unsigned long addr, pmd_t *pmdp)
> +		unsigned long addr, pmd_t *pmdp, struct mmu_gather *tlb)
>  {
>  }
>  #endif
> @@ -1107,7 +1110,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
>  	ptl = pmd_trans_huge_lock(pmd, vma);
>  	if (ptl) {
>  		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
> -			clear_soft_dirty_pmd(vma, addr, pmd);
> +			clear_soft_dirty_pmd(vma, addr, pmd, cp->tlb);
>  			goto out;
>  		}
>  
> @@ -1133,7 +1136,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
>  		ptent = *pte;
>  
>  		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
> -			clear_soft_dirty(vma, addr, pte);
> +			clear_soft_dirty(vma, addr, pte, cp->tlb);
>  			continue;
>  		}
>  
> @@ -1212,7 +1215,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
>  	if (mm) {
>  		struct mmu_notifier_range range;
>  		struct clear_refs_private cp = {
> -			.type = type,
> +			.type	= type,
> +			.tlb	= &tlb,
>  		};
>  
>  		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
> -- 
> 2.29.2.454.gaff20da3a2-goog
>
Yu Zhao Nov. 21, 2020, 2:49 a.m. UTC | #7
On Fri, Nov 20, 2020 at 01:22:53PM -0700, Yu Zhao wrote:
> On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > performed by clear_refs_write() in response to a write to
> > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > state we can end up with entries where pte_write() is false, yet a
> > writable mapping remains in the TLB.
> 
> I don't think we need a TLB flush in this context, same reason as we
> don't have one in copy_present_pte() which uses ptep_set_wrprotect()
> to write-protect a src PTE.
> 
> ptep_modify_prot_start/commit() and ptep_set_wrprotect() guarantee
> either the dirty bit is set (when a PTE is still writable) or a PF
> happens (when a PTE has become r/o) when h/w page table walker races
> with kernel that modifies a PTE using the two APIs.

After we remove the writable bit, if we end up with a clean PTE, any
subsequent write will trigger a page fault. We can't have a stale
writable tlb entry. The architecture-specific APIs guarantee this.

If we end up with a dirty PTE, then yes, there will be a stale
writable tlb entry. But this won't be a problem because when we
write-protect a page (not PTE), we always check both pte_dirty()
and pte_write(), i.e., write_protect_page() and page_mkclean_one().
When they see this dirty PTE, they will flush. And generally, only
callers of pte_mkclean() should flush tlb; otherwise we end up with one
extra flush if callers of pte_mkclean() and pte_wrprotect() both flush.

Now let's take a step back and see why we got
tlb_gather/finish_mmu() here in the first place. Commit b3a81d0841a95
("mm: fix KSM data corruption") explains the problem clearly. But
to fix a problem created by two threads clearing pte_write() and
pte_dirty() independently, we only need one of them to set
mm_tlb_flush_pending(). Given only removing the writable bit requires
tlb flush, that thread should be the one, as I just explained. Adding
tlb_gather/finish_mmu() is unnecessary in that fix. And there is no
point in having the original flush_tlb_mm() either, given data
integrity is already guaranteed. Of course, with it we have more
accurate access tracking.

Does a similar problem exist for page_mkclean_one()? Possibly. It
checks pte_dirty() and pte_write() but not mm_tlb_flush_pending().
At the moment, madvise_free_pte_range() only supports anonymous
memory, which doesn't do writeback. But the missing
mm_tlb_flush_pending() just seems to be an accident waiting to happen.
E.g., clean_record_pte() calls pte_mkclean() and does batched flush.
I don't know what it's for, but if it's called on file VMAs, a similar
race involving 4 CPUs can happen. This time CPU 1 runs
clean_record_pte() and CPU 3 runs page_mkclean_one().
Will Deacon Nov. 23, 2020, 6:23 p.m. UTC | #8
Hi Peter,

On Fri, Nov 20, 2020 at 04:27:31PM +0100, Peter Zijlstra wrote:
> On Fri, Nov 20, 2020 at 03:15:24PM +0000, Will Deacon wrote:
> > On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > > performed by clear_refs_write() in response to a write to
> > > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > > state we can end up with entries where pte_write() is false, yet a
> > > > writable mapping remains in the TLB.
> > > > 
> > > > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > > > write-protected when clearing soft-dirty.
> > > > 
> > > 
> > > > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> > > >  		ptent = pte_wrprotect(old_pte);
> > > >  		ptent = pte_clear_soft_dirty(ptent);
> > > >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > > > +		tlb_remove_tlb_entry(tlb, pte, addr);
> > > >  	} else if (is_swap_pte(ptent)) {
> > > >  		ptent = pte_swp_clear_soft_dirty(ptent);
> > > >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> > > 
> > > Oh!
> > > 
> > > Yesterday when you had me look at this code, I figured the sane thing
> > > to do was to make it look more like mprotect().
> > 
> > Ah, so you mean ditch the mmu_gather altogether?
> 
> Yes. Alternatively, if we decide mmu_gather is 'right', then we should
> probably look at converting mprotect().
> 
> That is, I see no reason why this and mprotect should differ on this
> point.

I agree that we should aim for consistency, but it's worth pointing out
that madvise() uses the gather API in the same way that I'm proposing
here (see MADV_COLD/MADV_PAGEOUT).

Another thing to keep in mind is that, unlike mprotect(), we do actually
want to elide the TLB invalidation in clear_refs_write() when all we're
doing is making the pages old. The gather API lends itself quite nicely
to this, as we only update the range when actually doing the write
protection on the soft-dirty path.
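
Roughly, simplified from the patched clear_refs_pte_range() just to show
the two paths (not a complete excerpt):

		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
			/* write-protect path: records the range via
			 * tlb_remove_tlb_entry(), so tlb_finish_mmu() flushes */
			clear_soft_dirty(vma, addr, pte, cp->tlb);
			continue;
		}

		/* aging path: nothing is added to the gather, so the flush
		 * in tlb_finish_mmu() is elided */
		ptep_test_and_clear_young(vma, addr, pte);
		ClearPageReferenced(page);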

> > > Why did you choose to make it work with mmu_gather instead? I'll grant
> > > you that it's probably the smaller patch, but I still think it's weird
> > > to use mmu_gather here.
> > > 
> > > Also, is tlb_remove_tlb_entry() actually correct? If you look at
> > > __tlb_remove_tlb_entry() you'll find that Power-Hash-32 will clear the
> > > entry, which might not be what we want here; we want to update the
> > > entry.
> > 
> > Hmm, I didn't spot that, although ptep_modify_prot_start() does actually
> > clear the pte so we could just move this up a few lines.
> 
> Yes, but hash-entry != pte. If I'm not mistaken (and I could very well
> be, it's Friday and Power-MMUs being the maze they are), the end result
> here is an updated PTE but an empty hash-entry.

I had a look at the PPC code and, afaict, this should be fine. The next
access will fault, and we'll populate the hash entry from the pte.

Am I missing something?

If we _really_ wanted to, then we could extend the mmu gather API to add
something like tlb_update_tlb_entry(), which would call
tlb_remove_tlb_entry() under the hood, and set a flag on the gather
structure so that tlb_finish_mmu() ends up calling update_mmu_cache() to
preload the hash.
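
Purely as a hypothetical sketch (none of this exists today, and the flag
name is made up):

#define tlb_update_tlb_entry(tlb, ptep, address)		\
	do {							\
		tlb_remove_tlb_entry(tlb, ptep, address);	\
		(tlb)->update_caches = 1;	/* hypothetical flag */	\
	} while (0)

with tlb_finish_mmu() then calling update_mmu_cache() over the flushed
range when update_caches is set.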

However, I think this is purely a performance thing, and I'm wary about
pro-actively extending the API to optimise for the PPC hash.

Will
Will Deacon Nov. 23, 2020, 6:41 p.m. UTC | #9
On Fri, Nov 20, 2020 at 07:55:14AM -0800, Minchan Kim wrote:
> On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > performed by clear_refs_write() in response to a write to
> > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > state we can end up with entries where pte_write() is false, yet a
> > > writable mapping remains in the TLB.
> > > 
> > > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > > write-protected when clearing soft-dirty.
> > > 
> > 
> > > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> > >  		ptent = pte_wrprotect(old_pte);
> > >  		ptent = pte_clear_soft_dirty(ptent);
> > >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > > +		tlb_remove_tlb_entry(tlb, pte, addr);
> > >  	} else if (is_swap_pte(ptent)) {
> > >  		ptent = pte_swp_clear_soft_dirty(ptent);
> > >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> > 
> > Oh!
> > 
> > Yesterday when you had me look at this code, I figured the sane thing
> > to do was to make it look more like mprotect().
> > 
> > Why did you choose to make it work with mmu_gather instead? I'll grant
> > you that it's probably the smaller patch, but I still think it's weird
> > to use mmu_gather here.
> 
> I agree. The reason why clear_refs_write() used the gather API was [1], and
> it seems like overkill to me.

I don't see why it's overkill. Prior to that commit, it called
flush_tlb_mm() directly.

> We could just use [inc|dec]_tlb_flush_pending() with a flush_tlb_mm() right
> before dec_tlb_flush_pending() instead of the gather.
> 
> Thoughts?

I'm not sure why this is better; it's different to the madvise() path, and
will need special logic to avoid the flush in the case where we're just
doing aging.

Will

> [1] b3a81d0841a95, mm: fix KSM data corruption
Yu Zhao Nov. 23, 2020, 7:21 p.m. UTC | #10
On Fri, Nov 20, 2020 at 07:49:22PM -0700, Yu Zhao wrote:
> On Fri, Nov 20, 2020 at 01:22:53PM -0700, Yu Zhao wrote:
> > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > performed by clear_refs_write() in response to a write to
> > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > state we can end up with entries where pte_write() is false, yet a
> > > writable mapping remains in the TLB.

I double checked my conclusion and I think it holds. But let me
correct some typos and add a summary.

> > I don't think we need a TLB flush in this context, same reason as we
                                ^^^^^ gather

> > don't have one in copy_present_pte() which uses ptep_set_wrprotect()
> > to write-protect a src PTE.
> > 
> > ptep_modify_prot_start/commit() and ptep_set_wrprotect() guarantee
> > either the dirty bit is set (when a PTE is still writable) or a PF
> > happens (when a PTE has become r/o) when h/w page table walker races
> > with kernel that modifies a PTE using the two APIs.
> 
> After we remove the writable bit, if we end up with a clean PTE, any
> subsequent write will trigger a page fault. We can't have a stale
> writable tlb entry. The architecture-specific APIs guarantee this.
> 
> If we end up with a dirty PTE, then yes, there will be a stale
> writable tlb entry. But this won't be a problem because when we
> write-protect a page (not PTE), we always check both pte_dirty()
> and pte_write(), i.e., write_protect_page() and page_mkclean_one().
> When they see this dirty PTE, they will flush. And generally, only
> callers of pte_mkclean() should flush tlb; otherwise we end up with one
> extra flush if callers of pte_mkclean() and pte_wrprotect() both flush.
> 
> Now let's take a step back and see why we got
> tlb_gather/finish_mmu() here in the first place. Commit b3a81d0841a95
> ("mm: fix KSM data corruption") explains the problem clearly. But
> to fix a problem created by two threads clearing pte_write() and
> pte_dirty() independently, we only need one of them to set
> mm_tlb_flush_pending(). Given only removing the writable bit requires
                                                  ^^^^^^^^ dirty

> tlb flush, that thread should be the one, as I just explained. Adding
> tlb_gather/finish_mmu() is unnecessary in that fix. And there is no
> point in having the original flush_tlb_mm() either, given data
> integrity is already guaranteed.
(i.e., writable tlb entries are flushed when removing the dirty bit.)

> Of course, with it we have more accurate access tracking.
> 
> Does a similar problem exist for page_mkclean_one()? Possibly. It
> checks pte_dirty() and pte_write() but not mm_tlb_flush_pending().
> At the moment, madvise_free_pte_range() only supports anonymous
> memory, which doesn't do writeback. But the missing
> mm_tlb_flush_pending() just seems to be an accident waiting to happen.
> E.g., clean_record_pte() calls pte_mkclean() and does batched flush.
> I don't know what it's for, but if it's called on file VMAs, a similar
> race involving 4 CPUs can happen. This time CPU 1 runs
> clean_record_pte() and CPU 3 runs page_mkclean_one().

To summarize, IMO, we should 1) remove tlb_gather/finish_mmu() here;
2) check mm_tlb_flush_pending() in page_mkclean_one() and
dax_entry_mkclean().
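
For (2), an untested sketch, simplified from page_mkclean_one(): treat a
pending deferred flush as a reason not to skip, so a racing batched
wrprotect can't leave a stale writable TLB entry unnoticed:

	if (!pte_dirty(*pvmw.pte) && !pte_write(*pvmw.pte) &&
	    !mm_tlb_flush_pending(vma->vm_mm))
		continue;

	flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
	entry = ptep_clear_flush(vma, address, pvmw.pte);
	entry = pte_wrprotect(entry);
	entry = pte_mkclean(entry);
	set_pte_at(vma->vm_mm, address, pvmw.pte, entry);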
Will Deacon Nov. 23, 2020, 10:04 p.m. UTC | #11
On Fri, Nov 20, 2020 at 07:49:22PM -0700, Yu Zhao wrote:
> On Fri, Nov 20, 2020 at 01:22:53PM -0700, Yu Zhao wrote:
> > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > performed by clear_refs_write() in response to a write to
> > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > state we can end up with entries where pte_write() is false, yet a
> > > writable mapping remains in the TLB.
> > 
> > I don't think we need a TLB flush in this context, same reason as we
> > don't have one in copy_present_pte() which uses ptep_set_wrprotect()
> > to write-protect a src PTE.

Hmm. Afaict, copy_present_pte() is only called on the fork() path when
VM_WIPEONFORK is set. I think that's a bit different to the fault case,
and even then, there is a fullmm flush after the copy.

> > ptep_modify_prot_start/commit() and ptep_set_wrprotect() guarantee
> > either the dirty bit is set (when a PTE is still writable) or a PF
> > happens (when a PTE has become r/o) when h/w page table walker races
> > with kernel that modifies a PTE using the two APIs.
> 
> After we remove the writable bit, if we end up with a clean PTE, any
> subsequent write will trigger a page fault. We can't have a stale
> writable tlb entry. The architecture-specific APIs guarantee this.
> 
> If we end up with a dirty PTE, then yes, there will be a stale
> writable tlb entry. But this won't be a problem because when we
> write-protect a page (not PTE), we always check both pte_dirty()
> and pte_write(), i.e., write_protect_page() and page_mkclean_one().
> When they see this dirty PTE, they will flush. And generally, only
> callers of pte_mkclean() should flush tlb; otherwise we end up with one
> extra flush if callers of pte_mkclean() and pte_wrprotect() both flush.

I just find this sort of analysis incredibly fragile: we're justifying the
lack of TLB invalidation on a case-by-case basis rather than some general
rules that mean it is not required by construction. Even if all current
users don't need it, what guarantees that will still be true in six
months' time?
It's not like this stuff is easy to trigger in practice if we get it wrong.

Will
Minchan Kim Nov. 25, 2020, 10:51 p.m. UTC | #12
On Mon, Nov 23, 2020 at 06:41:14PM +0000, Will Deacon wrote:
> On Fri, Nov 20, 2020 at 07:55:14AM -0800, Minchan Kim wrote:
> > On Fri, Nov 20, 2020 at 04:00:23PM +0100, Peter Zijlstra wrote:
> > > On Fri, Nov 20, 2020 at 02:35:55PM +0000, Will Deacon wrote:
> > > > Since commit 0758cd830494 ("asm-generic/tlb: avoid potential double flush"),
> > > > TLB invalidation is elided in tlb_finish_mmu() if no entries were batched
> > > > via the tlb_remove_*() functions. Consequently, the page-table modifications
> > > > performed by clear_refs_write() in response to a write to
> > > > /proc/<pid>/clear_refs do not perform TLB invalidation. Although this is
> > > > fine when simply aging the ptes, in the case of clearing the "soft-dirty"
> > > > state we can end up with entries where pte_write() is false, yet a
> > > > writable mapping remains in the TLB.
> > > > 
> > > > Fix this by calling tlb_remove_tlb_entry() for each entry being
> > > > write-protected when clearing soft-dirty.
> > > > 
> > > 
> > > > @@ -1053,6 +1054,7 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
> > > >  		ptent = pte_wrprotect(old_pte);
> > > >  		ptent = pte_clear_soft_dirty(ptent);
> > > >  		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
> > > > +		tlb_remove_tlb_entry(tlb, pte, addr);
> > > >  	} else if (is_swap_pte(ptent)) {
> > > >  		ptent = pte_swp_clear_soft_dirty(ptent);
> > > >  		set_pte_at(vma->vm_mm, addr, pte, ptent);
> > > 
> > > Oh!
> > > 
> > > Yesterday when you had me look at this code, I figured the sane thing
> > > to do was to make it look more like mprotect().
> > > 
> > > Why did you choose to make it work with mmu_gather instead? I'll grant
> > > you that it's probably the smaller patch, but I still think it's weird
> > > to use mmu_gather here.
> > 
> > I agree. The reason why clear_refs_write() used the gather API was [1], and
> > it seems like overkill to me.
> 
> I don't see why it's overkill. Prior to that commit, it called
> flush_tlb_mm() directly.

The TLB gather was added to increase the tlb flush pending count for a
stability bug, not as a performance optimization (the commit never had
any numbers to support it and didn't have the logic to handle each pte
with the tlb gather), and it has now introduced a bug, so I take it as
overkill since the complication was *unnecessary* from the beginning.

> 
> > We could just use [inc|dec]_tlb_flush_pending() with a flush_tlb_mm() right
> > before dec_tlb_flush_pending() instead of the gather.
> > 
> > Thoughts?
> 
> I'm not sure why this is better; it's different to the madvise() path, and
> will need special logic to avoid the flush in the case where we're just
> doing aging.

I thought it would be better to fix the bug first with a *simple* patch and
then do the optimization on top of it.
Anyway, following Yu's comment, we don't need the gather API, or
even the flush, if we give up the accuracy (but I want to have it).

Patch

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cd03ab9087b0..3308292ee5c5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1032,11 +1032,12 @@  enum clear_refs_types {
 
 struct clear_refs_private {
 	enum clear_refs_types type;
+	struct mmu_gather *tlb;
 };
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
 static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
+		unsigned long addr, pte_t *pte, struct mmu_gather *tlb)
 {
 	/*
 	 * The soft-dirty tracker uses #PF-s to catch writes
@@ -1053,6 +1054,7 @@  static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
+		tlb_remove_tlb_entry(tlb, pte, addr);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -1060,14 +1062,14 @@  static inline void clear_soft_dirty(struct vm_area_struct *vma,
 }
 #else
 static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
+		unsigned long addr, pte_t *pte, struct mmu_gather *tlb)
 {
 }
 #endif
 
 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t *pmdp)
+		unsigned long addr, pmd_t *pmdp, struct mmu_gather *tlb)
 {
 	pmd_t old, pmd = *pmdp;
 
@@ -1081,6 +1083,7 @@  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 
 		pmd = pmd_wrprotect(pmd);
 		pmd = pmd_clear_soft_dirty(pmd);
+		tlb_remove_pmd_tlb_entry(tlb, pmdp, addr);
 
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
@@ -1090,7 +1093,7 @@  static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 }
 #else
 static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
-		unsigned long addr, pmd_t *pmdp)
+		unsigned long addr, pmd_t *pmdp, struct mmu_gather *tlb)
 {
 }
 #endif
@@ -1107,7 +1110,7 @@  static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
-			clear_soft_dirty_pmd(vma, addr, pmd);
+			clear_soft_dirty_pmd(vma, addr, pmd, cp->tlb);
 			goto out;
 		}
 
@@ -1133,7 +1136,7 @@  static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		ptent = *pte;
 
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
-			clear_soft_dirty(vma, addr, pte);
+			clear_soft_dirty(vma, addr, pte, cp->tlb);
 			continue;
 		}
 
@@ -1212,7 +1215,8 @@  static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 	if (mm) {
 		struct mmu_notifier_range range;
 		struct clear_refs_private cp = {
-			.type = type,
+			.type	= type,
+			.tlb	= &tlb,
 		};
 
 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {