[v4,10/12] x86,tlb: do targeted broadcast flushing from tlbbatch code

Message ID: 20250112155453.1104139-11-riel@surriel.com
State: New
Series: AMD broadcast TLB invalidation

Commit Message

Rik van Riel Jan. 12, 2025, 3:53 p.m. UTC
Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.

This also allows us to avoid adding the CPUs of processes using broadcast
flushing to the batch->cpumask, and will hopefully further reduce TLB
flushing from the reclaim and compaction paths.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/tlbbatch.h |  1 +
 arch/x86/include/asm/tlbflush.h | 12 +++-------
 arch/x86/mm/tlb.c               | 41 ++++++++++++++++++++++++++++++---
 3 files changed, 42 insertions(+), 12 deletions(-)
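
The resulting caller-side contract is roughly the following (a sketch based on the
diff below, not code from the patch; the reclaim-side loop and names are illustrative):

	/*
	 * Illustrative only: each unmapped page queues an asynchronous,
	 * targeted INVLPGB without waiting, and the final call to
	 * arch_tlbbatch_flush() waits for all of them with a single
	 * TLBSYNC issued on the same CPU.
	 */
	for_each_page_being_unmapped(page) {
		/* first use sets used_invlpgb and calls migrate_disable() */
		arch_tlbbatch_add_pending(&batch->arch, mm, uaddr);
	}
	arch_tlbbatch_flush(&batch->arch);	/* TLBSYNC + migrate_enable() */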

Comments

Jann Horn Jan. 13, 2025, 5:05 p.m. UTC | #1
On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com> wrote:
> Instead of doing a system-wide TLB flush from arch_tlbbatch_flush,
> queue up asynchronous, targeted flushes from arch_tlbbatch_add_pending.
>
> This also allows us to avoid adding the CPUs of processes using broadcast
> flushing to the batch->cpumask, and will hopefully further reduce TLB
> flushing from the reclaim and compaction paths.
[...]
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 80375ef186d5..532911fbb12a 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1658,9 +1658,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>          * a local TLB flush is needed. Optimize this use-case by calling
>          * flush_tlb_func_local() directly in this case.
>          */
> -       if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
> -               invlpgb_flush_all_nonglobals();
> -       } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
> +       if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
>                 flush_tlb_multi(&batch->cpumask, info);
>         } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
>                 lockdep_assert_irqs_enabled();
> @@ -1669,12 +1667,49 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>                 local_irq_enable();
>         }
>
> +       /*
> +        * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> +        * The cpumask above contains only CPUs that were running tasks
> +        * not using broadcast TLB flushing.
> +        */
> +       if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> +               tlbsync();
> +               migrate_enable();
> +               batch->used_invlpgb = false;
> +       }
> +
>         cpumask_clear(&batch->cpumask);
>
>         put_flush_tlb_info();
>         put_cpu();
>  }
>
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +                                            struct mm_struct *mm,
> +                                            unsigned long uaddr)
> +{
> +       if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> +               u16 asid = mm_global_asid(mm);
> +               /*
> +                * Queue up an asynchronous invalidation. The corresponding
> +                * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +                * on the same CPU.
> +                */
> +               if (!batch->used_invlpgb) {
> +                       batch->used_invlpgb = true;
> +                       migrate_disable();
> +               }
> +               invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> +               /* Do any CPUs supporting INVLPGB need PTI? */
> +               if (static_cpu_has(X86_FEATURE_PTI))
> +                       invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
> +       } else {
> +               inc_mm_tlb_gen(mm);
> +               cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +       }
> +       mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> +}

How does this work if the MM is currently transitioning to a global
ASID? Should the "mm_global_asid(mm)" check maybe be replaced with
something that checks if the MM has fully transitioned to a global
ASID, so that we keep using the classic path if there might be holdout
CPUs?
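
For illustration, the stricter check might look something like this (a sketch,
not code from this series; mm_in_asid_transition() is a made-up name for
whatever per-mm state tracks CPUs that have not switched over yet):

	/* Hypothetical sketch only: */
	static inline bool mm_fully_global_asid(struct mm_struct *mm)
	{
		/* only skip the IPI path once no holdout CPUs can remain */
		return mm_global_asid(mm) && !mm_in_asid_transition(mm);
	}

arch_tlbbatch_add_pending() would then test mm_fully_global_asid(mm) instead
of mm_global_asid(mm), so the classic cpumask/IPI path keeps covering any CPUs
still running on a per-CPU ASID.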
Jann Horn Jan. 13, 2025, 5:48 p.m. UTC | #2
On Mon, Jan 13, 2025 at 6:05 PM Jann Horn <jannh@google.com> wrote:
> [...]
> How does this work if the MM is currently transitioning to a global
> ASID? Should the "mm_global_asid(mm)" check maybe be replaced with
> something that checks if the MM has fully transitioned to a global
> ASID, so that we keep using the classic path if there might be holdout
> CPUs?

Ah, but if we did that, we'd also have to ensure that the MM switching
path keeps invalidating the TLB when the MM's TLB generation count
increments, even if the CPU has already switched to the global ASID.
Rik van Riel Jan. 13, 2025, 9:16 p.m. UTC | #3
On Mon, 2025-01-13 at 18:05 +0100, Jann Horn wrote:
> On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com>
> wrote:
> 
> > 
> > +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> > +                                            struct mm_struct *mm,
> > +                                            unsigned long uaddr)
> > +{
> > +       if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
> > +               u16 asid = mm_global_asid(mm);
> > +               /*
> > +                * Queue up an asynchronous invalidation. The corresponding
> > +                * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> > +                * on the same CPU.
> > +                */
> > +               if (!batch->used_invlpgb) {
> > +                       batch->used_invlpgb = true;
> > +                       migrate_disable();
> > +               }
> > +               invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
> > +               /* Do any CPUs supporting INVLPGB need PTI? */
> > +               if (static_cpu_has(X86_FEATURE_PTI))
> > +                       invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
> > +       } else {
> > +               inc_mm_tlb_gen(mm);
> > +               cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> > +       }
> > +       mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
> > +}
> 
> How does this work if the MM is currently transitioning to a global
> ASID? Should the "mm_global_asid(mm)" check maybe be replaced with
> something that checks if the MM has fully transitioned to a global
> ASID, so that we keep using the classic path if there might be
> holdout
> CPUs?
> 
You are right!

If the mm is still transitioning, we should send a
TLB flush IPI, in addition to doing the broadcast shootdown.

Worst case, a CPU is already using the global ASID and the
TLB flush IPI ends up being a no-op there.
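
Something along these lines, where mm_in_asid_transition() is just a
stand-in name for however that transition state ends up being tracked
(untested sketch, not the final code):

	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
		u16 asid = mm_global_asid(mm);

		/* Queue the asynchronous broadcast flushes, as in the patch. */
		if (!batch->used_invlpgb) {
			batch->used_invlpgb = true;
			migrate_disable();
		}
		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
		if (static_cpu_has(X86_FEATURE_PTI))
			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
	}

	/*
	 * While the mm has no global ASID, or is still transitioning to
	 * one, also use the classic batching so holdout CPUs running on
	 * a per-CPU ASID get a TLB flush IPI from arch_tlbbatch_flush().
	 */
	if (!mm_global_asid(mm) || mm_in_asid_transition(mm)) {
		inc_mm_tlb_gen(mm);
		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
	}
	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);

The TLBSYNC/migrate_enable() pairing in arch_tlbbatch_flush() stays the
same; the only change is that the cpumask/tlb_gen path is no longer
skipped while holdout CPUs may exist.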

Patch

diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
index 1ad56eb3e8a8..f9a17edf63ad 100644
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@  struct arch_tlbflush_unmap_batch {
 	 * the PFNs being flushed..
 	 */
 	struct cpumask cpumask;
+	bool used_invlpgb;
 };
 
 #endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cd244cdd49dd..fa4fcafa8b87 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -350,21 +350,15 @@  static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
-{
-	inc_mm_tlb_gen(mm);
-	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	flush_tlb_mm(mm);
 }
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr);
 
 static inline bool pte_flags_need_flush(unsigned long oldflags,
 					unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 80375ef186d5..532911fbb12a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1658,9 +1658,7 @@  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
-		invlpgb_flush_all_nonglobals();
-	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
 		flush_tlb_multi(&batch->cpumask, info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
@@ -1669,12 +1667,49 @@  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		local_irq_enable();
 	}
 
+	/*
+	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+	 * The cpumask above contains only CPUs that were running tasks
+	 * not using broadcast TLB flushing.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+		tlbsync();
+		migrate_enable();
+		batch->used_invlpgb = false;
+	}
+
 	cpumask_clear(&batch->cpumask);
 
 	put_flush_tlb_info();
 	put_cpu();
 }
 
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+					     struct mm_struct *mm,
+					     unsigned long uaddr)
+{
+	if (static_cpu_has(X86_FEATURE_INVLPGB) && mm_global_asid(mm)) {
+		u16 asid = mm_global_asid(mm);
+		/*
+		 * Queue up an asynchronous invalidation. The corresponding
+		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+		 * on the same CPU.
+		 */
+		if (!batch->used_invlpgb) {
+			batch->used_invlpgb = true;
+			migrate_disable();
+		}
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+	} else {
+		inc_mm_tlb_gen(mm);
+		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	}
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
 /*
  * Blindly accessing user memory from NMI context can be dangerous
  * if we're in the middle of switching the current user task or