Message ID | 20250205014033.3626204-11-riel@surriel.com
---|---
State | New
Series | AMD broadcast TLB invalidation
On Tue, Feb 04, 2025 at 08:39:59PM -0500, Rik van Riel wrote:
> @@ -1657,12 +1655,65 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  		local_irq_enable();
>  	}
>  
> +	/*
> +	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
> +	 * The cpumask above contains only CPUs that were running tasks
> +	 * not using broadcast TLB flushing.
> +	 */
> +	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
> +		tlbsync();
> +		migrate_enable();
> +		batch->used_invlpgb = false;
> +	}
> +
>  	cpumask_clear(&batch->cpumask);
>  
>  	put_flush_tlb_info();
>  	put_cpu();
>  }
>  
> +void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
> +			       struct mm_struct *mm,
> +			       unsigned long uaddr)
> +{
> +	u16 asid = mm_global_asid(mm);
> +
> +	if (asid) {
> +		/*
> +		 * Queue up an asynchronous invalidation. The corresponding
> +		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
> +		 * on the same CPU.
> +		 */
> +		if (!batch->used_invlpgb) {
> +			batch->used_invlpgb = true;
> +			migrate_disable();
> +		}

How about we do something like this instead?

This keeps all the TLBSYNC in the same task as the INVLPGB, without
making things complicated and allowing random CR3 writes in between
them -- which makes my head hurt.

---
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,7 +10,6 @@ struct arch_tlbflush_unmap_batch {
 	 * the PFNs being flushed..
 	 */
 	struct cpumask cpumask;
-	bool used_invlpgb;
 };
 
 #endif /* _ARCH_X86_TLBBATCH_H */
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -106,6 +106,7 @@ struct tlb_state {
 	 * need to be invalidated.
 	 */
 	bool invalidate_other;
+	bool need_tlbsync;
 
 #ifdef CONFIG_ADDRESS_MASKING
 	/*
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -266,6 +266,37 @@ static void choose_new_asid(struct mm_st
 	*need_flush = true;
 }
 
+static inline void tlbsync(void)
+{
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		return;
+	__tlbsync();
+	this_cpu_write(cpu_tlbstate.need_tlbsync, false);
+}
+
+static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
+						unsigned long addr,
+						u16 nr, bool pmd_stride)
+{
+	__invlpgb_flush_user_nr(pcid, addr, nr, pmd_stride);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+{
+	__invlpgb_flush_single_pcid(pcid);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
+static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+{
+	__invlpgb_flush_addr(addr, nr);
+	if (!this_cpu_read(cpu_tlbstate.need_tlbsync))
+		this_cpu_write(cpu_tlbstate.need_tlbsync, true);
+}
+
 #ifdef CONFIG_X86_BROADCAST_TLB_FLUSH
 /*
  * Logic for broadcast TLB invalidation.
@@ -793,6 +824,8 @@ void switch_mm_irqs_off(struct mm_struct
 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
 		WARN_ON_ONCE(!irqs_disabled());
 
+	tlbsync();
+
 	/*
 	 * Verify that CR3 is what we think it is. This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
@@ -968,6 +1001,8 @@ void switch_mm_irqs_off(struct mm_struct
  */
 void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
+	tlbsync();
+
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
@@ -1623,11 +1658,8 @@ void arch_tlbbatch_flush(struct arch_tlb
 	 * The cpumask above contains only CPUs that were running tasks
 	 * not using broadcast TLB flushing.
 	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB))
 		tlbsync();
-		migrate_enable();
-		batch->used_invlpgb = false;
-	}
 
 	cpumask_clear(&batch->cpumask);
 
@@ -1647,10 +1679,6 @@ void arch_tlbbatch_add_pending(struct ar
 	 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
 	 * on the same CPU.
 	 */
-	if (!batch->used_invlpgb) {
-		batch->used_invlpgb = true;
-		migrate_disable();
-	}
 	invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
 	/* Do any CPUs supporting INVLPGB need PTI? */
 	if (static_cpu_has(X86_FEATURE_PTI))
--- a/arch/x86/include/asm/invlpgb.h
+++ b/arch/x86/include/asm/invlpgb.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_INVLPGB
 
 #include <linux/kernel.h>
+#include <asm/page_types.h>
 #include <vdso/bits.h>
 #include <vdso/page.h>
 
@@ -31,9 +32,8 @@ static inline void __invlpgb(unsigned lo
 }
 
 /* Wait for INVLPGB originated by this CPU to complete. */
-static inline void tlbsync(void)
+static inline void __tlbsync(void)
 {
-	cant_migrate();
 	/* TLBSYNC: supported in binutils >= 0.36. */
 	asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory");
 }
@@ -61,19 +61,19 @@ static inline void invlpgb_flush_user(un
 					unsigned long addr)
 {
 	__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
-	tlbsync();
+	__tlbsync();
 }
 
-static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
-						unsigned long addr,
-						u16 nr,
-						bool pmd_stride)
+static inline void __invlpgb_flush_user_nr(unsigned long pcid,
+					   unsigned long addr,
+					   u16 nr,
+					   bool pmd_stride)
 {
 	__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
 }
 
 /* Flush all mappings for a given PCID, not including globals. */
-static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
+static inline void __invlpgb_flush_single_pcid(unsigned long pcid)
 {
 	__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
 }
@@ -82,11 +82,11 @@ static inline void invlpgb_flush_single_
 static inline void invlpgb_flush_all(void)
 {
 	__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
-	tlbsync();
+	__tlbsync();
 }
 
 /* Flush addr, including globals, for all PCIDs. */
-static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr)
+static inline void __invlpgb_flush_addr(unsigned long addr, u16 nr)
 {
 	__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
 }
@@ -95,7 +95,7 @@ static inline void invlpgb_flush_addr_no
 static inline void invlpgb_flush_all_nonglobals(void)
 {
 	__invlpgb(0, 0, 0, 0, 0, 0);
-	tlbsync();
+	__tlbsync();
 }
 
 #endif /* _ASM_X86_INVLPGB */
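[To make the bookkeeping in the proposal above concrete, here is a minimal user-space sketch in plain C -- not kernel code; the variable names, helper names and printed output are invented for the model. It shows the idea of the per-CPU need_tlbsync flag: the *_nosync helpers only mark that a sync is owed, and tlbsync() issues the (simulated) barrier at most once, from whichever of arch_tlbbatch_flush(), switch_mm_irqs_off() or enter_lazy_tlb() runs first on that CPU.]

#include <stdbool.h>
#include <stdio.h>

/* Stands in for this CPU's tlb_state.need_tlbsync flag. */
static bool need_tlbsync;

/* Stands in for the raw TLBSYNC instruction (__tlbsync() in the diff). */
static void raw_tlbsync(void)
{
	puts("TLBSYNC");
}

/* Only wait if this CPU actually issued an asynchronous INVLPGB. */
static void tlbsync(void)
{
	if (!need_tlbsync)
		return;
	raw_tlbsync();
	need_tlbsync = false;
}

/* Queue an asynchronous invalidation and remember that a sync is owed. */
static void invlpgb_nosync(unsigned long pcid, unsigned long addr)
{
	printf("INVLPGB pcid=%lu addr=%#lx (asynchronous)\n", pcid, addr);
	need_tlbsync = true;
}

int main(void)
{
	/* arch_tlbbatch_add_pending(): fire-and-forget flushes. */
	invlpgb_nosync(6, 0x1000);
	invlpgb_nosync(6, 0x2000);

	/* arch_tlbbatch_flush() or a context switch: pay for one TLBSYNC. */
	tlbsync();

	/* Nothing pending anymore, so this is a no-op. */
	tlbsync();
	return 0;
}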
On Wed, 2025-02-05 at 14:51 +0100, Peter Zijlstra wrote:
> 
> How about we do something like this instead?
> 
> This keeps all the TLBSYNC in the same task as the INVLPGB, without
> making things complicated and allowing random CR3 writes in between
> them -- which makes my head hurt.

I like your idea better!

Guess I should go make and test a v9 with the WARN_ONCE in patch 3,
and these changes :)
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
index 1ad56eb3e8a8..f9a17edf63ad 100644
--- a/arch/x86/include/asm/tlbbatch.h
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -10,6 +10,7 @@ struct arch_tlbflush_unmap_batch {
 	 * the PFNs being flushed..
 	 */
 	struct cpumask cpumask;
+	bool used_invlpgb;
 };
 
 #endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7e2f3f7f6455..f8aaa4bcb4d8 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -359,21 +359,15 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 	return atomic64_inc_return(&mm->context.tlb_gen);
 }
 
-static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
-					     struct mm_struct *mm,
-					     unsigned long uaddr)
-{
-	inc_mm_tlb_gen(mm);
-	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
-	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
-}
-
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
 {
 	flush_tlb_mm(mm);
 }
 
 extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+extern void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+				      struct mm_struct *mm,
+				      unsigned long uaddr);
 
 static inline bool pte_flags_need_flush(unsigned long oldflags,
 					unsigned long newflags,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 7b363ae1569b..c064e27df1f3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1646,9 +1646,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 	 * a local TLB flush is needed. Optimize this use-case by calling
 	 * flush_tlb_func_local() directly in this case.
 	 */
-	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
-		invlpgb_flush_all_nonglobals();
-	} else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
 		flush_tlb_multi(&batch->cpumask, info);
 	} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
 		lockdep_assert_irqs_enabled();
@@ -1657,12 +1655,65 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 		local_irq_enable();
 	}
 
+	/*
+	 * If we issued (asynchronous) INVLPGB flushes, wait for them here.
+	 * The cpumask above contains only CPUs that were running tasks
+	 * not using broadcast TLB flushing.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->used_invlpgb) {
+		tlbsync();
+		migrate_enable();
+		batch->used_invlpgb = false;
+	}
+
 	cpumask_clear(&batch->cpumask);
 
 	put_flush_tlb_info();
 	put_cpu();
 }
 
+void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
+			       struct mm_struct *mm,
+			       unsigned long uaddr)
+{
+	u16 asid = mm_global_asid(mm);
+
+	if (asid) {
+		/*
+		 * Queue up an asynchronous invalidation. The corresponding
+		 * TLBSYNC is done in arch_tlbbatch_flush(), and must be done
+		 * on the same CPU.
+		 */
+		if (!batch->used_invlpgb) {
+			batch->used_invlpgb = true;
+			migrate_disable();
+		}
+		invlpgb_flush_user_nr_nosync(kern_pcid(asid), uaddr, 1, false);
+		/* Do any CPUs supporting INVLPGB need PTI? */
+		if (static_cpu_has(X86_FEATURE_PTI))
+			invlpgb_flush_user_nr_nosync(user_pcid(asid), uaddr, 1, false);
+
+		/*
+		 * Some CPUs might still be using a local ASID for this
+		 * process, and require IPIs, while others are using the
+		 * global ASID.
+		 *
+		 * In this corner case we need to do both the broadcast
+		 * TLB invalidation, and send IPIs. The IPIs will help
+		 * stragglers transition to the broadcast ASID.
+		 */
+		if (READ_ONCE(mm->context.asid_transition))
+			asid = 0;
+	}
+
+	if (!asid) {
+		inc_mm_tlb_gen(mm);
+		cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	}
+
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
+}
+
 /*
  * Blindly accessing user memory from NMI context can be dangerous
  * if we're in the middle of switching the current user task or
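[As a rough illustration of the decision arch_tlbbatch_add_pending() makes in the hunk above, here is a user-space sketch in plain C -- toy types and printf output, not kernel code, with invented names. A process that already owns a global ASID gets an asynchronous broadcast flush; a process still in asid_transition additionally gets queued for IPIs so stragglers catch up; a process without a global ASID falls back to the cpumask/IPI path only.]

#include <stdbool.h>
#include <stdio.h>

struct toy_mm {
	unsigned short global_asid;	/* 0 means no global ASID assigned */
	bool asid_transition;		/* some CPUs still on local ASIDs */
};

static void add_pending(struct toy_mm *mm, unsigned long uaddr)
{
	unsigned short asid = mm->global_asid;

	if (asid) {
		/* Broadcast path: asynchronous INVLPGB, TLBSYNC comes later. */
		printf("INVLPGB asid=%u addr=%#lx (async)\n", asid, uaddr);

		/* Stragglers still using a local ASID also need an IPI. */
		if (mm->asid_transition)
			asid = 0;
	}

	if (!asid)
		printf("add mm's CPUs to batch->cpumask for IPI flush\n");
}

int main(void)
{
	struct toy_mm settled = { .global_asid = 7, .asid_transition = false };
	struct toy_mm moving  = { .global_asid = 7, .asid_transition = true  };
	struct toy_mm local   = { .global_asid = 0, .asid_transition = false };

	add_pending(&settled, 0x1000);	/* broadcast flush only */
	add_pending(&moving,  0x2000);	/* broadcast flush plus IPIs */
	add_pending(&local,   0x3000);	/* IPI-based flush only */
	return 0;
}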