Message ID | 20250112155453.1104139-6-riel@surriel.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | AMD broadcast TLB invalidation | expand |
On 1/12/25 09:53, Rik van Riel wrote: > Add invlpgb.h with the helper functions and definitions needed to use > broadcast TLB invalidation on AMD EPYC 3 and newer CPUs. > > Signed-off-by: Rik van Riel <riel@surriel.com> > --- > arch/x86/include/asm/invlpgb.h | 95 +++++++++++++++++++++++++++++++++ > arch/x86/include/asm/tlbflush.h | 1 + > 2 files changed, 96 insertions(+) > create mode 100644 arch/x86/include/asm/invlpgb.h > > diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h > new file mode 100644 > index 000000000000..d62e3733a1ab > --- /dev/null > +++ b/arch/x86/include/asm/invlpgb.h > @@ -0,0 +1,95 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_X86_INVLPGB > +#define _ASM_X86_INVLPGB > + > +#include <vdso/bits.h> > + > +/* > + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. > + * > + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can > + * be done in a parallel fashion. > + * > + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from > + * this CPU have completed. > + */ > +static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, > + int extra_count, bool pmd_stride, unsigned long flags) > +{ > + u32 edx = (pcid << 16) | asid; > + u32 ecx = (pmd_stride << 31); > + u64 rax = addr | flags; > + > + /* Protect against negative numbers. */ > + extra_count = max(extra_count, 0); > + ecx |= extra_count; A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP, is that ok? Thanks, Tom > + > + asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx)); > +} > + > +/* Wait for INVLPGB originated by this CPU to complete. */ > +static inline void tlbsync(void) > +{ > + asm volatile("tlbsync"); > +} > + > +/* > + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination > + * of the three. For example: > + * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address > + * - INVLPGB_PCID: invalidate all TLB entries matching the PCID > + * > + * The first can be used to invalidate (kernel) mappings at a particular > + * address across all processes. > + * > + * The latter invalidates all TLB entries matching a PCID. > + */ > +#define INVLPGB_VA BIT(0) > +#define INVLPGB_PCID BIT(1) > +#define INVLPGB_ASID BIT(2) > +#define INVLPGB_INCLUDE_GLOBAL BIT(3) > +#define INVLPGB_FINAL_ONLY BIT(4) > +#define INVLPGB_INCLUDE_NESTED BIT(5) > + > +/* Flush all mappings for a given pcid and addr, not including globals. */ > +static inline void invlpgb_flush_user(unsigned long pcid, > + unsigned long addr) > +{ > + __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); > + tlbsync(); > +} > + > +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, > + unsigned long addr, > + int nr, bool pmd_stride) > +{ > + __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); > +} > + > +/* Flush all mappings for a given PCID, not including globals. */ > +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) > +{ > + __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); > +} > + > +/* Flush all mappings, including globals, for all PCIDs. */ > +static inline void invlpgb_flush_all(void) > +{ > + __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); > + tlbsync(); > +} > + > +/* Flush addr, including globals, for all PCIDs. */ > +static inline void invlpgb_flush_addr_nosync(unsigned long addr, int nr) > +{ > + __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); > +} > + > +/* Flush all mappings for all PCIDs except globals. */ > +static inline void invlpgb_flush_all_nonglobals(void) > +{ > + __invlpgb(0, 0, 0, 0, 0, 0); > + tlbsync(); > +} > + > +#endif /* _ASM_X86_INVLPGB */ > diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h > index 8fe3b2dda507..dba5caa4a9f4 100644 > --- a/arch/x86/include/asm/tlbflush.h > +++ b/arch/x86/include/asm/tlbflush.h > @@ -10,6 +10,7 @@ > #include <asm/cpufeature.h> > #include <asm/special_insns.h> > #include <asm/smp.h> > +#include <asm/invlpgb.h> > #include <asm/invpcid.h> > #include <asm/pti.h> > #include <asm/processor-flags.h>
On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com> wrote: > +/* Wait for INVLPGB originated by this CPU to complete. */ > +static inline void tlbsync(void) > +{ > + asm volatile("tlbsync"); > +} If possible, it might be a good idea to add a cant_migrate() assertion in here, though I'm not sure if that works in terms of include hierarchy.
On Mon, 2025-01-13 at 08:21 -0600, Tom Lendacky wrote: > On 1/12/25 09:53, Rik van Riel wrote: > > > > +static inline void __invlpgb(unsigned long asid, unsigned long > > pcid, unsigned long addr, > > + int extra_count, bool pmd_stride, > > unsigned long flags) > > +{ > > + u32 edx = (pcid << 16) | asid; > > + u32 ecx = (pmd_stride << 31); > > + u64 rax = addr | flags; > > + > > + /* Protect against negative numbers. */ > > + extra_count = max(extra_count, 0); > > + ecx |= extra_count; > > A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP, > is > that ok? The calling code ensures we do not call this code with more than invlpgb_count_max pages at a time. Given the choice between "a bug in the calling code crashes the kernel" and "a bug in the calling code results in a missed TLB flush", I'm guessing the crash is probably better.
On Mon, 2025-01-13 at 18:24 +0100, Jann Horn wrote: > On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com> > wrote: > > +/* Wait for INVLPGB originated by this CPU to complete. */ > > +static inline void tlbsync(void) > > +{ > > + asm volatile("tlbsync"); > > +} > > If possible, it might be a good idea to add a cant_migrate() > assertion > in here, though I'm not sure if that works in terms of include > hierarchy. > Nice idea. It just builds without any compiler issues, so I've included that for the next version.
On 1/13/25 15:10, Rik van Riel wrote: > On Mon, 2025-01-13 at 08:21 -0600, Tom Lendacky wrote: >> On 1/12/25 09:53, Rik van Riel wrote: >>> >>> +static inline void __invlpgb(unsigned long asid, unsigned long >>> pcid, unsigned long addr, >>> + int extra_count, bool pmd_stride, >>> unsigned long flags) >>> +{ >>> + u32 edx = (pcid << 16) | asid; >>> + u32 ecx = (pmd_stride << 31); >>> + u64 rax = addr | flags; >>> + >>> + /* Protect against negative numbers. */ >>> + extra_count = max(extra_count, 0); >>> + ecx |= extra_count; >> >> A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP, >> is >> that ok? > > The calling code ensures we do not call this code > with more than invlpgb_count_max pages at a time. > > Given the choice between "a bug in the calling code > crashes the kernel" and "a bug in the calling code > results in a missed TLB flush", I'm guessing the > crash is probably better. So instead of the negative number protection, shouldn't this just use an unsigned int for extra_count and panic() if the value is greater than invlpgb_count_max? The caller has some sort of logic problem and it could possibly result in missed TLB flushes. Or if a panic() is out of the question, maybe a WARN() and a full TLB flush to be safe? Thanks, Tom >
On 1/14/25 06:29, Tom Lendacky wrote: >> Given the choice between "a bug in the calling code >> crashes the kernel" and "a bug in the calling code >> results in a missed TLB flush", I'm guessing the >> crash is probably better. > So instead of the negative number protection, shouldn't this just use an > unsigned int for extra_count and panic() if the value is greater than > invlpgb_count_max? The caller has some sort of logic problem and it > could possibly result in missed TLB flushes. Or if a panic() is out of > the question, maybe a WARN() and a full TLB flush to be safe? The current implementation will panic in the #GP handler though. It should be pretty easy to figure out that INVLPGB is involved with RIP or the Code: snippet. From there, you'd need to figure out what caused the #GP. I guess the one nasty thing is that a person debugging this might not have a CPUID dump handy so wouldn't actually know the number of valid addresses that INVLPGB can take. But otherwise, I'm not sure an explicit panic adds _much_ value here over an implicit one via the #GP handler. I don't know how everybody else feels about it, but I'm happy just depending on the #GP for now.
On 1/14/25 09:05, Dave Hansen wrote: > On 1/14/25 06:29, Tom Lendacky wrote: >>> Given the choice between "a bug in the calling code >>> crashes the kernel" and "a bug in the calling code >>> results in a missed TLB flush", I'm guessing the >>> crash is probably better. >> So instead of the negative number protection, shouldn't this just use an >> unsigned int for extra_count and panic() if the value is greater than >> invlpgb_count_max? The caller has some sort of logic problem and it >> could possibly result in missed TLB flushes. Or if a panic() is out of >> the question, maybe a WARN() and a full TLB flush to be safe? > > The current implementation will panic in the #GP handler though. It > should be pretty easy to figure out that INVLPGB is involved with RIP or > the Code: snippet. From there, you'd need to figure out what caused the #GP. Hmmm, maybe I'm missing something. IIUC, when a negative number is supplied, the extra_count field will be set to 0 (via the max() function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0] and so the instruction won't #GP. Thanks, Tom > > I guess the one nasty thing is that a person debugging this might not > have a CPUID dump handy so wouldn't actually know the number of valid > addresses that INVLPGB can take. > > But otherwise, I'm not sure an explicit panic adds _much_ value here > over an implicit one via the #GP handler. I don't know how everybody > else feels about it, but I'm happy just depending on the #GP for now.
On Tue, 2025-01-14 at 09:23 -0600, Tom Lendacky wrote: > On 1/14/25 09:05, Dave Hansen wrote: > > On 1/14/25 06:29, Tom Lendacky wrote: > > > > Given the choice between "a bug in the calling code > > > > crashes the kernel" and "a bug in the calling code > > > > results in a missed TLB flush", I'm guessing the > > > > crash is probably better. > > > So instead of the negative number protection, shouldn't this just > > > use an > > > unsigned int for extra_count and panic() if the value is greater > > > than > > > invlpgb_count_max? The caller has some sort of logic problem and > > > it > > > could possibly result in missed TLB flushes. Or if a panic() is > > > out of > > > the question, maybe a WARN() and a full TLB flush to be safe? > > > > The current implementation will panic in the #GP handler though. It > > should be pretty easy to figure out that INVLPGB is involved with > > RIP or > > the Code: snippet. From there, you'd need to figure out what caused > > the #GP. > > Hmmm, maybe I'm missing something. IIUC, when a negative number is > supplied, the extra_count field will be set to 0 (via the max() > function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0] > and > so the instruction won't #GP. I added that at the request of somebody else :) Let me remove it again, now that we seem to have a consensus that a panic is preferable to a wrong TLB flush.
On 1/14/25 09:47, Rik van Riel wrote: > On Tue, 2025-01-14 at 09:23 -0600, Tom Lendacky wrote: >> On 1/14/25 09:05, Dave Hansen wrote: >>> On 1/14/25 06:29, Tom Lendacky wrote: >>>>> Given the choice between "a bug in the calling code >>>>> crashes the kernel" and "a bug in the calling code >>>>> results in a missed TLB flush", I'm guessing the >>>>> crash is probably better. >>>> So instead of the negative number protection, shouldn't this just >>>> use an >>>> unsigned int for extra_count and panic() if the value is greater >>>> than >>>> invlpgb_count_max? The caller has some sort of logic problem and >>>> it >>>> could possibly result in missed TLB flushes. Or if a panic() is >>>> out of >>>> the question, maybe a WARN() and a full TLB flush to be safe? >>> >>> The current implementation will panic in the #GP handler though. It >>> should be pretty easy to figure out that INVLPGB is involved with >>> RIP or >>> the Code: snippet. From there, you'd need to figure out what caused >>> the #GP. >> >> Hmmm, maybe I'm missing something. IIUC, when a negative number is >> supplied, the extra_count field will be set to 0 (via the max() >> function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0] >> and >> so the instruction won't #GP. > > I added that at the request of somebody else :) > > Let me remove it again, now that we seem to have a > consensus that a panic is preferable to a wrong > TLB flush. I believe the instruction will #GP if any of the ECX[30:16] reserved bits are non-zero (although the APM doesn't document that), in addition to ECX[15:0] being greater than allowed. But what if 0x80000000 is passed in. That would set ECX[31] with a zero count field, which is valid for the instruction, but the input is obviously bogus. I think the safest thing to do is make the extra_count parameter an unsigned int and check if it is greater than invlpgb_count_max. Not sure what to actually do at that point, though... panic()? WARN() with full TLB flush? Thanks, Tom >
On 1/14/25 08:30, Tom Lendacky wrote: >> Let me remove it again, now that we seem to have a >> consensus that a panic is preferable to a wrong >> TLB flush. > I believe the instruction will #GP if any of the ECX[30:16] reserved > bits are non-zero (although the APM doesn't document that), in addition > to ECX[15:0] being greater than allowed. But what if 0x80000000 is > passed in. That would set ECX[31] with a zero count field, which is > valid for the instruction, but the input is obviously bogus. > > I think the safest thing to do is make the extra_count parameter an > unsigned int and check if it is greater than invlpgb_count_max. Not sure > what to actually do at that point, though... panic()? WARN() with full > TLB flush? How about we give 'extra_count' a type that can't bleed over into the stride bits: static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 extra_count, bool pmd_stride, u8 flags) plus a check on CPUID 8000_0008h EDX[15:0] to make sure it's not 0xffff.
From: riel@surriel.com <riel@surriel.com> Sent: Sunday, January 12, 2025 7:54 AM > > Add invlpgb.h with the helper functions and definitions needed to use > broadcast TLB invalidation on AMD EPYC 3 and newer CPUs. > > Signed-off-by: Rik van Riel <riel@surriel.com> > --- > arch/x86/include/asm/invlpgb.h | 95 +++++++++++++++++++++++++++++++++ > arch/x86/include/asm/tlbflush.h | 1 + > 2 files changed, 96 insertions(+) > create mode 100644 arch/x86/include/asm/invlpgb.h > > diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h > new file mode 100644 > index 000000000000..d62e3733a1ab > --- /dev/null > +++ b/arch/x86/include/asm/invlpgb.h > @@ -0,0 +1,95 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _ASM_X86_INVLPGB > +#define _ASM_X86_INVLPGB > + > +#include <vdso/bits.h> > + > +/* > + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. > + * > + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can > + * be done in a parallel fashion. > + * > + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from > + * this CPU have completed. > + */ > +static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, > + int extra_count, bool pmd_stride, unsigned long flags) > +{ > + u32 edx = (pcid << 16) | asid; > + u32 ecx = (pmd_stride << 31); > + u64 rax = addr | flags; > + > + /* Protect against negative numbers. */ > + extra_count = max(extra_count, 0); > + ecx |= extra_count; > + > + asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx)); The above needs to be: asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx)); plus an explanatory comment. As Boris Petkov previously noted[1], the "invlpgb" instruction name requires binutils version 2.36. But the current Linux kernel minimum binutils version is 2.25 (in scripts/min-tool-version.sh). For example, I'm using binutils 2.34, and your asm statement doesn't compile. > +} > + > +/* Wait for INVLPGB originated by this CPU to complete. */ > +static inline void tlbsync(void) > +{ > + asm volatile("tlbsync"); Same as above for "tlbsync". Michael [1] https://lore.kernel.org/lkml/20250102124247.GPZ3aJx8JTJa6PcaOW@fat_crate.local/
diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h new file mode 100644 index 000000000000..d62e3733a1ab --- /dev/null +++ b/arch/x86/include/asm/invlpgb.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVLPGB +#define _ASM_X86_INVLPGB + +#include <vdso/bits.h> + +/* + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. + * + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can + * be done in a parallel fashion. + * + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from + * this CPU have completed. + */ +static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, + int extra_count, bool pmd_stride, unsigned long flags) +{ + u32 edx = (pcid << 16) | asid; + u32 ecx = (pmd_stride << 31); + u64 rax = addr | flags; + + /* Protect against negative numbers. */ + extra_count = max(extra_count, 0); + ecx |= extra_count; + + asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx)); +} + +/* Wait for INVLPGB originated by this CPU to complete. */ +static inline void tlbsync(void) +{ + asm volatile("tlbsync"); +} + +/* + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination + * of the three. For example: + * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address + * - INVLPGB_PCID: invalidate all TLB entries matching the PCID + * + * The first can be used to invalidate (kernel) mappings at a particular + * address across all processes. + * + * The latter invalidates all TLB entries matching a PCID. + */ +#define INVLPGB_VA BIT(0) +#define INVLPGB_PCID BIT(1) +#define INVLPGB_ASID BIT(2) +#define INVLPGB_INCLUDE_GLOBAL BIT(3) +#define INVLPGB_FINAL_ONLY BIT(4) +#define INVLPGB_INCLUDE_NESTED BIT(5) + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invlpgb_flush_user(unsigned long pcid, + unsigned long addr) +{ + __invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA); + tlbsync(); +} + +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, + unsigned long addr, + int nr, bool pmd_stride) +{ + __invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) +{ + __invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invlpgb_flush_all(void) +{ + __invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL); + tlbsync(); +} + +/* Flush addr, including globals, for all PCIDs. */ +static inline void invlpgb_flush_addr_nosync(unsigned long addr, int nr) +{ + __invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invlpgb_flush_all_nonglobals(void) +{ + __invlpgb(0, 0, 0, 0, 0, 0); + tlbsync(); +} + +#endif /* _ASM_X86_INVLPGB */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 8fe3b2dda507..dba5caa4a9f4 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,6 +10,7 @@ #include <asm/cpufeature.h> #include <asm/special_insns.h> #include <asm/smp.h> +#include <asm/invlpgb.h> #include <asm/invpcid.h> #include <asm/pti.h> #include <asm/processor-flags.h>
Add invlpgb.h with the helper functions and definitions needed to use broadcast TLB invalidation on AMD EPYC 3 and newer CPUs. Signed-off-by: Rik van Riel <riel@surriel.com> --- arch/x86/include/asm/invlpgb.h | 95 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/tlbflush.h | 1 + 2 files changed, 96 insertions(+) create mode 100644 arch/x86/include/asm/invlpgb.h