[v4,05/12] x86/mm: add INVLPGB support code

Message ID	20250112155453.1104139-6-riel@surriel.com (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> From: Rik van Riel <riel@surriel.com> To: x86@kernel.org Cc: linux-kernel@vger.kernel.org, bp@alien8.de, peterz@infradead.org, dave.hansen@linux.intel.com, zhengqi.arch@bytedance.com, nadav.amit@gmail.com, thomas.lendacky@amd.com, kernel-team@meta.com, linux-mm@kvack.org, akpm@linux-foundation.org, jannh@google.com, Rik van Riel <riel@surriel.com> Subject: [PATCH v4 05/12] x86/mm: add INVLPGB support code Date: Sun, 12 Jan 2025 10:53:49 -0500 Message-ID: <20250112155453.1104139-6-riel@surriel.com> In-Reply-To: <20250112155453.1104139-1-riel@surriel.com> References: <20250112155453.1104139-1-riel@surriel.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	AMD broadcast TLB invalidation \| expand [RFC,v4,00/10] AMD broadcast TLB invalidation [v4,01/12] x86/mm: make MMU_GATHER_RCU_TABLE_FREE unconditional [v4,02/12] x86/mm: remove pv_ops.mmu.tlb_remove_table call [v4,03/12] x86/mm: consolidate full flush threshold decision [v4,04/12] x86/mm: get INVLPGB count max from CPUID [v4,05/12] x86/mm: add INVLPGB support code [v4,06/12] x86/mm: use INVLPGB for kernel TLB flushes [v4,07/12] x86/tlb: use INVLPGB in flush_tlb_all [v4,08/12] x86/mm: use broadcast TLB flushing for page reclaim TLB flushing [v4,09/12] x86/mm: enable broadcast TLB invalidation for multi-threaded processes [v4,10/12] x86,tlb: do targeted broadcast flushing from tlbbatch code [v4,11/12] x86/mm: enable AMD translation cache extensions [v4,12/12] x86/mm: only invalidate final translations with INVLPGB

Rik van Riel Jan. 12, 2025, 3:53 p.m. UTC

Add invlpgb.h with the helper functions and definitions needed to use
broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.

Signed-off-by: Rik van Riel <riel@surriel.com>
---
 arch/x86/include/asm/invlpgb.h  | 95 +++++++++++++++++++++++++++++++++
 arch/x86/include/asm/tlbflush.h |  1 +
 2 files changed, 96 insertions(+)
 create mode 100644 arch/x86/include/asm/invlpgb.h

Tom Lendacky Jan. 13, 2025, 2:21 p.m. UTC | #1

On 1/12/25 09:53, Rik van Riel wrote:
> Add invlpgb.h with the helper functions and definitions needed to use
> broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>  arch/x86/include/asm/invlpgb.h  | 95 +++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/tlbflush.h |  1 +
>  2 files changed, 96 insertions(+)
>  create mode 100644 arch/x86/include/asm/invlpgb.h
> 
> diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h
> new file mode 100644
> index 000000000000..d62e3733a1ab
> --- /dev/null
> +++ b/arch/x86/include/asm/invlpgb.h
> @@ -0,0 +1,95 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_INVLPGB
> +#define _ASM_X86_INVLPGB
> +
> +#include <vdso/bits.h>
> +
> +/*
> + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
> + *
> + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
> + * be done in a parallel fashion.
> + *
> + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
> + * this CPU have completed.
> + */
> +static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr,
> +			    int extra_count, bool pmd_stride, unsigned long flags)
> +{
> +	u32 edx = (pcid << 16) | asid;
> +	u32 ecx = (pmd_stride << 31);
> +	u64 rax = addr | flags;
> +
> +	/* Protect against negative numbers. */
> +	extra_count = max(extra_count, 0);
> +	ecx |= extra_count;

A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP, is
that ok?

Thanks,
Tom

> +
> +	asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx));
> +}
> +
> +/* Wait for INVLPGB originated by this CPU to complete. */
> +static inline void tlbsync(void)
> +{
> +	asm volatile("tlbsync");
> +}
> +
> +/*
> + * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination
> + * of the three. For example:
> + * - INVLPGB_VA | INVLPGB_INCLUDE_GLOBAL: invalidate all TLB entries at the address
> + * - INVLPGB_PCID:              	  invalidate all TLB entries matching the PCID
> + *
> + * The first can be used to invalidate (kernel) mappings at a particular
> + * address across all processes.
> + *
> + * The latter invalidates all TLB entries matching a PCID.
> + */
> +#define INVLPGB_VA			BIT(0)
> +#define INVLPGB_PCID			BIT(1)
> +#define INVLPGB_ASID			BIT(2)
> +#define INVLPGB_INCLUDE_GLOBAL		BIT(3)
> +#define INVLPGB_FINAL_ONLY		BIT(4)
> +#define INVLPGB_INCLUDE_NESTED		BIT(5)
> +
> +/* Flush all mappings for a given pcid and addr, not including globals. */
> +static inline void invlpgb_flush_user(unsigned long pcid,
> +				      unsigned long addr)
> +{
> +	__invlpgb(0, pcid, addr, 0, 0, INVLPGB_PCID | INVLPGB_VA);
> +	tlbsync();
> +}
> +
> +static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid,
> +						unsigned long addr,
> +						int nr, bool pmd_stride)
> +{
> +	__invlpgb(0, pcid, addr, nr - 1, pmd_stride, INVLPGB_PCID | INVLPGB_VA);
> +}
> +
> +/* Flush all mappings for a given PCID, not including globals. */
> +static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid)
> +{
> +	__invlpgb(0, pcid, 0, 0, 0, INVLPGB_PCID);
> +}
> +
> +/* Flush all mappings, including globals, for all PCIDs. */
> +static inline void invlpgb_flush_all(void)
> +{
> +	__invlpgb(0, 0, 0, 0, 0, INVLPGB_INCLUDE_GLOBAL);
> +	tlbsync();
> +}
> +
> +/* Flush addr, including globals, for all PCIDs. */
> +static inline void invlpgb_flush_addr_nosync(unsigned long addr, int nr)
> +{
> +	__invlpgb(0, 0, addr, nr - 1, 0, INVLPGB_INCLUDE_GLOBAL);
> +}
> +
> +/* Flush all mappings for all PCIDs except globals. */
> +static inline void invlpgb_flush_all_nonglobals(void)
> +{
> +	__invlpgb(0, 0, 0, 0, 0, 0);
> +	tlbsync();
> +}
> +
> +#endif /* _ASM_X86_INVLPGB */
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 8fe3b2dda507..dba5caa4a9f4 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -10,6 +10,7 @@
>  #include <asm/cpufeature.h>
>  #include <asm/special_insns.h>
>  #include <asm/smp.h>
> +#include <asm/invlpgb.h>
>  #include <asm/invpcid.h>
>  #include <asm/pti.h>
>  #include <asm/processor-flags.h>

Jann Horn Jan. 13, 2025, 5:24 p.m. UTC | #2

On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com> wrote:
> +/* Wait for INVLPGB originated by this CPU to complete. */
> +static inline void tlbsync(void)
> +{
> +       asm volatile("tlbsync");
> +}

If possible, it might be a good idea to add a cant_migrate() assertion
in here, though I'm not sure if that works in terms of include
hierarchy.

Rik van Riel Jan. 13, 2025, 9:10 p.m. UTC | #3

On Mon, 2025-01-13 at 08:21 -0600, Tom Lendacky wrote:
> On 1/12/25 09:53, Rik van Riel wrote:
> > 
> > +static inline void __invlpgb(unsigned long asid, unsigned long
> > pcid, unsigned long addr,
> > +			    int extra_count, bool pmd_stride,
> > unsigned long flags)
> > +{
> > +	u32 edx = (pcid << 16) | asid;
> > +	u32 ecx = (pmd_stride << 31);
> > +	u64 rax = addr | flags;
> > +
> > +	/* Protect against negative numbers. */
> > +	extra_count = max(extra_count, 0);
> > +	ecx |= extra_count;
> 
> A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP,
> is
> that ok?

The calling code ensures we do not call this code
with more than invlpgb_count_max pages at a time.

Given the choice between "a bug in the calling code
crashes the kernel" and "a bug in the calling code
results in a missed TLB flush", I'm guessing the
crash is probably better.

Rik van Riel Jan. 14, 2025, 1:33 a.m. UTC | #4

On Mon, 2025-01-13 at 18:24 +0100, Jann Horn wrote:
> On Sun, Jan 12, 2025 at 4:55 PM Rik van Riel <riel@surriel.com>
> wrote:
> > +/* Wait for INVLPGB originated by this CPU to complete. */
> > +static inline void tlbsync(void)
> > +{
> > +       asm volatile("tlbsync");
> > +}
> 
> If possible, it might be a good idea to add a cant_migrate()
> assertion
> in here, though I'm not sure if that works in terms of include
> hierarchy.
> 

Nice idea. It just builds without any compiler
issues, so I've included that for the next version.

Tom Lendacky Jan. 14, 2025, 2:29 p.m. UTC | #5

On 1/13/25 15:10, Rik van Riel wrote:
> On Mon, 2025-01-13 at 08:21 -0600, Tom Lendacky wrote:
>> On 1/12/25 09:53, Rik van Riel wrote:
>>>
>>> +static inline void __invlpgb(unsigned long asid, unsigned long
>>> pcid, unsigned long addr,
>>> +			    int extra_count, bool pmd_stride,
>>> unsigned long flags)
>>> +{
>>> +	u32 edx = (pcid << 16) | asid;
>>> +	u32 ecx = (pmd_stride << 31);
>>> +	u64 rax = addr | flags;
>>> +
>>> +	/* Protect against negative numbers. */
>>> +	extra_count = max(extra_count, 0);
>>> +	ecx |= extra_count;
>>
>> A bad ECX value (ECX[15:0] > invlpgb_count_max) will result in a #GP,
>> is
>> that ok?
> 
> The calling code ensures we do not call this code
> with more than invlpgb_count_max pages at a time.
> 
> Given the choice between "a bug in the calling code
> crashes the kernel" and "a bug in the calling code
> results in a missed TLB flush", I'm guessing the
> crash is probably better.

So instead of the negative number protection, shouldn't this just use an
unsigned int for extra_count and panic() if the value is greater than
invlpgb_count_max? The caller has some sort of logic problem and it
could possibly result in missed TLB flushes. Or if a panic() is out of
the question, maybe a WARN() and a full TLB flush to be safe?

Thanks,
Tom

>

Dave Hansen Jan. 14, 2025, 3:05 p.m. UTC | #6

On 1/14/25 06:29, Tom Lendacky wrote:
>> Given the choice between "a bug in the calling code
>> crashes the kernel" and "a bug in the calling code
>> results in a missed TLB flush", I'm guessing the
>> crash is probably better.
> So instead of the negative number protection, shouldn't this just use an
> unsigned int for extra_count and panic() if the value is greater than
> invlpgb_count_max? The caller has some sort of logic problem and it
> could possibly result in missed TLB flushes. Or if a panic() is out of
> the question, maybe a WARN() and a full TLB flush to be safe?

The current implementation will panic in the #GP handler though. It
should be pretty easy to figure out that INVLPGB is involved with RIP or
the Code: snippet. From there, you'd need to figure out what caused the #GP.

I guess the one nasty thing is that a person debugging this might not
have a CPUID dump handy so wouldn't actually know the number of valid
addresses that INVLPGB can take.

But otherwise, I'm not sure an explicit panic adds _much_ value here
over an implicit one via the #GP handler. I don't know how everybody
else feels about it, but I'm happy just depending on the #GP for now.

Tom Lendacky Jan. 14, 2025, 3:23 p.m. UTC | #7

On 1/14/25 09:05, Dave Hansen wrote:
> On 1/14/25 06:29, Tom Lendacky wrote:
>>> Given the choice between "a bug in the calling code
>>> crashes the kernel" and "a bug in the calling code
>>> results in a missed TLB flush", I'm guessing the
>>> crash is probably better.
>> So instead of the negative number protection, shouldn't this just use an
>> unsigned int for extra_count and panic() if the value is greater than
>> invlpgb_count_max? The caller has some sort of logic problem and it
>> could possibly result in missed TLB flushes. Or if a panic() is out of
>> the question, maybe a WARN() and a full TLB flush to be safe?
> 
> The current implementation will panic in the #GP handler though. It
> should be pretty easy to figure out that INVLPGB is involved with RIP or
> the Code: snippet. From there, you'd need to figure out what caused the #GP.

Hmmm, maybe I'm missing something. IIUC, when a negative number is
supplied, the extra_count field will be set to 0 (via the max()
function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0] and
so the instruction won't #GP.

Thanks,
Tom

> 
> I guess the one nasty thing is that a person debugging this might not
> have a CPUID dump handy so wouldn't actually know the number of valid
> addresses that INVLPGB can take.
> 
> But otherwise, I'm not sure an explicit panic adds _much_ value here
> over an implicit one via the #GP handler. I don't know how everybody
> else feels about it, but I'm happy just depending on the #GP for now.

Rik van Riel Jan. 14, 2025, 3:47 p.m. UTC | #8

On Tue, 2025-01-14 at 09:23 -0600, Tom Lendacky wrote:
> On 1/14/25 09:05, Dave Hansen wrote:
> > On 1/14/25 06:29, Tom Lendacky wrote:
> > > > Given the choice between "a bug in the calling code
> > > > crashes the kernel" and "a bug in the calling code
> > > > results in a missed TLB flush", I'm guessing the
> > > > crash is probably better.
> > > So instead of the negative number protection, shouldn't this just
> > > use an
> > > unsigned int for extra_count and panic() if the value is greater
> > > than
> > > invlpgb_count_max? The caller has some sort of logic problem and
> > > it
> > > could possibly result in missed TLB flushes. Or if a panic() is
> > > out of
> > > the question, maybe a WARN() and a full TLB flush to be safe?
> > 
> > The current implementation will panic in the #GP handler though. It
> > should be pretty easy to figure out that INVLPGB is involved with
> > RIP or
> > the Code: snippet. From there, you'd need to figure out what caused
> > the #GP.
> 
> Hmmm, maybe I'm missing something. IIUC, when a negative number is
> supplied, the extra_count field will be set to 0 (via the max()
> function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0]
> and
> so the instruction won't #GP.

I added that at the request of somebody else :)

Let me remove it again, now that we seem to have a
consensus that a panic is preferable to a wrong
TLB flush.

Tom Lendacky Jan. 14, 2025, 4:30 p.m. UTC | #9

On 1/14/25 09:47, Rik van Riel wrote:
> On Tue, 2025-01-14 at 09:23 -0600, Tom Lendacky wrote:
>> On 1/14/25 09:05, Dave Hansen wrote:
>>> On 1/14/25 06:29, Tom Lendacky wrote:
>>>>> Given the choice between "a bug in the calling code
>>>>> crashes the kernel" and "a bug in the calling code
>>>>> results in a missed TLB flush", I'm guessing the
>>>>> crash is probably better.
>>>> So instead of the negative number protection, shouldn't this just
>>>> use an
>>>> unsigned int for extra_count and panic() if the value is greater
>>>> than
>>>> invlpgb_count_max? The caller has some sort of logic problem and
>>>> it
>>>> could possibly result in missed TLB flushes. Or if a panic() is
>>>> out of
>>>> the question, maybe a WARN() and a full TLB flush to be safe?
>>>
>>> The current implementation will panic in the #GP handler though. It
>>> should be pretty easy to figure out that INVLPGB is involved with
>>> RIP or
>>> the Code: snippet. From there, you'd need to figure out what caused
>>> the #GP.
>>
>> Hmmm, maybe I'm missing something. IIUC, when a negative number is
>> supplied, the extra_count field will be set to 0 (via the max()
>> function) and allow the INVLPGB to continue. 0 is valid in ECX[15:0]
>> and
>> so the instruction won't #GP.
> 
> I added that at the request of somebody else :)
> 
> Let me remove it again, now that we seem to have a
> consensus that a panic is preferable to a wrong
> TLB flush.

I believe the instruction will #GP if any of the ECX[30:16] reserved
bits are non-zero (although the APM doesn't document that), in addition
to ECX[15:0] being greater than allowed. But what if 0x80000000 is
passed in. That would set ECX[31] with a zero count field, which is
valid for the instruction, but the input is obviously bogus.

I think the safest thing to do is make the extra_count parameter an
unsigned int and check if it is greater than invlpgb_count_max. Not sure
what to actually do at that point, though... panic()? WARN() with full
TLB flush?

Thanks,
Tom

>

Dave Hansen Jan. 14, 2025, 4:41 p.m. UTC | #10

On 1/14/25 08:30, Tom Lendacky wrote:
>> Let me remove it again, now that we seem to have a
>> consensus that a panic is preferable to a wrong
>> TLB flush.
> I believe the instruction will #GP if any of the ECX[30:16] reserved
> bits are non-zero (although the APM doesn't document that), in addition
> to ECX[15:0] being greater than allowed. But what if 0x80000000 is
> passed in. That would set ECX[31] with a zero count field, which is
> valid for the instruction, but the input is obviously bogus.
> 
> I think the safest thing to do is make the extra_count parameter an
> unsigned int and check if it is greater than invlpgb_count_max. Not sure
> what to actually do at that point, though... panic()? WARN() with full
> TLB flush?

How about we give 'extra_count' a type that can't bleed over into the
stride bits:

static inline void __invlpgb(unsigned long asid, unsigned long pcid,
			     unsigned long addr, u16 extra_count,
			     bool pmd_stride, u8 flags)

plus a check on CPUID 8000_0008h EDX[15:0] to make sure it's not 0xffff.

Michael Kelley Jan. 14, 2025, 6:24 p.m. UTC | #11

From: riel@surriel.com <riel@surriel.com> Sent: Sunday, January 12, 2025 7:54 AM
> 
> Add invlpgb.h with the helper functions and definitions needed to use
> broadcast TLB invalidation on AMD EPYC 3 and newer CPUs.
> 
> Signed-off-by: Rik van Riel <riel@surriel.com>
> ---
>  arch/x86/include/asm/invlpgb.h  | 95 +++++++++++++++++++++++++++++++++
>  arch/x86/include/asm/tlbflush.h |  1 +
>  2 files changed, 96 insertions(+)
>  create mode 100644 arch/x86/include/asm/invlpgb.h
> 
> diff --git a/arch/x86/include/asm/invlpgb.h b/arch/x86/include/asm/invlpgb.h
> new file mode 100644
> index 000000000000..d62e3733a1ab
> --- /dev/null
> +++ b/arch/x86/include/asm/invlpgb.h
> @@ -0,0 +1,95 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_X86_INVLPGB
> +#define _ASM_X86_INVLPGB
> +
> +#include <vdso/bits.h>
> +
> +/*
> + * INVLPGB does broadcast TLB invalidation across all the CPUs in the system.
> + *
> + * The INVLPGB instruction is weakly ordered, and a batch of invalidations can
> + * be done in a parallel fashion.
> + *
> + * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from
> + * this CPU have completed.
> + */
> +static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr,
> +			    int extra_count, bool pmd_stride, unsigned long flags)
> +{
> +	u32 edx = (pcid << 16) | asid;
> +	u32 ecx = (pmd_stride << 31);
> +	u64 rax = addr | flags;
> +
> +	/* Protect against negative numbers. */
> +	extra_count = max(extra_count, 0);
> +	ecx |= extra_count;
> +
> +	asm volatile("invlpgb" : : "a" (rax), "c" (ecx), "d" (edx));

The above needs to be:

	asm volatile(".byte 0x0f, 0x01, 0xfe" : : "a" (rax), "c" (ecx), "d" (edx));

plus an explanatory comment.

As Boris Petkov previously noted[1], the "invlpgb" instruction name requires
binutils version 2.36. But the current Linux kernel minimum binutils version
is 2.25 (in scripts/min-tool-version.sh). For example, I'm using binutils 2.34,
and your asm statement doesn't compile.

> +}
> +
> +/* Wait for INVLPGB originated by this CPU to complete. */
> +static inline void tlbsync(void)
> +{
> +	asm volatile("tlbsync");

Same as above for "tlbsync".

Michael

[1] https://lore.kernel.org/lkml/20250102124247.GPZ3aJx8JTJa6PcaOW@fat_crate.local/

[v4,05/12] x86/mm: add INVLPGB support code

Commit Message

Comments

Patch