diff mbox series

[v3] kasan: speed up mte_set_mem_tag_range

Message ID 20210517235546.3038875-1-eugenis@google.com (mailing list archive)
State New, archived
Headers show
Series [v3] kasan: speed up mte_set_mem_tag_range | expand

Commit Message

Evgenii Stepanov May 17, 2021, 11:55 p.m. UTC
Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.

The first cacheline is always tagged using STG/STZG even if the address is
cacheline-aligned, as benchmarks show it is faster than a conditional
branch.

Signed-off-by: Evgenii Stepanov <eugenis@google.com>
Co-developed-by: Peter Collingbourne <pcc@google.com>
Signed-off-by: Peter Collingbourne <pcc@google.com>
---
Changelog since v1:
- Added Co-developed-by.

Changelog since v2:
- Added Signed-off-by.

 arch/arm64/include/asm/mte-kasan.h | 40 +------------------
 arch/arm64/lib/Makefile            |  2 +
 arch/arm64/lib/mte-kasan.S         | 63 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 39 deletions(-)
 create mode 100644 arch/arm64/lib/mte-kasan.S

Comments

Catalin Marinas May 18, 2021, 5:44 p.m. UTC | #1
On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
> 
> The first cacheline is always tagged using STG/STZG even if the address is
> cacheline-aligned, as benchmarks show it is faster than a conditional
> branch.
[...]
> diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> index ddd4d17cf9a0..e29a0e2ab35c 100644
> --- a/arch/arm64/include/asm/mte-kasan.h
> +++ b/arch/arm64/include/asm/mte-kasan.h
> @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
>  	return mte_get_ptr_tag(addr);
>  }
>  
> -/*
> - * Assign allocation tags for a region of memory based on the pointer tag.
> - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> - */
> -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> -						u8 tag, bool init)

With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
functions") you wanted this inlined for performance. Does this not
matter much that it's now out of line?

> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index d31e1169d9b8..c06ada79a437 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
>  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>  
>  obj-$(CONFIG_ARM64_MTE) += mte.o
> +
> +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> new file mode 100644
> index 000000000000..9f6975e2af60
> --- /dev/null
> +++ b/arch/arm64/lib/mte-kasan.S
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2021 Google Inc.
> + */
> +#include <linux/const.h>
> +#include <linux/linkage.h>
> +
> +#include <asm/mte-def.h>
> +
> +	.arch	armv8.5-a+memtag
> +
> +	.macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> +	add	\tmp3, \start, \size
> +	cmp	\size, \linesize, lsl #1
> +	b.lt	.Lsmtr3_\@

We could do with some comments here. Why the lsl #1? I think I get it
but it would be good to make this more readable.

It may be easier if you placed it in a file on its own (as it is now but
with a less generic file name) and use a few .req instead of the tmpX.
You can use the macro args only for the stg/gva.

> +
> +	sub	\tmp1, \linesize, #1
> +	bic	\tmp2, \tmp3, \tmp1
> +	orr	\tmp1, \start, \tmp1
> +
> +.Lsmtr1_\@:
> +	\stg	\start, [\start], #MTE_GRANULE_SIZE
> +	cmp	\start, \tmp1
> +	b.lt	.Lsmtr1_\@
> +
> +.Lsmtr2_\@:
> +	dc	\gva, \start
> +	add	\start, \start, \linesize
> +	cmp	\start, \tmp2
> +	b.lt	.Lsmtr2_\@
> +
> +.Lsmtr3_\@:
> +	cmp	\start, \tmp3
> +	b.ge	.Lsmtr4_\@
> +	\stg	\start, [\start], #MTE_GRANULE_SIZE
> +	b	.Lsmtr3_\@
> +.Lsmtr4_\@:
> +	.endm

If we want to get the best performance out of this, we should look at
the memset implementation and do something similar. In principle it's
not that far from a memzero, though depending on the microarchitecture
it may behave slightly differently.

Anyway, before that I wonder if we wrote all this in C + inline asm
(three while loops or maybe two and some goto), what's the performance
difference? It has the advantage of being easier to maintain even if we
used some C macros to generate gva/gzva variants.
Peter Collingbourne May 18, 2021, 6:11 p.m. UTC | #2
On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Mon, May 17, 2021 at 04:55:46PM -0700, Evgenii Stepanov wrote:
> > Use DC GVA / DC GZVA to speed up KASan memory tagging in HW tags mode.
> >
> > The first cacheline is always tagged using STG/STZG even if the address is
> > cacheline-aligned, as benchmarks show it is faster than a conditional
> > branch.
> [...]
> > diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
> > index ddd4d17cf9a0..e29a0e2ab35c 100644
> > --- a/arch/arm64/include/asm/mte-kasan.h
> > +++ b/arch/arm64/include/asm/mte-kasan.h
> > @@ -48,45 +48,7 @@ static inline u8 mte_get_random_tag(void)
> >       return mte_get_ptr_tag(addr);
> >  }
> >
> > -/*
> > - * Assign allocation tags for a region of memory based on the pointer tag.
> > - * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
> > - * size must be non-zero and MTE_GRANULE_SIZE aligned.
> > - */
> > -static inline void mte_set_mem_tag_range(void *addr, size_t size,
> > -                                             u8 tag, bool init)
>
> With commit 2cb34276427a ("arm64: kasan: simplify and inline MTE
> functions") you wanted this inlined for performance. Does this not
> matter much that it's now out of line?
>
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> > index d31e1169d9b8..c06ada79a437 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -18,3 +18,5 @@ obj-$(CONFIG_CRC32) += crc32.o
> >  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> >
> >  obj-$(CONFIG_ARM64_MTE) += mte.o
> > +
> > +obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
> > diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
> > new file mode 100644
> > index 000000000000..9f6975e2af60
> > --- /dev/null
> > +++ b/arch/arm64/lib/mte-kasan.S
> > @@ -0,0 +1,63 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (C) 2021 Google Inc.
> > + */
> > +#include <linux/const.h>
> > +#include <linux/linkage.h>
> > +
> > +#include <asm/mte-def.h>
> > +
> > +     .arch   armv8.5-a+memtag
> > +
> > +     .macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
> > +     add     \tmp3, \start, \size
> > +     cmp     \size, \linesize, lsl #1
> > +     b.lt    .Lsmtr3_\@
>
> We could do with some comments here. Why the lsl #1? I think I get it
> but it would be good to make this more readable.
>
> It may be easier if you placed it in a file on its own (as it is now but
> with a less generic file name) and use a few .req instead of the tmpX.
> You can use the macro args only for the stg/gva.

Yes, I think we could use more comments and .req here, like the
userspace version of this function that we use in Scudo:
https://cs.android.com/android/platform/superproject/+/master:external/scudo/standalone/memtag.h;l=150;drc=34c8857fef28eab205c22cbfb4bfda2f848e5a80

> > +
> > +     sub     \tmp1, \linesize, #1
> > +     bic     \tmp2, \tmp3, \tmp1
> > +     orr     \tmp1, \start, \tmp1
> > +
> > +.Lsmtr1_\@:
> > +     \stg    \start, [\start], #MTE_GRANULE_SIZE
> > +     cmp     \start, \tmp1
> > +     b.lt    .Lsmtr1_\@
> > +
> > +.Lsmtr2_\@:
> > +     dc      \gva, \start
> > +     add     \start, \start, \linesize
> > +     cmp     \start, \tmp2
> > +     b.lt    .Lsmtr2_\@
> > +
> > +.Lsmtr3_\@:
> > +     cmp     \start, \tmp3
> > +     b.ge    .Lsmtr4_\@
> > +     \stg    \start, [\start], #MTE_GRANULE_SIZE
> > +     b       .Lsmtr3_\@
> > +.Lsmtr4_\@:
> > +     .endm
>
> If we want to get the best performance out of this, we should look at
> the memset implementation and do something similar. In principle it's
> not that far from a memzero, though depending on the microarchitecture
> it may behave slightly differently.

For Scudo I compared our storeTags implementation linked above against
__mtag_tag_zero_region from the arm-optimized-routines repository
(which I think is basically an improved version of that memset
implementation rewritten to use STG and DC GZVA), and our
implementation performed better on the hardware that we have access
to.

> Anyway, before that I wonder if we wrote all this in C + inline asm
> (three while loops or maybe two and some goto), what's the performance
> difference? It has the advantage of being easier to maintain even if we
> used some C macros to generate gva/gzva variants.

I'm not sure I agree that it will be easier to maintain. Due to the
number of "unusual" instructions required here it seems more readable
to have the code in pure assembly than to require readers to switch
contexts between C and asm. If we did move it to inline asm then I
think it should basically be a large blob of asm like the Scudo code
that I linked.

Peter
Catalin Marinas May 19, 2021, 6:12 p.m. UTC | #3
On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> <catalin.marinas@arm.com> wrote:
> > If we want to get the best performance out of this, we should look at
> > the memset implementation and do something similar. In principle it's
> > not that far from a memzero, though depending on the microarchitecture
> > it may behave slightly differently.
> 
> For Scudo I compared our storeTags implementation linked above against
> __mtag_tag_zero_region from the arm-optimized-routines repository
> (which I think is basically an improved version of that memset
> implementation rewritten to use STG and DC GZVA), and our
> implementation performed better on the hardware that we have access
> to.

That's the advantage of having hardware early ;).

> > Anyway, before that I wonder if we wrote all this in C + inline asm
> > (three while loops or maybe two and some goto), what's the performance
> > difference? It has the advantage of being easier to maintain even if we
> > used some C macros to generate gva/gzva variants.
> 
> I'm not sure I agree that it will be easier to maintain. Due to the
> number of "unusual" instructions required here it seems more readable
> to have the code in pure assembly than to require readers to switch
> contexts between C and asm. If we did move it to inline asm then I
> think it should basically be a large blob of asm like the Scudo code
> that I linked.

I was definitely not thinking of a big asm block, that's even less
readable than a separate .S file. It's more like adding dedicated macros
for single STG or DC GVA uses and using them in while loops.

Anyway, let's see a better commented .S implementation first. Given that
tagging is very sensitive to the performance of this function, we'd
probably benefit from a (few percent I suspect) perf improvement with
the hand-coded assembly.
Evgenii Stepanov May 19, 2021, 7:52 p.m. UTC | #4
On Wed, May 19, 2021 at 11:13 AM Catalin Marinas
<catalin.marinas@arm.com> wrote:
>
> On Tue, May 18, 2021 at 11:11:52AM -0700, Peter Collingbourne wrote:
> > On Tue, May 18, 2021 at 10:44 AM Catalin Marinas
> > <catalin.marinas@arm.com> wrote:
> > > If we want to get the best performance out of this, we should look at
> > > the memset implementation and do something similar. In principle it's
> > > not that far from a memzero, though depending on the microarchitecture
> > > it may behave slightly differently.
> >
> > For Scudo I compared our storeTags implementation linked above against
> > __mtag_tag_zero_region from the arm-optimized-routines repository
> > (which I think is basically an improved version of that memset
> > implementation rewritten to use STG and DC GZVA), and our
> > implementation performed better on the hardware that we have access
> > to.
>
> That's the advantage of having hardware early ;).
>
> > > Anyway, before that I wonder if we wrote all this in C + inline asm
> > > (three while loops or maybe two and some goto), what's the performance
> > > difference? It has the advantage of being easier to maintain even if we
> > > used some C macros to generate gva/gzva variants.
> >
> > I'm not sure I agree that it will be easier to maintain. Due to the
> > number of "unusual" instructions required here it seems more readable
> > to have the code in pure assembly than to require readers to switch
> > contexts between C and asm. If we did move it to inline asm then I
> > think it should basically be a large blob of asm like the Scudo code
> > that I linked.
>
> I was definitely not thinking of a big asm block, that's even less
> readable than a separate .S file. It's more like adding dedicated macros
> for single STG or DC GVA uses and using them in while loops.

I've got a C version with 4 single-instruction asm blocks, and it
looks pretty nice. The assembly is almost identical to the hand
written variant, and performance is 3% better, presumably because of
the inlining. Also, the C version allows more potential optimizations,
like specialization on the value of "init" - which is not happening
right now because it is not constant in any of the callers.

I'll upload a v4 shortly.

>
> Anyway, let's see a better commented .S implementation first. Given that
> tagging is very sensitive to the performance of this function, we'd
> probably benefit from a (few percent I suspect) perf improvement with
> the hand-coded assembly.
>
> --
> Catalin
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index ddd4d17cf9a0..e29a0e2ab35c 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -48,45 +48,7 @@  static inline u8 mte_get_random_tag(void)
 	return mte_get_ptr_tag(addr);
 }
 
-/*
- * Assign allocation tags for a region of memory based on the pointer tag.
- * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
- * size must be non-zero and MTE_GRANULE_SIZE aligned.
- */
-static inline void mte_set_mem_tag_range(void *addr, size_t size,
-						u8 tag, bool init)
-{
-	u64 curr, end;
-
-	if (!size)
-		return;
-
-	curr = (u64)__tag_set(addr, tag);
-	end = curr + size;
-
-	/*
-	 * 'asm volatile' is required to prevent the compiler to move
-	 * the statement outside of the loop.
-	 */
-	if (init) {
-		do {
-			asm volatile(__MTE_PREAMBLE "stzg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	} else {
-		do {
-			asm volatile(__MTE_PREAMBLE "stg %0, [%0]"
-				     :
-				     : "r" (curr)
-				     : "memory");
-			curr += MTE_GRANULE_SIZE;
-		} while (curr != end);
-	}
-}
-
+void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, bool init);
 void mte_enable_kernel_sync(void);
 void mte_enable_kernel_async(void);
 void mte_init_tags(u64 max_tag);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..c06ada79a437 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -18,3 +18,5 @@  obj-$(CONFIG_CRC32) += crc32.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_ARM64_MTE) += mte.o
+
+obj-$(CONFIG_KASAN_HW_TAGS) += mte-kasan.o
diff --git a/arch/arm64/lib/mte-kasan.S b/arch/arm64/lib/mte-kasan.S
new file mode 100644
index 000000000000..9f6975e2af60
--- /dev/null
+++ b/arch/arm64/lib/mte-kasan.S
@@ -0,0 +1,63 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Google Inc.
+ */
+#include <linux/const.h>
+#include <linux/linkage.h>
+
+#include <asm/mte-def.h>
+
+	.arch	armv8.5-a+memtag
+
+	.macro  __set_mem_tag_range, stg, gva, start, size, linesize, tmp1, tmp2, tmp3
+	add	\tmp3, \start, \size	// tmp3 = start + size, the end of the region to tag
+	cmp	\size, \linesize, lsl #1	// compare size against 2 * linesize
+	b.lt	.Lsmtr3_\@	// under two lines: tag granule by granule only. NOTE(review): signed compare on a size, confirm b.lo is not needed
+
+	sub	\tmp1, \linesize, #1	// tmp1 = linesize - 1, a low-bit mask (linesize is a power of two in the caller below)
+	bic	\tmp2, \tmp3, \tmp1	// tmp2 = end rounded down to a linesize boundary
+	orr	\tmp1, \start, \tmp1	// tmp1 = address of the last byte of the line containing start
+
+.Lsmtr1_\@:	// head loop: runs at least once, so the first line is tagged per granule even when start is line-aligned
+	\stg	\start, [\start], #MTE_GRANULE_SIZE	// tag one granule, then post-increment start by MTE_GRANULE_SIZE
+	cmp	\start, \tmp1
+	b.lt	.Lsmtr1_\@	// repeat until start has moved past the first line; start is then line-aligned
+
+.Lsmtr2_\@:	// bulk loop: one dc operation per whole line; size >= 2 * linesize leaves at least one whole line here
+	dc	\gva, \start	// apply the dc op named by the gva argument to the line at start
+	add	\start, \start, \linesize	// dc does not advance the pointer, so step one line by hand
+	cmp	\start, \tmp2
+	b.lt	.Lsmtr2_\@	// repeat while start is below the rounded-down end
+
+.Lsmtr3_\@:	// tail loop: remaining granules, or the whole region when it was under two lines
+	cmp	\start, \tmp3
+	b.ge	.Lsmtr4_\@	// done once start reaches the end. NOTE(review): signed compare on tagged addresses, confirm this is intended
+	\stg	\start, [\start], #MTE_GRANULE_SIZE	// tag one granule and advance
+	b	.Lsmtr3_\@
+.Lsmtr4_\@:	// exit: start == end for granule-aligned input; tmp1, tmp2 and tmp3 are clobbered
+	.endm
+
+/*
+ * Assign allocation tags for a region of memory based on the pointer tag.
+ * Note: The address must be non-NULL and MTE_GRANULE_SIZE aligned and
+ * size must be non-zero and MTE_GRANULE_SIZE aligned.
+ *   x0 - start address (its bits 63:56 are overwritten with the tag)
+ *   x1 - region size in bytes
+ *   x2 - tag (only the low 8 bits are used)
+ *   x3 - bool init (non-zero selects stzg/gzva, zero selects stg/gva)
+ */
+SYM_FUNC_START(mte_set_mem_tag_range)
+	mrs	x4, dczid_el0	// read DCZID_EL0
+	and	w4, w4, #0xf	// keep its low 4 bits (presumably the BS block-size field, confirm against the Arm ARM)
+	mov	x5, #4
+	lsl	x4, x5, x4	// x4 = 4 << field; passed below as linesize, presumably the dc block size in bytes
+
+	bfi	x0, x2, #56, #8	// insert the low 8 bits of the tag into bits 63:56 of the address
+
+	cbz	x3, .Lnoinit	// init == 0: tag only. NOTE(review): tests all 64 bits of x3 for a bool, confirm upper bits are zero
+	__set_mem_tag_range stzg, gzva, x0, x1, x4, x2, x3, x5	// init variant; x2, x3, x5 are dead by now and reused as scratch
+	ret
+.Lnoinit:
+	__set_mem_tag_range stg, gva, x0, x1, x4, x2, x3, x5	// tag-only variant; same scratch registers
+	ret
+SYM_FUNC_END(mte_set_mem_tag_range)