diff mbox series

[4/4] arm64: add software prefetches for AmpereOne

Message ID 20231122092855.4440-5-shijie@os.amperecomputing.com (mailing list archive)
State New, archived
Headers show
Series arm64: an optimization for AmpereOne | expand

Commit Message

Huang Shijie Nov. 22, 2023, 9:28 a.m. UTC
0) Background:
   We found that AmpereOne benefits from aggressive prefetches when
   using 4K page size.

1) This patch:
    1.1) adds new WORKAROUND_AMPERE_AC03_PREFETCH capability.
    1.2) uses MIDR_AMPERE1 to filter the processor.
    1.3) uses alternative_if to patch in the alternative code
         path for AmpereOne.
    1.4) adds software prefetches for the specific loop.
    	 Also add a macro add_prefetch.

2) Test result:
    In hugetlb or tmpfs, we can get a big sequential read performance
    improvement, up to 1.3x ~ 1.4x.

Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
---
 arch/arm64/Kconfig.platforms   |  7 +++++++
 arch/arm64/kernel/cpu_errata.c |  9 +++++++++
 arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
 arch/arm64/tools/cpucaps       |  1 +
 4 files changed, 48 insertions(+)

Comments

Robin Murphy Nov. 22, 2023, 11:34 a.m. UTC | #1
On 2023-11-22 9:28 am, Huang Shijie wrote:
> 0) Background:
>     We found that AmpereOne benefits from aggressive prefetches when
>     using 4K page size.
> 
> 1) This patch:
>      1.1) adds new WORKAROUND_AMPERE_AC03_PREFETCH capability.
>      1.2) uses MIDR_AMPERE1 to filter the processor.
>      1.3) uses alternative_if to alternative the code
>           for AmpereOne.
>      1.4) adds software prefetches for the specific loop.
>      	 Also add a macro add_prefetch.
> 
> 2) Test result:
>      In hugetlb or tmpfs, we can get a big sequential read performance
>      improvement, up to 1.3x ~ 1.4x.

Frankly the copy_template code is pretty terrible anyway, so the fact 
that you're not touching anything *else* (memcpy(), copy_page(), etc.) 
makes me wonder whether you'd benefit from just a better baseline to 
begin with (unless the underlying concern really is something more 
specific like the hardware prefetcher failing to recognise LDTR/STTR). 
The last attempt to improve this derailed into questioning the usercopy 
API semantics themselves, but for reference that would be my original 
patches at [0] (more optimised, but some copy_to_user() fault fixups are 
buggy), and/or Mark's follow-up at [1] (less aggressive but still better 
than the current code, and doesn't touch copy_from_user()).

Thanks,
Robin.


[0] 
https://lore.kernel.org/linux-arm-kernel/cover.1664363162.git.robin.murphy@arm.com/
[1] 
https://lore.kernel.org/linux-arch/20230321122514.1743889-1-mark.rutland@arm.com/

> Signed-off-by: Huang Shijie <shijie@os.amperecomputing.com>
> ---
>   arch/arm64/Kconfig.platforms   |  7 +++++++
>   arch/arm64/kernel/cpu_errata.c |  9 +++++++++
>   arch/arm64/lib/copy_template.S | 31 +++++++++++++++++++++++++++++++
>   arch/arm64/tools/cpucaps       |  1 +
>   4 files changed, 48 insertions(+)
> 
> diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
> index 6069120199bb..74ab8bea0019 100644
> --- a/arch/arm64/Kconfig.platforms
> +++ b/arch/arm64/Kconfig.platforms
> @@ -8,6 +8,13 @@ config ARCH_ACTIONS
>   	help
>   	  This enables support for the Actions Semiconductor S900 SoC family.
>   
> +config ARCH_AMPEREONE
> +	bool "AmpereOne Platforms"
> +	help
> +	  This enables support for the ARMv8 based AmpereOne chipsets.
> +	  AmpereOne is the next generation of Cloud Native Processors from
> +	  Ampere.
> +
>   config ARCH_SUNXI
>   	bool "Allwinner sunxi 64-bit SoC Family"
>   	select ARCH_HAS_RESET_CONTROLLER
> diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
> index 5706e74c5578..c0060d3086d0 100644
> --- a/arch/arm64/kernel/cpu_errata.c
> +++ b/arch/arm64/kernel/cpu_errata.c
> @@ -744,6 +744,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
>   		.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
>   		ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
>   	},
> +#endif
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +	{
> +		.desc = "Optimization for AmpereOne chip",
> +		.capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
> +		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
> +		.matches = is_affected_midr_range,
> +		.midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
> +	},
>   #endif
>   	{
>   	}
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> index 79b32569260c..b707c3ec6820 100644
> --- a/arch/arm64/lib/copy_template.S
> +++ b/arch/arm64/lib/copy_template.S
> @@ -41,6 +41,18 @@
>   	b.ne	.Ltail63
>   .endm
>   
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +.macro add_prefetch
> +	/*
> +	 * Add prefetch two cache lines by prfm to optimize the
> +	 * performance. The 2K offset is the best offset which
> +	 * we get from the tests.
> +	 */
> +	prfm	pldl2keep, [src, #2048]
> +	prfm	pldl2keep, [src, #2112]
> +.endm
> +#endif
> +
>   /*
>    * Copy a buffer from src to dest (alignment handled by the hardware)
>    *
> @@ -156,6 +168,13 @@ D_h	.req	x14
>   	b	.Lexitfunc
>   
>   .Lcpy_over64:
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +alternative_if  ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> +	cmp	count, #PAGE_SIZE
> +	b.ge	.Lcpy_over_pagesize
> +alternative_else_nop_endif
> +#endif
> +
>   	subs	count, count, #128
>   	b.ge	.Lcpy_body_large
>   	/*
> @@ -182,4 +201,16 @@ D_h	.req	x14
>   	.p2align	L1_CACHE_SHIFT
>   .Lcpy_body_large:
>   	loop_for_copy_128_bytes
> +
> +#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
> +	b	.Lexitfunc
> +
> +	.p2align        L1_CACHE_SHIFT
> +.Lcpy_over_pagesize:
> +alternative_if  ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
> +	subs	count, count, #128
> +	loop_for_copy_128_bytes add_prefetch
> +alternative_else_nop_endif
> +#endif
> +
>   .Lexitfunc:
> diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
> index dea3dc89234b..13e197abf249 100644
> --- a/arch/arm64/tools/cpucaps
> +++ b/arch/arm64/tools/cpucaps
> @@ -100,3 +100,4 @@ WORKAROUND_NVIDIA_CARMEL_CNP
>   WORKAROUND_QCOM_FALKOR_E1003
>   WORKAROUND_REPEAT_TLBI
>   WORKAROUND_SPECULATIVE_AT
> +WORKAROUND_AMPERE_AC03_PREFETCH
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 6069120199bb..74ab8bea0019 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -8,6 +8,13 @@  config ARCH_ACTIONS
 	help
 	  This enables support for the Actions Semiconductor S900 SoC family.
 
+config ARCH_AMPEREONE
+	bool "AmpereOne Platforms"
+	help
+	  This enables support for the ARMv8 based AmpereOne chipsets.
+	  AmpereOne is the next generation of Cloud Native Processors from
+	  Ampere.
+
 config ARCH_SUNXI
 	bool "Allwinner sunxi 64-bit SoC Family"
 	select ARCH_HAS_RESET_CONTROLLER
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 5706e74c5578..c0060d3086d0 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -744,6 +744,15 @@  const struct arm64_cpu_capabilities arm64_errata[] = {
 		.capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38,
 		ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1),
 	},
+#endif
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+	{
+		.desc = "Optimization for AmpereOne chip",
+		.capability = ARM64_WORKAROUND_AMPERE_AC03_PREFETCH,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = is_affected_midr_range,
+		.midr_range = MIDR_ALL_VERSIONS(MIDR_AMPERE1)
+	},
 #endif
 	{
 	}
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 79b32569260c..b707c3ec6820 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -41,6 +41,18 @@ 
 	b.ne	.Ltail63
 .endm
 
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+.macro add_prefetch
+	/*
+	 * Prefetch two cache lines with prfm, 2K ahead of src, to
+	 * improve performance. The 2K offset is the one that gave
+	 * the best results in our tests.
+	 */
+	prfm	pldl2keep, [src, #2048]
+	prfm	pldl2keep, [src, #2112]
+.endm
+#endif
+
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
  *
@@ -156,6 +168,13 @@  D_h	.req	x14
 	b	.Lexitfunc
 
 .Lcpy_over64:
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+alternative_if  ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+	cmp	count, #PAGE_SIZE
+	b.ge	.Lcpy_over_pagesize
+alternative_else_nop_endif
+#endif
+
 	subs	count, count, #128
 	b.ge	.Lcpy_body_large
 	/*
@@ -182,4 +201,16 @@  D_h	.req	x14
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
 	loop_for_copy_128_bytes
+
+#if defined(CONFIG_ARCH_AMPEREONE) && defined(CONFIG_ARM64_4K_PAGES)
+	b	.Lexitfunc
+
+	.p2align        L1_CACHE_SHIFT
+.Lcpy_over_pagesize:
+alternative_if  ARM64_WORKAROUND_AMPERE_AC03_PREFETCH
+	subs	count, count, #128
+	loop_for_copy_128_bytes add_prefetch
+alternative_else_nop_endif
+#endif
+
 .Lexitfunc:
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index dea3dc89234b..13e197abf249 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -100,3 +100,4 @@  WORKAROUND_NVIDIA_CARMEL_CNP
 WORKAROUND_QCOM_FALKOR_E1003
 WORKAROUND_REPEAT_TLBI
 WORKAROUND_SPECULATIVE_AT
+WORKAROUND_AMPERE_AC03_PREFETCH