[4/5] arm64: lib: Use MOPS for memcpy() routines

Message ID 20240930161051.3777828-5-kristina.martsenko@arm.com (mailing list archive)
State New, archived
Series arm64: Use memory copy instructions in kernel routines

Commit Message

Kristina Martšenko Sept. 30, 2024, 4:10 p.m. UTC
Make memcpy(), memmove() and memset() use the Armv8.8 FEAT_MOPS
instructions when implemented on the CPU.

The CPY*/SET* instructions copy or set a block of memory of arbitrary
size and alignment. They can be interrupted by the CPU and the copying
resumed later. Their performance is expected to be close to that of the best
generic copy/set sequence of loads/stores for a given CPU. Using them in
the kernel's copy/set routines therefore avoids the need to periodically
rewrite the routines to optimize for new microarchitectures. It could
also lead to a performance improvement for some CPUs and systems.
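
A rough, userspace-style sketch of the prologue/main/epilogue structure (not
the kernel routine itself; the function name is invented for the example and
it assumes a CPU with FEAT_MOPS and a toolchain that accepts the mops
extension):

  #include <stddef.h>

  /* Copy count bytes with the FEAT_MOPS CPYP/CPYM/CPYE sequence. */
  void *mops_memcpy(void *dst, const void *src, size_t count)
  {
          void *ret = dst;

          /* Each instruction writes back dst, src and count as it progresses. */
          asm volatile(".arch_extension mops\n"
                       "cpyp [%0]!, [%1]!, %2!\n"  /* prologue: CPU picks an algorithm, copies an initial portion */
                       "cpym [%0]!, [%1]!, %2!\n"  /* main: bulk of the copy */
                       "cpye [%0]!, [%1]!, %2!\n"  /* epilogue: remaining tail */
                       : "+r" (dst), "+r" (src), "+r" (count)
                       : : "memory");
          return ret;
  }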

With this change the kernel will always use the instructions if they are
implemented on the CPU (and have not been disabled by the arm64.nomops
command line parameter). When not implemented, the usual routines will be
used (patched via alternatives). Note that we patch a single B/NOP rather
than the whole sequence, to avoid executing a partially patched sequence in
case the compiler generates a mem*() call inside the alternatives patching
code.

Note that MOPS instructions have relaxed behavior on Device memory, but
it is expected that these routines are not generally used on MMIO.

Note: For memcpy(), this uses the CPY* instructions instead of CPYF*, as
CPY* allows overlap between the source and destination buffers and, even
though it contradicts the C standard, compilers require that memcpy()
work on exactly overlapping source and destination:
  https://gcc.gnu.org/onlinedocs/gcc/Standards.html#C-Language
  https://reviews.llvm.org/D86993
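
A short illustration of why exact overlap matters (the struct and function
are invented for this example): compilers may lower a plain struct assignment
to a memcpy() call, and nothing stops a caller from passing the same object
as both source and destination, so memcpy() has to cope with src == dst:

  struct big { long v[32]; };

  void assign(struct big *dst, const struct big *src)
  {
          /* May be lowered to memcpy(dst, src, sizeof(*dst)); a caller may
           * legitimately do assign(p, p), i.e. an exactly overlapping copy. */
          *dst = *src;
  }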

Signed-off-by: Kristina Martsenko <kristina.martsenko@arm.com>
---
 arch/arm64/Kconfig      |  3 +++
 arch/arm64/lib/memcpy.S | 19 ++++++++++++++++++-
 arch/arm64/lib/memset.S | 20 +++++++++++++++++++-
 3 files changed, 40 insertions(+), 2 deletions(-)

Comments

Catalin Marinas Oct. 2, 2024, 3:29 p.m. UTC | #1
On Mon, Sep 30, 2024 at 05:10:50PM +0100, Kristina Martsenko wrote:
> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
> index 4ab48d49c451..9b99106fb95f 100644
> --- a/arch/arm64/lib/memcpy.S
> +++ b/arch/arm64/lib/memcpy.S
> @@ -57,7 +57,7 @@
>     The loop tail is handled by always copying 64 bytes from the end.
>  */
>  
> -SYM_FUNC_START(__pi_memcpy)
> +SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
>  	add	srcend, src, count
>  	add	dstend, dstin, count
>  	cmp	count, 128
> @@ -238,7 +238,24 @@ L(copy64_from_start):
>  	stp	B_l, B_h, [dstin, 16]
>  	stp	C_l, C_h, [dstin]
>  	ret
> +SYM_FUNC_END(__pi_memcpy_generic)
> +
> +#ifdef CONFIG_AS_HAS_MOPS
> +	.arch_extension mops
> +SYM_FUNC_START(__pi_memcpy)
> +alternative_if_not ARM64_HAS_MOPS
> +	b	__pi_memcpy_generic
> +alternative_else_nop_endif

I'm fine with patching the branch but I wonder whether, for the time
being, we should use alternative_if instead and the NOP to fall through
the default implementation. The hardware in the field doesn't have
FEAT_MOPS yet and they may see a slight penalty introduced by the
branch, especially for small memcpys. Just guessing, I haven't done any
benchmarks.
Kristina Martšenko Oct. 3, 2024, 4:46 p.m. UTC | #2
On 02/10/2024 16:29, Catalin Marinas wrote:
> On Mon, Sep 30, 2024 at 05:10:50PM +0100, Kristina Martsenko wrote:
>> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
>> index 4ab48d49c451..9b99106fb95f 100644
>> --- a/arch/arm64/lib/memcpy.S
>> +++ b/arch/arm64/lib/memcpy.S
>> @@ -57,7 +57,7 @@
>>     The loop tail is handled by always copying 64 bytes from the end.
>>  */
>>  
>> -SYM_FUNC_START(__pi_memcpy)
>> +SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
>>  	add	srcend, src, count
>>  	add	dstend, dstin, count
>>  	cmp	count, 128
>> @@ -238,7 +238,24 @@ L(copy64_from_start):
>>  	stp	B_l, B_h, [dstin, 16]
>>  	stp	C_l, C_h, [dstin]
>>  	ret
>> +SYM_FUNC_END(__pi_memcpy_generic)
>> +
>> +#ifdef CONFIG_AS_HAS_MOPS
>> +	.arch_extension mops
>> +SYM_FUNC_START(__pi_memcpy)
>> +alternative_if_not ARM64_HAS_MOPS
>> +	b	__pi_memcpy_generic
>> +alternative_else_nop_endif
> 
> I'm fine with patching the branch but I wonder whether, for the time
> being, we should use alternative_if instead and the NOP to fall through
> the default implementation. The hardware in the field doesn't have
> FEAT_MOPS yet and they may see a slight penalty introduced by the
> branch, especially for small memcpys. Just guessing, I haven't done any
> benchmarks.

My thinking was that this way it doesn't have to be changed again in the
future. But I'm fine with switching to alternative_if for v2.

Thanks,
Kristina
Catalin Marinas Oct. 4, 2024, 10:07 a.m. UTC | #3
On Thu, Oct 03, 2024 at 05:46:08PM +0100, Kristina Martsenko wrote:
> On 02/10/2024 16:29, Catalin Marinas wrote:
> > On Mon, Sep 30, 2024 at 05:10:50PM +0100, Kristina Martsenko wrote:
> >> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
> >> index 4ab48d49c451..9b99106fb95f 100644
> >> --- a/arch/arm64/lib/memcpy.S
> >> +++ b/arch/arm64/lib/memcpy.S
> >> @@ -57,7 +57,7 @@
> >>     The loop tail is handled by always copying 64 bytes from the end.
> >>  */
> >>  
> >> -SYM_FUNC_START(__pi_memcpy)
> >> +SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
> >>  	add	srcend, src, count
> >>  	add	dstend, dstin, count
> >>  	cmp	count, 128
> >> @@ -238,7 +238,24 @@ L(copy64_from_start):
> >>  	stp	B_l, B_h, [dstin, 16]
> >>  	stp	C_l, C_h, [dstin]
> >>  	ret
> >> +SYM_FUNC_END(__pi_memcpy_generic)
> >> +
> >> +#ifdef CONFIG_AS_HAS_MOPS
> >> +	.arch_extension mops
> >> +SYM_FUNC_START(__pi_memcpy)
> >> +alternative_if_not ARM64_HAS_MOPS
> >> +	b	__pi_memcpy_generic
> >> +alternative_else_nop_endif
> > 
> > I'm fine with patching the branch but I wonder whether, for the time
> > being, we should use alternative_if instead and the NOP to fall through
> > the default implementation. The hardware in the field doesn't have
> > FEAT_MOPS yet and they may see a slight penalty introduced by the
> > branch, especially for small memcpys. Just guessing, I haven't done any
> > benchmarks.
> 
> My thinking was that this way it doesn't have to be changed again in the
> future. But I'm fine with switching to alternative_if for v2.

The other option is to benchmark the proposed patches a bit and see if
we notice any difference on current hardware. Not sure exactly what
benchmarks would exercise these paths. For copy_page(), I suspect the
branch is probably lost in the noise. It's more like small copies that
might notice.

Yet another option is to leave the patches as they are and see if anyone
complains, we swap them over then ;).
Kristina Martšenko Oct. 16, 2024, 1:08 p.m. UTC | #4
On 04/10/2024 11:07, Catalin Marinas wrote:
> On Thu, Oct 03, 2024 at 05:46:08PM +0100, Kristina Martsenko wrote:
>> On 02/10/2024 16:29, Catalin Marinas wrote:
>>> On Mon, Sep 30, 2024 at 05:10:50PM +0100, Kristina Martsenko wrote:
>>>> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
>>>> index 4ab48d49c451..9b99106fb95f 100644
>>>> --- a/arch/arm64/lib/memcpy.S
>>>> +++ b/arch/arm64/lib/memcpy.S
>>>> @@ -57,7 +57,7 @@
>>>>     The loop tail is handled by always copying 64 bytes from the end.
>>>>  */
>>>>  
>>>> -SYM_FUNC_START(__pi_memcpy)
>>>> +SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
>>>>  	add	srcend, src, count
>>>>  	add	dstend, dstin, count
>>>>  	cmp	count, 128
>>>> @@ -238,7 +238,24 @@ L(copy64_from_start):
>>>>  	stp	B_l, B_h, [dstin, 16]
>>>>  	stp	C_l, C_h, [dstin]
>>>>  	ret
>>>> +SYM_FUNC_END(__pi_memcpy_generic)
>>>> +
>>>> +#ifdef CONFIG_AS_HAS_MOPS
>>>> +	.arch_extension mops
>>>> +SYM_FUNC_START(__pi_memcpy)
>>>> +alternative_if_not ARM64_HAS_MOPS
>>>> +	b	__pi_memcpy_generic
>>>> +alternative_else_nop_endif
>>>
>>> I'm fine with patching the branch but I wonder whether, for the time
>>> being, we should use alternative_if instead and the NOP to fall through
>>> the default implementation. The hardware in the field doesn't have
>>> FEAT_MOPS yet and they may see a slight penalty introduced by the
>>> branch, especially for small memcpys. Just guessing, I haven't done any
>>> benchmarks.
>>
>> My thinking was that this way it doesn't have to be changed again in the
>> future. But I'm fine with switching to alternative_if for v2.
> 
> The other option is to benchmark the proposed patches a bit and see if
> we notice any difference on current hardware. Not sure exactly what
> benchmarks would exercise these paths. For copy_page(), I suspect the
> branch is probably lost in the noise. It's more like small copies that
> might notice.
> 
> Yet another option is to leave the patches as they are and see if anyone
> complains, we swap them over then ;).

I tried benchmarking a kernel build and hackbench on a Morello board (with
usercopy patches applied as well) but didn't see any significant performance
difference between the branch and NOP so I would leave the patches as they are.

Thanks,
Kristina
Catalin Marinas Oct. 17, 2024, 11:57 a.m. UTC | #5
On Wed, Oct 16, 2024 at 02:08:27PM +0100, Kristina Martsenko wrote:
> On 04/10/2024 11:07, Catalin Marinas wrote:
> > On Thu, Oct 03, 2024 at 05:46:08PM +0100, Kristina Martsenko wrote:
> >> On 02/10/2024 16:29, Catalin Marinas wrote:
> >>> On Mon, Sep 30, 2024 at 05:10:50PM +0100, Kristina Martsenko wrote:
> >>>> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
> >>>> index 4ab48d49c451..9b99106fb95f 100644
> >>>> --- a/arch/arm64/lib/memcpy.S
> >>>> +++ b/arch/arm64/lib/memcpy.S
> >>>> @@ -57,7 +57,7 @@
> >>>>     The loop tail is handled by always copying 64 bytes from the end.
> >>>>  */
> >>>>  
> >>>> -SYM_FUNC_START(__pi_memcpy)
> >>>> +SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
> >>>>  	add	srcend, src, count
> >>>>  	add	dstend, dstin, count
> >>>>  	cmp	count, 128
> >>>> @@ -238,7 +238,24 @@ L(copy64_from_start):
> >>>>  	stp	B_l, B_h, [dstin, 16]
> >>>>  	stp	C_l, C_h, [dstin]
> >>>>  	ret
> >>>> +SYM_FUNC_END(__pi_memcpy_generic)
> >>>> +
> >>>> +#ifdef CONFIG_AS_HAS_MOPS
> >>>> +	.arch_extension mops
> >>>> +SYM_FUNC_START(__pi_memcpy)
> >>>> +alternative_if_not ARM64_HAS_MOPS
> >>>> +	b	__pi_memcpy_generic
> >>>> +alternative_else_nop_endif
> >>>
> >>> I'm fine with patching the branch but I wonder whether, for the time
> >>> being, we should use alternative_if instead and the NOP to fall through
> >>> the default implementation. The hardware in the field doesn't have
> >>> FEAT_MOPS yet and they may see a slight penalty introduced by the
> >>> branch, especially for small memcpys. Just guessing, I haven't done any
> >>> benchmarks.
> >>
> >> My thinking was that this way it doesn't have to be changed again in the
> >> future. But I'm fine with switching to alternative_if for v2.
> > 
> > The other option is to benchmark the proposed patches a bit and see if
> > we notice any difference on current hardware. Not sure exactly what
> > benchmarks would exercise these paths. For copy_page(), I suspect the
> > branch is probably lost in the noise. It's more like small copies that
> > might notice.
> > 
> > Yet another option is to leave the patches as they are and see if anyone
> > complains, we swap them over then ;).
> 
> I tried benchmarking a kernel build and hackbench on a Morello board (with
> usercopy patches applied as well) but didn't see any significant performance
> difference between the branch and NOP so I would leave the patches as they are.

That's great. Thanks for checking.

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3e29b44d2d7b..d0fe90ea704d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2155,6 +2155,9 @@ config ARM64_EPAN
 	  if the cpu does not implement the feature.
 endmenu # "ARMv8.7 architectural features"
 
+config AS_HAS_MOPS
+	def_bool $(as-instr,.arch_extension mops)
+
 menu "ARMv8.9 architectural features"
 
 config ARM64_POE
diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 4ab48d49c451..9b99106fb95f 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -57,7 +57,7 @@ 
    The loop tail is handled by always copying 64 bytes from the end.
 */
 
-SYM_FUNC_START(__pi_memcpy)
+SYM_FUNC_START_LOCAL(__pi_memcpy_generic)
 	add	srcend, src, count
 	add	dstend, dstin, count
 	cmp	count, 128
@@ -238,7 +238,24 @@ L(copy64_from_start):
 	stp	B_l, B_h, [dstin, 16]
 	stp	C_l, C_h, [dstin]
 	ret
+SYM_FUNC_END(__pi_memcpy_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops
+SYM_FUNC_START(__pi_memcpy)
+alternative_if_not ARM64_HAS_MOPS
+	b	__pi_memcpy_generic
+alternative_else_nop_endif
+
+	mov	dst, dstin
+	cpyp	[dst]!, [src]!, count!
+	cpym	[dst]!, [src]!, count!
+	cpye	[dst]!, [src]!, count!
+	ret
 SYM_FUNC_END(__pi_memcpy)
+#else
+SYM_FUNC_ALIAS(__pi_memcpy, __pi_memcpy_generic)
+#endif
 
 SYM_FUNC_ALIAS(__memcpy, __pi_memcpy)
 EXPORT_SYMBOL(__memcpy)
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index a5aebe82ad73..97157da65ec6 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -26,6 +26,7 @@ 
  */
 
 dstin		.req	x0
+val_x		.req	x1
 val		.req	w1
 count		.req	x2
 tmp1		.req	x3
@@ -42,7 +43,7 @@ dst		.req	x8
 tmp3w		.req	w9
 tmp3		.req	x9
 
-SYM_FUNC_START(__pi_memset)
+SYM_FUNC_START_LOCAL(__pi_memset_generic)
 	mov	dst, dstin	/* Preserve return value.  */
 	and	A_lw, val, #255
 	orr	A_lw, A_lw, A_lw, lsl #8
@@ -201,7 +202,24 @@ SYM_FUNC_START(__pi_memset)
 	ands	count, count, zva_bits_x
 	b.ne	.Ltail_maybe_long
 	ret
+SYM_FUNC_END(__pi_memset_generic)
+
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops
+SYM_FUNC_START(__pi_memset)
+alternative_if_not ARM64_HAS_MOPS
+	b	__pi_memset_generic
+alternative_else_nop_endif
+
+	mov	dst, dstin
+	setp	[dst]!, count!, val_x
+	setm	[dst]!, count!, val_x
+	sete	[dst]!, count!, val_x
+	ret
 SYM_FUNC_END(__pi_memset)
+#else
+SYM_FUNC_ALIAS(__pi_memset, __pi_memset_generic)
+#endif
 
 SYM_FUNC_ALIAS(__memset, __pi_memset)
 EXPORT_SYMBOL(__memset)