[3/3] arm64: lib: Use MOPS for usercopy routines

Message ID 20250218171430.28227-4-kristina.martsenko@arm.com (mailing list archive)
State New
Series arm64: Use memory copy instructions in usercopy routines

Commit Message

Kristina Martšenko Feb. 18, 2025, 5:14 p.m. UTC
Similarly to what was done with the memcpy() routines, make
copy_to_user(), copy_from_user() and clear_user() also use the Armv8.8
FEAT_MOPS instructions.

Both MOPS implementation options (A and B) are supported, including
asymmetric systems. The exception fixup code fixes up the registers
according to the option used.

In case of a fault the routines return precisely how much was not copied
(as required by the comment in include/linux/uaccess.h), as unprivileged
versions of CPY/SET are guaranteed not to have written past the
addresses reported in the GPRs.
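
For reference, the two options differ in how the prologue instruction
rewrites its operand registers, which is what the fixups key off. Roughly
(using clear_user()'s registers, and assuming the usual PSTATE.C convention
that the fixup code below relies on):

	// After the prologue (SETPT [x0]!, x1!, xzr):
	//   Option A (PSTATE.C clear): x0 = end of buffer, x1 = -(bytes left)
	//   Option B (PSTATE.C set):   x0 = next byte,      x1 = bytes left
	// With x2 holding the original end, the residual count can then be
	// recovered along the lines of:
	b.cs	1f		// Option B: x0 already points at the next byte
	add	x0, x0, x1	// Option A: current position = end + (negative count)
1:	sub	x0, x2, x0	// bytes not cleared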

The MOPS instructions could possibly be inlined into callers (and
patched to branch to the generic implementation if not detected;
similarly to what x86 does), but as a first step this patch just uses
them in the out-of-line routines.

Signed-off-by: Kristina Martšenko <kristina.martsenko@arm.com>
---
 arch/arm64/include/asm/asm-uaccess.h |  4 ++++
 arch/arm64/lib/clear_user.S          | 25 +++++++++++++++++++++----
 arch/arm64/lib/copy_from_user.S      | 10 ++++++++++
 arch/arm64/lib/copy_template.S       | 10 ++++++++++
 arch/arm64/lib/copy_to_user.S        | 10 ++++++++++
 5 files changed, 55 insertions(+), 4 deletions(-)

Comments

Robin Murphy Feb. 20, 2025, 7:15 p.m. UTC | #1
On 18/02/2025 5:14 pm, Kristina Martšenko wrote:
> Similarly to what was done with the memcpy() routines, make
> copy_to_user(), copy_from_user() and clear_user() also use the Armv8.8
> FEAT_MOPS instructions.
> 
> Both MOPS implementation options (A and B) are supported, including
> asymmetric systems. The exception fixup code fixes up the registers
> according to the option used.
> 
> In case of a fault the routines return precisely how much was not copied
> (as required by the comment in include/linux/uaccess.h), as unprivileged
> versions of CPY/SET are guaranteed not to have written past the
> addresses reported in the GPRs.
> 
> The MOPS instructions could possibly be inlined into callers (and
> patched to branch to the generic implementation if not detected;
> similarly to what x86 does), but as a first step this patch just uses
> them in the out-of-line routines.
> 
> Signed-off-by: Kristina Martšenko <kristina.martsenko@arm.com>
> ---
>   arch/arm64/include/asm/asm-uaccess.h |  4 ++++
>   arch/arm64/lib/clear_user.S          | 25 +++++++++++++++++++++----
>   arch/arm64/lib/copy_from_user.S      | 10 ++++++++++
>   arch/arm64/lib/copy_template.S       | 10 ++++++++++
>   arch/arm64/lib/copy_to_user.S        | 10 ++++++++++
>   5 files changed, 55 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
> index 5b6efe8abeeb..9148f5a31968 100644
> --- a/arch/arm64/include/asm/asm-uaccess.h
> +++ b/arch/arm64/include/asm/asm-uaccess.h
> @@ -61,6 +61,10 @@ alternative_else_nop_endif
>   9999:	x;					\
>   	_asm_extable_uaccess	9999b, l
>   
> +#define USER_CPY(l, uaccess_is_write, x...)	\
> +9999:	x;					\
> +	_asm_extable_uaccess_cpy 9999b, l, uaccess_is_write
> +
>   /*
>    * Generate the assembly for LDTR/STTR with exception table entries.
>    * This is complicated as there is no post-increment or pair versions of the
> diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
> index a5a5f5b97b17..de9a303b6ad0 100644
> --- a/arch/arm64/lib/clear_user.S
> +++ b/arch/arm64/lib/clear_user.S
> @@ -17,14 +17,27 @@
>    * Alignment fixed up by hardware.
>    */
>   
> -	.p2align 4
> -	// Alignment is for the loop, but since the prologue (including BTI)
> -	// is also 16 bytes we can keep any padding outside the function
>   SYM_FUNC_START(__arch_clear_user)
>   	add	x2, x0, x1

This subtlety...

> +
> +#ifdef CONFIG_AS_HAS_MOPS
> +	.arch_extension mops
> +alternative_if_not ARM64_HAS_MOPS
> +	b	.Lno_mops
> +alternative_else_nop_endif
> +
> +USER(9f, setpt	[x0]!, x1!, xzr)
> +USER(6f, setmt	[x0]!, x1!, xzr)
> +USER(6f, setet	[x0]!, x1!, xzr)
> +	mov	x0, #0
> +	ret
> +.Lno_mops:
> +#endif
> +
>   	subs	x1, x1, #8
>   	b.mi	2f
> -1:
> +
> +1:	.p2align 4
>   USER(9f, sttr	xzr, [x0])
>   	add	x0, x0, #8
>   	subs	x1, x1, #8
> @@ -47,6 +60,10 @@ USER(7f, sttrb	wzr, [x2, #-1])
>   	ret
>   
>   	// Exception fixups
> +6:	b.cs	9f
> +	// Registers are in Option A format
> +	add	x0, x0, x1
> +	b	9f

...and then all this, is a bit hard to follow IMO. I'd be inclined to 
just have dedicated "mov x0, x1; ret" and "cneg x0, x1, cc; ret" fixups 
for the prologue and other ops, rather than entangle them with the 
non-MOPS flow at all. (Plus then the prologue fixup could arguably be 
the normal exit path as well...)
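
Concretely, for __arch_clear_user() that might look something like the
following (an untested sketch of the above; label numbers are arbitrary, and
it leans on x1 being 0 after SETET completes so that the prologue fixup
doubles as the success exit):

USER(10f, setpt	[x0]!, x1!, xzr)
USER(11f, setmt	[x0]!, x1!, xzr)
USER(11f, setet	[x0]!, x1!, xzr)
10:	mov	x0, x1		// prologue fault: x1 still holds the count;
	ret			// 0 when falling through on success
11:	cneg	x0, x1, cc	// bytes left: negate x1 if the registers
	ret			// are in Option A format (carry clear)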

>   7:	sub	x0, x2, #5	// Adjust for faulting on the final byte...
>   8:	add	x0, x0, #4	// ...or the second word of the 4-7 byte case
>   9:	sub	x0, x2, x0
> diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
> index 34e317907524..400057d607ec 100644
> --- a/arch/arm64/lib/copy_from_user.S
> +++ b/arch/arm64/lib/copy_from_user.S
> @@ -52,6 +52,13 @@
>   	stp \reg1, \reg2, [\ptr], \val
>   	.endm
>   
> +	.macro cpy1 dst, src, count
> +	.arch_extension mops
> +	USER_CPY(9997f, 0, cpyfprt [\dst]!, [\src]!, \count!)
> +	USER_CPY(9996f, 0, cpyfmrt [\dst]!, [\src]!, \count!)
> +	USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!)
> +	.endm
> +
>   end	.req	x5
>   srcin	.req	x15
>   SYM_FUNC_START(__arch_copy_from_user)
> @@ -62,6 +69,9 @@ SYM_FUNC_START(__arch_copy_from_user)
>   	ret
>   
>   	// Exception fixups
> +9996:	b.cs	9997f
> +	// Registers are in Option A format
> +	add	dst, dst, count

However for copies it's somewhat justified since, IIUC, MOPS aren't 
guaranteed to make progress if we're starting on the last byte of a page 
and the next page is unmapped, and thus we may still have a "try harder" 
requirement similar to the previous alignment fault case, is that right?
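
For reference, the 9997 path being branched to here is the pre-existing
"try harder" fallback, which retries a single byte with an unprivileged load
before concluding that nothing could be copied. Roughly (going by the
current copy_from_user.S; the exact registers may differ):

9997:	cmp	dst, dstin
	b.ne	9998f
	// Before being absolutely sure we couldn't copy anything, try harder
USER(9998f, ldtrb tmp1w, [srcin])
	strb	tmp1w, [dst], #1
9998:	sub	x0, end, dst	// bytes not copied
	ret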

Thanks,
Robin.

>   9997:	cmp	dst, dstin
>   	b.ne	9998f
>   	// Before being absolutely sure we couldn't copy anything, try harder
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> index 488df234c49a..7f2f5a0e2fb9 100644
> --- a/arch/arm64/lib/copy_template.S
> +++ b/arch/arm64/lib/copy_template.S
> @@ -40,6 +40,16 @@ D_l	.req	x13
>   D_h	.req	x14
>   
>   	mov	dst, dstin
> +
> +#ifdef CONFIG_AS_HAS_MOPS
> +alternative_if_not ARM64_HAS_MOPS
> +	b	.Lno_mops
> +alternative_else_nop_endif
> +	cpy1	dst, src, count
> +	b	.Lexitfunc
> +.Lno_mops:
> +#endif
> +
>   	cmp	count, #16
>   	/*When memory length is less than 16, the accessed are not aligned.*/
>   	b.lo	.Ltiny15
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index 802231772608..819f2e3fc7a9 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -51,6 +51,13 @@
>   	user_stp 9997f, \reg1, \reg2, \ptr, \val
>   	.endm
>   
> +	.macro cpy1 dst, src, count
> +	.arch_extension mops
> +	USER_CPY(9997f, 1, cpyfpwt [\dst]!, [\src]!, \count!)
> +	USER_CPY(9996f, 1, cpyfmwt [\dst]!, [\src]!, \count!)
> +	USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!)
> +	.endm
> +
>   end	.req	x5
>   srcin	.req	x15
>   SYM_FUNC_START(__arch_copy_to_user)
> @@ -61,6 +68,9 @@ SYM_FUNC_START(__arch_copy_to_user)
>   	ret
>   
>   	// Exception fixups
> +9996:	b.cs	9997f
> +	// Registers are in Option A format
> +	add	dst, dst, count
>   9997:	cmp	dst, dstin
>   	b.ne	9998f
>   	// Before being absolutely sure we couldn't copy anything, try harder

Patch

diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h
index 5b6efe8abeeb..9148f5a31968 100644
--- a/arch/arm64/include/asm/asm-uaccess.h
+++ b/arch/arm64/include/asm/asm-uaccess.h
@@ -61,6 +61,10 @@  alternative_else_nop_endif
 9999:	x;					\
 	_asm_extable_uaccess	9999b, l
 
+#define USER_CPY(l, uaccess_is_write, x...)	\
+9999:	x;					\
+	_asm_extable_uaccess_cpy 9999b, l, uaccess_is_write
+
 /*
  * Generate the assembly for LDTR/STTR with exception table entries.
  * This is complicated as there is no post-increment or pair versions of the
diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S
index a5a5f5b97b17..de9a303b6ad0 100644
--- a/arch/arm64/lib/clear_user.S
+++ b/arch/arm64/lib/clear_user.S
@@ -17,14 +17,27 @@ 
  * Alignment fixed up by hardware.
  */
 
-	.p2align 4
-	// Alignment is for the loop, but since the prologue (including BTI)
-	// is also 16 bytes we can keep any padding outside the function
 SYM_FUNC_START(__arch_clear_user)
 	add	x2, x0, x1
+
+#ifdef CONFIG_AS_HAS_MOPS
+	.arch_extension mops
+alternative_if_not ARM64_HAS_MOPS
+	b	.Lno_mops
+alternative_else_nop_endif
+
+USER(9f, setpt	[x0]!, x1!, xzr)
+USER(6f, setmt	[x0]!, x1!, xzr)
+USER(6f, setet	[x0]!, x1!, xzr)
+	mov	x0, #0
+	ret
+.Lno_mops:
+#endif
+
 	subs	x1, x1, #8
 	b.mi	2f
-1:
+
+1:	.p2align 4
 USER(9f, sttr	xzr, [x0])
 	add	x0, x0, #8
 	subs	x1, x1, #8
@@ -47,6 +60,10 @@  USER(7f, sttrb	wzr, [x2, #-1])
 	ret
 
 	// Exception fixups
+6:	b.cs	9f
+	// Registers are in Option A format
+	add	x0, x0, x1
+	b	9f
 7:	sub	x0, x2, #5	// Adjust for faulting on the final byte...
 8:	add	x0, x0, #4	// ...or the second word of the 4-7 byte case
 9:	sub	x0, x2, x0
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..400057d607ec 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -52,6 +52,13 @@ 
 	stp \reg1, \reg2, [\ptr], \val
 	.endm
 
+	.macro cpy1 dst, src, count
+	.arch_extension mops
+	USER_CPY(9997f, 0, cpyfprt [\dst]!, [\src]!, \count!)
+	USER_CPY(9996f, 0, cpyfmrt [\dst]!, [\src]!, \count!)
+	USER_CPY(9996f, 0, cpyfert [\dst]!, [\src]!, \count!)
+	.endm
+
 end	.req	x5
 srcin	.req	x15
 SYM_FUNC_START(__arch_copy_from_user)
@@ -62,6 +69,9 @@  SYM_FUNC_START(__arch_copy_from_user)
 	ret
 
 	// Exception fixups
+9996:	b.cs	9997f
+	// Registers are in Option A format
+	add	dst, dst, count
 9997:	cmp	dst, dstin
 	b.ne	9998f
 	// Before being absolutely sure we couldn't copy anything, try harder
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..7f2f5a0e2fb9 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -40,6 +40,16 @@  D_l	.req	x13
 D_h	.req	x14
 
 	mov	dst, dstin
+
+#ifdef CONFIG_AS_HAS_MOPS
+alternative_if_not ARM64_HAS_MOPS
+	b	.Lno_mops
+alternative_else_nop_endif
+	cpy1	dst, src, count
+	b	.Lexitfunc
+.Lno_mops:
+#endif
+
 	cmp	count, #16
 	/*When memory length is less than 16, the accessed are not aligned.*/
 	b.lo	.Ltiny15
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index 802231772608..819f2e3fc7a9 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -51,6 +51,13 @@ 
 	user_stp 9997f, \reg1, \reg2, \ptr, \val
 	.endm
 
+	.macro cpy1 dst, src, count
+	.arch_extension mops
+	USER_CPY(9997f, 1, cpyfpwt [\dst]!, [\src]!, \count!)
+	USER_CPY(9996f, 1, cpyfmwt [\dst]!, [\src]!, \count!)
+	USER_CPY(9996f, 1, cpyfewt [\dst]!, [\src]!, \count!)
+	.endm
+
 end	.req	x5
 srcin	.req	x15
 SYM_FUNC_START(__arch_copy_to_user)
@@ -61,6 +68,9 @@  SYM_FUNC_START(__arch_copy_to_user)
 	ret
 
 	// Exception fixups
+9996:	b.cs	9997f
+	// Registers are in Option A format
+	add	dst, dst, count
 9997:	cmp	dst, dstin
 	b.ne	9998f
 	// Before being absolutely sure we couldn't copy anything, try harder