
[1/6] arm64: lib: Implement optimized memcpy routine

Message ID 1386743082-5231-2-git-send-email-zhichang.yuan@linaro.org (mailing list archive)
State New, archived

Commit Message

zhichang.yuan@linaro.org Dec. 11, 2013, 6:24 a.m. UTC
From: "zhichang.yuan" <zhichang.yuan@linaro.org>

This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly optimized memcpy() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
---
 arch/arm64/lib/memcpy.S |  182 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 160 insertions(+), 22 deletions(-)

Comments

Will Deacon Dec. 16, 2013, 4:08 p.m. UTC | #1
On Wed, Dec 11, 2013 at 06:24:37AM +0000, zhichang.yuan@linaro.org wrote:
> From: "zhichang.yuan" <zhichang.yuan@linaro.org>
> 
> This patch, based on Linaro's Cortex Strings library, improves
> the performance of the assembly optimized memcpy() function.
> 
> Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
> Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
> ---
>  arch/arm64/lib/memcpy.S |  182 +++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 160 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
> index 27b5003..e3bab71 100644
> --- a/arch/arm64/lib/memcpy.S
> +++ b/arch/arm64/lib/memcpy.S
> @@ -1,5 +1,13 @@
>  /*
>   * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
> + * This code is based on glibc cortex strings work originally authored by Linaro
> + * and re-licensed under GPLv2 for the Linux kernel. The original code can
> + * be found @
> + *
> + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> + * files/head:/src/aarch64/
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License version 2 as
> @@ -27,27 +35,157 @@
>   * Returns:
>   *	x0 - dest
>   */
> +#define dstin	x0
> +#define src	x1
> +#define count	x2
> +#define tmp1	x3
> +#define tmp1w	w3
> +#define tmp2	x4
> +#define tmp2w	w4
> +#define tmp3	x5
> +#define tmp3w	w5
> +#define dst	x6
> +
> +#define A_l	x7
> +#define A_h	x8
> +#define B_l	x9
> +#define B_h	x10
> +#define C_l	x11
> +#define C_h	x12
> +#define D_l	x13
> +#define D_h	x14

Use .req instead of #define?
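Something along these lines (an untested sketch, reusing the names from the patch):

	dstin	.req	x0
	src	.req	x1
	count	.req	x2
	tmp1	.req	x3
	tmp1w	.req	w3
	dst	.req	x6
	A_l	.req	x7
	A_h	.req	x8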

>  ENTRY(memcpy)
> -	mov	x4, x0
> -	subs	x2, x2, #8
> -	b.mi	2f
> -1:	ldr	x3, [x1], #8
> -	subs	x2, x2, #8
> -	str	x3, [x4], #8
> -	b.pl	1b
> -2:	adds	x2, x2, #4
> -	b.mi	3f
> -	ldr	w3, [x1], #4
> -	sub	x2, x2, #4
> -	str	w3, [x4], #4
> -3:	adds	x2, x2, #2
> -	b.mi	4f
> -	ldrh	w3, [x1], #2
> -	sub	x2, x2, #2
> -	strh	w3, [x4], #2
> -4:	adds	x2, x2, #1
> -	b.mi	5f
> -	ldrb	w3, [x1]
> -	strb	w3, [x4]
> -5:	ret
> +	mov	dst, dstin
> +	cmp	count, #16
> +	/*If memory length is less than 16, stp or ldp can not be used.*/
> +	b.lo	.Ltiny15
> +.Lover16:
> +	neg	tmp2, src
> +	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
> +	b.eq	.LSrcAligned
> +	sub	count, count, tmp2
> +	/*
> +	* Use ldp and sdp to copy 16 bytes,then backward the src to
> +	* aligned address.This way is more efficient.
> +	* But the risk overwriting the source area exists.Here,prefer to
> +	* access memory forward straight,no backward.It will need a bit
> +	* more instructions, but on the same time,the accesses are aligned.
> +	*/

This comment reads very badly:

  - sdp doesn't exist
  - `more efficient' than what? How is this measured?
  - `access memory forward straight,no backward' What?

Please re-write it in a clearer fashion, so that reviewers have some
understanding of your optimisations when potentially trying to change the
code later on.
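
For example, something along these lines (just a sketch of the intent as I
understand it, please correct it if I have misread the code):

	/*
	 * Copy the leading bytes with small forward copies (1, 2, 4 and 8
	 * bytes, selected by the low bits of tmp2) until src is 16-byte
	 * aligned.  Copying a full 16 bytes with ldp/stp and then winding
	 * src and dst back to the alignment boundary would need fewer
	 * instructions, but for overlapping buffers (memmove) the bytes
	 * written past the boundary could clobber source data that has
	 * not been read yet, so keep every access forward-going and
	 * aligned instead.
	 */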

> +	tbz	tmp2, #0, 1f
> +	ldrb	tmp1w, [src], #1
> +	strb	tmp1w, [dst], #1
> +1:
> +	tbz	tmp2, #1, 1f
> +	ldrh	tmp1w, [src], #2
> +	strh	tmp1w, [dst], #2
> +1:
> +	tbz	tmp2, #2, 1f
> +	ldr	tmp1w, [src], #4
> +	str	tmp1w, [dst], #4
> +1:

Three labels called '1:' ? Can you make them unique please (the old code
just incremented a counter).
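i.e. something like (sketch only):

	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
3: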

> +	tbz	tmp2, #3, .LSrcAligned
> +	ldr	tmp1, [src],#8
> +	str	tmp1, [dst],#8
> +
> +.LSrcAligned:
> +	cmp	count, #64
> +	b.ge	.Lcpy_over64
> +	/*
> +	* Deal with small copies quickly by dropping straight into the
> +	* exit block.
> +	*/
> +.Ltail63:
> +	/*
> +	* Copy up to 48 bytes of data. At this point we only need the
> +	* bottom 6 bits of count to be accurate.
> +	*/
> +	ands	tmp1, count, #0x30
> +	b.eq	.Ltiny15
> +	cmp	tmp1w, #0x20
> +	b.eq	1f
> +	b.lt	2f
> +	ldp	A_l, A_h, [src], #16
> +	stp	A_l, A_h, [dst], #16
> +1:
> +	ldp	A_l, A_h, [src], #16
> +	stp	A_l, A_h, [dst], #16
> +2:
> +	ldp	A_l, A_h, [src], #16
> +	stp	A_l, A_h, [dst], #16
> +.Ltiny15:
> +	/*
> +	* To make memmove simpler, here don't make src backwards.
> +	* since backwards will probably overwrite the src area where src
> +	* data for nex copy located,if dst is not so far from src.
> +	*/

Another awful comment...
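
For instance (again, only a sketch of what it seems to be saying):

	/*
	 * Copy the tail in increasing address order as well (8, 4, 2 and
	 * 1 bytes, selected by the low bits of count), so that a forward
	 * copying memmove can share this code without re-reading source
	 * bytes that an earlier store to an overlapping destination may
	 * already have clobbered.
	 */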

> +	tbz	count, #3, 1f
> +	ldr	tmp1, [src], #8
> +	str	tmp1, [dst], #8
> +1:
> +	tbz	count, #2, 1f
> +	ldr	tmp1w, [src], #4
> +	str	tmp1w, [dst], #4
> +1:
> +	tbz	count, #1, 1f
> +	ldrh	tmp1w, [src], #2
> +	strh	tmp1w, [dst], #2
> +1:

... and more of these labels.

> +	tbz	count, #0, .Lexitfunc
> +	ldrb	tmp1w, [src]
> +	strb	tmp1w, [dst]
> +
> +.Lexitfunc:
> +	ret
> +
> +.Lcpy_over64:
> +	subs	count, count, #128
> +	b.ge	.Lcpy_body_large
> +	/*
> +	* Less than 128 bytes to copy, so handle 64 here and then jump
> +	* to the tail.
> +	*/
> +	ldp	A_l, A_h, [src],#16
> +	stp	A_l, A_h, [dst],#16
> +	ldp	B_l, B_h, [src],#16
> +	ldp	C_l, C_h, [src],#16
> +	stp	B_l, B_h, [dst],#16
> +	stp	C_l, C_h, [dst],#16
> +	ldp	D_l, D_h, [src],#16
> +	stp	D_l, D_h, [dst],#16
> +
> +	tst	count, #0x3f
> +	b.ne	.Ltail63
> +	ret
> +
> +	/*
> +	* Critical loop.  Start at a new cache line boundary.  Assuming
> +	* 64 bytes per line this ensures the entire loop is in one line.
> +	*/
> +	.p2align	6

Can you parameterise this with L1_CACHE_SHIFT instead?
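Something like the following, assuming <asm/cache.h> is pulled in at the top
of the file:

	#include <asm/cache.h>

	/*
	 * Critical loop.  Start at a new cache line boundary.  Assuming
	 * L1_CACHE_BYTES per line, this keeps the whole loop in one line.
	 */
	.p2align	L1_CACHE_SHIFT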

Will
zhichang.yuan@linaro.org Dec. 18, 2013, 1:54 a.m. UTC | #2
Hi Will

Thanks for your reply.
Your comments all make sense; I will modify the patches to address these points.
Once the fixes are ready, I will submit v2 of the series.

Thanks again!
Zhichang

Patch

diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003..e3bab71 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@ 
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -27,27 +35,157 @@ 
  * Returns:
  *	x0 - dest
  */
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/*If memory length is less than 16, stp or ldp can not be used.*/
+	b.lo	.Ltiny15
+.Lover16:
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Use ldp and sdp to copy 16 bytes,then backward the src to
+	* aligned address.This way is more efficient.
+	* But the risk overwriting the source area exists.Here,prefer to
+	* access memory forward straight,no backward.It will need a bit
+	* more instructions, but on the same time,the accesses are aligned.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* To make memmove simpler, here don't make src backwards.
+	* since backwards will probably overwrite the src area where src
+	* data for nex copy located,if dst is not so far from src.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	6
+.Lcpy_body_large:
+	/* There are at least 128 bytes to copy.  */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)