
[3/6] arm64: lib: Implement optimized memset routine

Message ID 1386743082-5231-4-git-send-email-zhichang.yuan@linaro.org (mailing list archive)
State New, archived

Commit Message

zhichang.yuan@linaro.org Dec. 11, 2013, 6:24 a.m. UTC
From: "zhichang.yuan" <zhichang.yuan@linaro.org>

This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly optimized memset() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
---
 arch/arm64/lib/memset.S |  227 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 201 insertions(+), 26 deletions(-)

Comments

Will Deacon Dec. 16, 2013, 4:55 p.m. UTC | #1
On Wed, Dec 11, 2013 at 06:24:39AM +0000, zhichang.yuan@linaro.org wrote:
> From: "zhichang.yuan" <zhichang.yuan@linaro.org>
> 
> This patch, based on Linaro's Cortex Strings library, improves
> the performance of the assembly optimized memset() function.
> 
> Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
> Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
> ---
>  arch/arm64/lib/memset.S |  227 +++++++++++++++++++++++++++++++++++++++++------
>  1 file changed, 201 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
> index 87e4a68..90b973e 100644
> --- a/arch/arm64/lib/memset.S
> +++ b/arch/arm64/lib/memset.S
> @@ -1,13 +1,21 @@
>  /*
>   * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
> + * This code is based on glibc cortex strings work originally authored by Linaro
> + * and re-licensed under GPLv2 for the Linux kernel. The original code can
> + * be found @
> + *
> + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> + * files/head:/src/aarch64/
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License version 2 as
>   * published by the Free Software Foundation.
>   *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * This program is distributed "as is" WITHOUT ANY WARRANTY of any
> + * kind, whether express or implied; without even the implied warranty
> + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

Why are you changing this?

>   * GNU General Public License for more details.
>   *
>   * You should have received a copy of the GNU General Public License
> @@ -18,7 +26,7 @@
>  #include <asm/assembler.h>
>  
>  /*
> - * Fill in the buffer with character c (alignment handled by the hardware)
> + * Fill in the buffer with character c
>   *
>   * Parameters:
>   *	x0 - buf
> @@ -27,27 +35,194 @@
>   * Returns:
>   *	x0 - buf
>   */
> +
> +/* By default we assume that the DC instruction can be used to zero
> +*  data blocks more efficiently.  In some circumstances this might be
> +*  unsafe, for example in an asymmetric multiprocessor environment with
> +*  different DC clear lengths (neither the upper nor lower lengths are
> +*  safe to use).  The feature can be disabled by defining DONT_USE_DC.
> +*/

We already use DC ZVA for clear_page, so I think we should start off using
it unconditionally. If we need to revisit this later, we can, but adding a
random #ifdef doesn't feel like something we need initially.

For the benefit of anybody else reviewing this; the DC ZVA instruction still
works for normal, non-cacheable memory.
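
For anyone wanting to play with DC ZVA outside the kernel, a zeroing loop along
those lines looks roughly like the C sketch below (the helper name is made up;
it assumes DC ZVA is permitted and that buf and len are already aligned to the
ZVA block size):

#include <stddef.h>
#include <stdint.h>

/*
 * Minimal sketch of a DC ZVA zeroing loop. Assumes DCZID_EL0.DZP == 0,
 * buf is aligned to the ZVA block size and len is a multiple of it.
 */
static void zero_blocks_with_dc_zva(void *buf, size_t len)
{
	uint64_t dczid, block;
	char *p = buf;

	/* DCZID_EL0[3:0] is log2 of the block size in 4-byte words. */
	__asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
	block = 4ull << (dczid & 0xf);	/* block size in bytes */

	for (; len >= block; p += block, len -= block)
		__asm__ volatile("dc zva, %0" : : "r"(p) : "memory");
}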

The comments I made on the earlier patch wrt quality of comments and labels
seem to apply to all of the patches in this series.

Will
zhichang.yuan@linaro.org Dec. 18, 2013, 2:37 a.m. UTC | #2
On 2013/12/17 00:55, Will Deacon wrote:
> On Wed, Dec 11, 2013 at 06:24:39AM +0000, zhichang.yuan@linaro.org wrote:
>> From: "zhichang.yuan" <zhichang.yuan@linaro.org>
>>
>> This patch, based on Linaro's Cortex Strings library, improves
>> the performance of the assembly optimized memset() function.
>>
>> Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
>> Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
>> ---
>>  arch/arm64/lib/memset.S |  227 +++++++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 201 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
>> index 87e4a68..90b973e 100644
>> --- a/arch/arm64/lib/memset.S
>> +++ b/arch/arm64/lib/memset.S
>> @@ -1,13 +1,21 @@
>>  /*
>>   * Copyright (C) 2013 ARM Ltd.
>> + * Copyright (C) 2013 Linaro.
>> + *
>> + * This code is based on glibc cortex strings work originally authored by Linaro
>> + * and re-licensed under GPLv2 for the Linux kernel. The original code can
>> + * be found @
>> + *
>> + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
>> + * files/head:/src/aarch64/
>>   *
>>   * This program is free software; you can redistribute it and/or modify
>>   * it under the terms of the GNU General Public License version 2 as
>>   * published by the Free Software Foundation.
>>   *
>> - * This program is distributed in the hope that it will be useful,
>> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> + * This program is distributed "as is" WITHOUT ANY WARRANTY of any
>> + * kind, whether express or implied; without even the implied warranty
>> + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> 
> Why are you changing this?
> 
>>   * GNU General Public License for more details.
>>   *
>>   * You should have received a copy of the GNU General Public License
>> @@ -18,7 +26,7 @@
>>  #include <asm/assembler.h>
>>  
>>  /*
>> - * Fill in the buffer with character c (alignment handled by the hardware)
>> + * Fill in the buffer with character c
>>   *
>>   * Parameters:
>>   *	x0 - buf
>> @@ -27,27 +35,194 @@
>>   * Returns:
>>   *	x0 - buf
>>   */
>> +
>> +/* By default we assume that the DC instruction can be used to zero
>> +*  data blocks more efficiently.  In some circumstances this might be
>> +*  unsafe, for example in an asymmetric multiprocessor environment with
>> +*  different DC clear lengths (neither the upper nor lower lengths are
>> +*  safe to use).  The feature can be disabled by defining DONT_USE_DC.
>> +*/
This comment is not quite correct: even in the AMP case I don't think DONT_USE_DC
is necessary, since this memset reads dczid_el0 every time and therefore always
gets the correct, current value from the system register.
> 
> We already use DC ZVA for clear_page, so I think we should start off using
> it unconditionally. If we need to revisit this later, we can, but adding a
> random #ifdef doesn't feel like something we need initially.

As for the DONT_USE_DC macro, I was not sure whether the default configuration of
some systems prohibits DC ZVA. In that case the attempt to use DC ZVA is abandoned
and the code falls back to the normal memory-setting path. That does not cause any
error; it only costs a few extra instructions.
I initially thought DONT_USE_DC would be slightly more efficient. But even on a
system that does not support DC ZVA, the cost of reading the dczid_el0 register is
small, so it is not worth introducing a kernel macro for this.
I will modify it.
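
For reference, the runtime check the assembly already performs ("mrs tmp1, dczid_el0"
followed by "tbnz tmp1, #4, ...") boils down to something like the following C sketch
(the helper name is illustrative):

#include <stdbool.h>
#include <stdint.h>

/*
 * DCZID_EL0 bit 4 (DZP) set means DC ZVA is prohibited; bits [3:0]
 * give the block size as log2 of the number of 4-byte words.
 */
static bool dc_zva_usable(uint64_t *block_bytes)
{
	uint64_t dczid;

	__asm__ volatile("mrs %0, dczid_el0" : "=r"(dczid));
	if (dczid & (1ull << 4))
		return false;			/* fall back to plain stores */

	*block_bytes = 4ull << (dczid & 0xf);	/* 4 << BS, as in the asm */
	return true;
}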

Zhichang
> 
> For the benefit of anybody else reviewing this; the DC ZVA instruction still
> works for normal, non-cacheable memory.
> 
> The comments I made on the earlier patch wrt quality of comments and labels
> seem to apply to all of the patches in this series.
> 
> Will
>

Patch

diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68..90b973e 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,13 +1,21 @@ 
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
@@ -18,7 +26,7 @@ 
 #include <asm/assembler.h>
 
 /*
- * Fill in the buffer with character c (alignment handled by the hardware)
+ * Fill in the buffer with character c
  *
  * Parameters:
  *	x0 - buf
@@ -27,27 +35,194 @@ 
  * Returns:
  *	x0 - buf
  */
+
+/* By default we assume that the DC instruction can be used to zero
+*  data blocks more efficiently.  In some circumstances this might be
+*  unsafe, for example in an asymmetric multiprocessor environment with
+*  different DC clear lengths (neither the upper nor lower lengths are
+*  safe to use).  The feature can be disabled by defining DONT_USE_DC.
+*/
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len	w5
+#define zva_bits_x	x6
+
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp3		x9
+
 ENTRY(memset)
-	mov	x4, x0
-	and	w1, w1, #0xff
-	orr	w1, w1, w1, lsl #8
-	orr	w1, w1, w1, lsl #16
-	orr	x1, x1, x1, lsl #32
-	subs	x2, x2, #8
-	b.mi	2f
-1:	str	x1, [x4], #8
-	subs	x2, x2, #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	sub	x2, x2, #4
-	str	w1, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	sub	x2, x2, #2
-	strh	w1, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	strb	w1, [x4]
-5:	ret
+	mov	dst, dstin	/* Preserve return value.  */
+	and	A_lw, val, #255
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+
+	/*first align dst with 16...*/
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	.Laligned
+
+	cmp	count, #15
+	b.le	.Ltail15tiny
+	/*
+	* The count is not less than 16, we can use stp to set 16 bytes
+	* once. This way is more efficient but the access is non-aligned.
+	*/
+	stp	A_l, A_l, [dst]
+	/*make the dst aligned..*/
+	sub	count, count, tmp2
+	add	dst, dst, tmp2
+
+	/*Here, dst is aligned 16 now...*/
+.Laligned:
+#ifndef DONT_USE_DC
+	cbz	A_l, .Lzero_mem
+#endif
+
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15tiny
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst], #16
+1:
+	stp	A_l, A_l, [dst], #16
+2:
+	stp	A_l, A_l, [dst], #16
+/*
+* The following process is non-aligned access. But it is more efficient than
+* .Ltail15tiny. Of-course, we can delete this code, but have a bit
+* performance cost.
+*/
+	ands	count, count, #15
+	cbz	count, 1f
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+1:
+	ret
+
+.Ltail15tiny:
+	/* Set up to 15 bytes.  Does not assume earlier memory
+	being set.  */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line, this ensures the entire loop is in one line.
+	*/
+	.p2align	6
+.Lnot_short: /*count must be not less than 64*/
+	sub	dst, dst, #16/* Pre-bias.  */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+.Lexitfunc:
+	ret
+
+#ifndef DONT_USE_DC
+	/*
+	* For zeroing memory, check to see if we can use the ZVA feature to
+	* zero entire 'cache' lines.
+	*/
+.Lzero_mem:
+	cmp	count, #63
+	b.le	.Ltail63
+	/*
+	* For zeroing small amounts of memory, it's not worth setting up
+	* the line-clear code.
+	*/
+	cmp	count, #128
+	b.lt	.Lnot_short /*count is at least  128 bytes*/
+
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len
+
+	ands	tmp3w, zva_len, #63
+	/*
+	* ensure the zva_len is not less than 64.
+	* It is not meaningful to use ZVA if the block size is less than 64.
+	*/
+	b.ne	.Lnot_short
+.Lzero_by_line:
+	/*
+	* Compute how far we need to go to become suitably aligned. We're
+	* already at quad-word alignment.
+	*/
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	1f			/* Already aligned.  */
+	/* Not aligned, check that there's enough to copy after alignment.*/
+	sub	tmp1, count, tmp2
+	/*
+	* grantee the remain length to be ZVA is bigger than 64,
+	* avoid to make the 2f's process over mem range.*/
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/*
+	* We know that there's at least 64 bytes to zero and that it's safe
+	* to overrun by 64 bytes.
+	*/
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards.*/
+	add	dst, dst, tmp2
+1:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
+#endif /* DONT_USE_DC */
 ENDPROC(memset)
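
For readers following the assembly, the non-ZVA path of the new routine corresponds
roughly to the C sketch below (names and tail handling are simplified; the real code
uses overlapping stp stores for the tail and the DC ZVA path for large zero fills):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Rough C model of the non-ZVA path: replicate the byte into a 64-bit
 * pattern, align dst to 16 with one overlapping store, then set 64
 * bytes per iteration, as the four stp instructions do. */
static void *memset_sketch(void *dstin, int c, size_t count)
{
	unsigned char *dst = dstin;
	uint64_t pattern = 0x0101010101010101ull * (unsigned char)c;

	if (count < 16) {			/* roughly .Ltail15tiny */
		for (size_t i = 0; i < count; i++)
			dst[i] = (unsigned char)c;
		return dstin;
	}

	/* One unaligned 16-byte store, then advance to a 16-byte boundary. */
	memcpy(dst, &pattern, 8);
	memcpy(dst + 8, &pattern, 8);
	size_t skew = -(uintptr_t)dst & 15;
	dst += skew;
	count -= skew;

	while (count >= 64) {			/* roughly .Lnot_short */
		for (int i = 0; i < 64; i += 8)
			memcpy(dst + i, &pattern, 8);
		dst += 64;
		count -= 64;
	}

	for (size_t i = 0; i < count; i++)	/* remaining tail bytes */
		dst[i] = (unsigned char)c;

	return dstin;
}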