
[V3] arm64: optimized copy_to_user and copy_from_user assembly code

Message ID 1430267932-17730-1-git-send-email-fkan@apm.com (mailing list archive)
State New, archived

Commit Message

Feng Kan April 29, 2015, 12:38 a.m. UTC
Using the glibc cortex-strings work authored by Linaro as a base,
create new copy_to_user and copy_from_user kernel routines.

Iperf performance increase:
		-l (size)		1 core result
Optimized 	64B			44-51Mb/s
		1500B			4.9Gb/s
		30000B			16.2Gb/s
Original	64B			34-50.7Mb/s
		1500B			4.7Gb/s
		30000B			14.5Gb/s

Signed-off-by: Feng Kan <fkan@apm.com>
---
 arch/arm64/lib/copy_from_user.S |  92 +++++++++++------
 arch/arm64/lib/copy_template.S  | 213 ++++++++++++++++++++++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S   |  56 ++++++-----
 3 files changed, 302 insertions(+), 59 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template.S

Comments

Catalin Marinas April 30, 2015, 10:57 a.m. UTC | #1
On Tue, Apr 28, 2015 at 05:38:52PM -0700, Feng Kan wrote:
> Using the glibc cortex-strings work authored by Linaro as a base,
> create new copy_to_user and copy_from_user kernel routines.
> 
> Iperf performance increase:
> 		-l (size)		1 core result
> Optimized 	64B			44-51Mb/s
> 		1500B			4.9Gb/s
> 		30000B			16.2Gb/s
> Original	64B			34-50.7Mb/s
> 		1500B			4.7Gb/s
> 		30000B			14.5Gb/s

There is indeed some performance improvement, especially for large
buffers, so I'm fine in principle with changing these functions.
However, I have some comments below.

>  arch/arm64/lib/copy_from_user.S |  92 +++++++++++------
>  arch/arm64/lib/copy_template.S  | 213 ++++++++++++++++++++++++++++++++++++++++
>  arch/arm64/lib/copy_to_user.S   |  56 ++++++-----

We should try to share copy_template.S with the memcpy routine as well.
They are basically the same, just the labels for user access differ.
More about this below.

We also have copy_in_user which is very similar.

> --- a/arch/arm64/lib/copy_from_user.S
> +++ b/arch/arm64/lib/copy_from_user.S
> @@ -15,7 +15,6 @@
>   */
>  
>  #include <linux/linkage.h>
> -#include <asm/assembler.h>
>  
>  /*
>   * Copy from user space to a kernel buffer (alignment handled by the hardware)
[...]
> +9:
> +	/*
> +	 * count is accurate
> +	 * dst is accurate
> +	 */
> +	mov	x0, count
> +	sub	dst, dst, tmp1
> +	b	.Lfinalize
> +
> +10:
> +	/*
> +	 * count is in the last 15 bytes
> +	 * dst is somewhere in there
> +	 */
> +	mov	x0, count
> +	sub	dst, limit, count
> +	b	.Lfinalize

I'm confused by these labels and what they try to do. In the copy
template, 'count' is usually decremented before a set of post-indexed
load/store instructions is issued, so 'count' doesn't reliably tell you
how many bytes have been copied. For example, copy_from_user() can only
fault on a load instruction. When you handle such an exception, you want
to zero from the current dst (the store wasn't issued since the load
failed) to the end of the buffer (your limit).
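
Roughly something along these lines (an untested sketch only; it reuses
the template's 'limit'/'dst'/'count' register aliases and ignores the
pre-bias adjustments in the large-copy loop):

	.section .fixup,"ax"
	.align	2
9:	sub	x0, limit, dst		// bytes not copied
	mov	count, x0
	cbz	count, 2f
1:	strb	wzr, [dst], #1		// zero the rest of the kernel buffer
	subs	count, count, #1
	b.ne	1b
2:	ret
	.previous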

> --- /dev/null
> +++ b/arch/arm64/lib/copy_template.S
> @@ -0,0 +1,213 @@
[...]
> +#include <asm/assembler.h>
> +#include <asm/cache.h>
> +
> +dstin	.req x0
> +src	.req x1
> +count	.req x2
> +tmp1	.req x3
> +tmp1w	.req w3
> +tmp2	.req x4
> +tmp2w	.req w4
> +limit	.req x5
> +dst	.req x6
> +
> +A_l	.req x7
> +A_h	.req x8
> +B_l	.req x9
> +B_h	.req x10
> +C_l	.req x11
> +C_h	.req x12
> +D_l	.req x13
> +D_h	.req x14
> +
> +	mov	dst, dstin
> +	add	limit, dst, count
> +	cmp	count, #16
> +	b.lo	.Ltail15
> +
> +	/*
> +	 * We don't much care about the alignment of DST, but we want SRC
> +	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
> +	 * boundaries on both loads and stores.
> +	 */
> +	ands	tmp2, src, #15
> +	b.eq	.LSrcAligned
> +	sub	count, count, tmp2
> +
> +	tbz	tmp2, #0, 1f
> +	USER(11f, ldrb	tmp1w, [src], #1)
> +	USER(11f, strb	tmp1w, [dst], #1)

That's wrong. Depending on whether you want copy_from_user,
copy_to_user or copy_in_user, you need the USER annotation on the load,
the store or both respectively. It may be easier if we use asm macros
similar to the arm32 copy_template.S, or maybe some macros like TMPL_LD
and TMPL_ST which are either defined to USER or simply expand their
argument, depending on the function they are called from (then we can
easily share the template with memcpy and copy_in_user).
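
For example (untested; the TMPL_LD/TMPL_ST names are only a sketch of
the idea, and the fixup label would need to be chosen per caller):

	/* in copy_from_user.S, before pulling in copy_template.S */
	.macro TMPL_LD insn:vararg
	USER(9f, \insn)			// user loads may fault
	.endm
	.macro TMPL_ST insn:vararg
	\insn				// kernel stores need no annotation
	.endm

copy_to_user.S would swap the two, memcpy would annotate neither and
copy_in_user both. The template body then becomes e.g.:

	TMPL_LD	ldrb	tmp1w, [src], #1
	TMPL_ST	strb	tmp1w, [dst], #1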

> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
[...]
> +	.align    2
> +9:
> +10:
> +	/*
> +	 * count is accurate
> +	 */
> +	mov	x0, count
> +	b	.Lfinalize
> +11:
> +	/*
> +	 * count is over accounted by tmp2
> +	 */
> +	add	x0, count, tmp2
> +	b	.Lfinalize
> +12:
> +14:
> +	/*
> +	 * (count + 64) bytes remain
> +	 * dst is accurate
> +	 */
> +	adds	x0, count, #64
> +	b	.Lfinalize
> +13:
> +	/*
> +	 * (count + 128) bytes remain
> +	 */
> +	add	x0, count, #128
> +.Lfinalize:
>  	ret
>  	.previous

Can we not just use something like (limit - dst) here and avoid the
checks for whether 'count' was accurate or not?
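
I.e. a single fixup label, something along the lines of (untested
sketch; it may over-report slightly where dst is pre-biased):

	.section .fixup,"ax"
	.align	2
9:	sub	x0, limit, dst		// bytes not copied
	ret
	.previous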

Patch

diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 5e27add..0e79ed9 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -15,7 +15,6 @@ 
  */
 
 #include <linux/linkage.h>
-#include <asm/assembler.h>
 
 /*
  * Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -28,39 +27,68 @@ 
  *	x0 - bytes not copied
  */
 ENTRY(__copy_from_user)
-	add	x4, x1, x2			// upper user buffer boundary
-	subs	x2, x2, #8
-	b.mi	2f
-1:
-USER(9f, ldr	x3, [x1], #8	)
-	subs	x2, x2, #8
-	str	x3, [x0], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-USER(9f, ldr	w3, [x1], #4	)
-	sub	x2, x2, #4
-	str	w3, [x0], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-USER(9f, ldrh	w3, [x1], #2	)
-	sub	x2, x2, #2
-	strh	w3, [x0], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-USER(9f, ldrb	w3, [x1]	)
-	strb	w3, [x0]
-5:	mov	x0, #0
-	ret
+#include "copy_template.S"
 ENDPROC(__copy_from_user)
 
 	.section .fixup,"ax"
-	.align	2
-9:	sub	x2, x4, x1
-	mov	x3, x2
-10:	strb	wzr, [x0], #1			// zero remaining buffer space
-	subs	x3, x3, #1
-	b.ne	10b
-	mov	x0, x2				// bytes not copied
+	.align    2
+9:
+	/*
+	 * count is accurate
+	 * dst is accurate
+	 */
+	mov	x0, count
+	sub	dst, dst, tmp1
+	b	.Lfinalize
+
+10:
+	/*
+	 * count is in the last 15 bytes
+	 * dst is somewhere in there
+	 */
+	mov	x0, count
+	sub	dst, limit, count
+	b	.Lfinalize
+11:
+	/*
+	 * count over counted by tmp2
+	 * dst could be anywhere in there
+	 */
+	add	x0, count, tmp2
+	sub	dst, limit, x0
+	b	.Lfinalize
+12:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is accurate
+	 */
+	adds	x0, count, #64
+	b	.Lfinalize
+13:
+	/*
+	 * (count + 128) bytes remain
+	 * dst is pre-biased to (dst + 16)
+	 */
+	adds	x0, count, #128
+	add	dst, dst, #16
+	b	.Lfinalize
+14:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is pre-biased to (dst + 16)
+	 */
+	adds	x0, count, #64
+	add	dst, dst, #16
+
+.Lfinalize:
+	/*
+	 * Zero the remaining destination buffer
+	 */
+	mov	count, x0
+20:
+	/* Zero remaining buffer space */
+	strb	wzr, [dst], #1
+	subs	count, count, #1
+	b.ne	20b
 	ret
 	.previous
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
new file mode 100644
index 0000000..7a22c6b
--- /dev/null
+++ b/arch/arm64/lib/copy_template.S
@@ -0,0 +1,213 @@ 
+/*
+ * Copyright (c) 2013, Applied Micro Circuits Corporation
+ * Copyright (c) 2012-2013, Linaro Limited
+ *
+ * Author: Feng Kan <fkan@apm.com>
+ * Author: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
+ *
+ * The code is adopted from the memcpy routine by Linaro Limited.
+ *
+ * This file is free software: you may copy, redistribute and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *      1 Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      2 Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer in the
+ *        documentation and/or other materials provided with the distribution.
+ *      3 Neither the name of the Linaro nor the
+ *        names of its contributors may be used to endorse or promote products
+ *        derived from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+dstin	.req x0
+src	.req x1
+count	.req x2
+tmp1	.req x3
+tmp1w	.req w3
+tmp2	.req x4
+tmp2w	.req w4
+limit	.req x5
+dst	.req x6
+
+A_l	.req x7
+A_h	.req x8
+B_l	.req x9
+B_h	.req x10
+C_l	.req x11
+C_h	.req x12
+D_l	.req x13
+D_h	.req x14
+
+	mov	dst, dstin
+	add	limit, dst, count
+	cmp	count, #16
+	b.lo	.Ltail15
+
+	/*
+	 * We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.
+	 */
+	ands	tmp2, src, #15
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+
+	tbz	tmp2, #0, 1f
+	USER(11f, ldrb	tmp1w, [src], #1)
+	USER(11f, strb	tmp1w, [dst], #1)
+1:
+	tbz	tmp2, #1, 2f
+	USER(11f, ldrh	tmp1w, [src], #2)
+	USER(11f, strh	tmp1w, [dst], #2)
+2:
+	tbz	tmp2, #2, 3f
+	USER(11f, ldr	tmp1w, [src], #4)
+	USER(11f, str	tmp1w, [dst], #4)
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	USER(11f, ldr	tmp1, [src], #8)
+	USER(11f, str	tmp1, [dst], #8)
+
+.LSrcAligned:
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	USER(9f, ldp A_l, A_h, [src, #-48])
+	USER(9f, stp A_l, A_h, [dst, #-48])
+1:
+	USER(9f, ldp A_l, A_h, [src, #-32])
+	USER(9f, stp A_l, A_h, [dst, #-32])
+2:
+	USER(9f, ldp A_l, A_h, [src, #-16])
+	USER(9f, stp A_l, A_h, [dst, #-16])
+
+.Ltail15:
+	ands	count, count, #15
+	beq	.Lsuccess	/* Quick exit if we are done*/
+	/*
+	 * Copy up to 15 bytes of data.  Does not assume additional data
+	 * being copied.
+	 */
+	tbz	count, #3, 1f
+	USER(10f, ldr tmp1, [src], #8)
+	USER(10f, str tmp1, [dst], #8)
+1:
+	tbz	count, #2, 1f
+	USER(10f, ldr tmp1w, [src], #4)
+	USER(10f, str tmp1w, [dst], #4)
+1:
+	tbz	count, #1, 1f
+	USER(10f, ldrh tmp1w, [src], #2)
+	USER(10f, strh tmp1w, [dst], #2)
+1:
+	tbz	count, #0, 1f
+	USER(10f, ldrb tmp1w, [src], #1)
+	USER(10f, strb tmp1w, [dst], #1)
+1:
+	b	.Lsuccess
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	adds	count, count, #64
+	USER(12f, ldp A_l, A_h, [src])
+	USER(12f, ldp B_l, B_h, [src, #16])
+	USER(12f, ldp C_l, C_h, [src, #32])
+	USER(12f, ldp D_l, D_h, [src, #48])
+	USER(12f, stp A_l, A_h, [dst])
+	USER(12f, stp B_l, B_h, [dst, #16])
+	USER(12f, stp C_l, C_h, [dst, #32])
+	USER(12f, stp D_l, D_h, [dst, #48])
+	add	src, src, #64
+	add	dst, dst, #64
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lsuccess
+
+	/*
+	 * Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* There are at least 128 bytes to copy.  */
+	sub	dst, dst, #16			/* Pre-bias.  */
+	USER(13f, ldp A_l, A_h, [src, #0])
+	USER(13f, ldp B_l, B_h, [src, #16])
+	USER(13f, ldp C_l, C_h, [src, #32])
+	USER(13f, ldp D_l, D_h, [src, #48]!)	/* src += 64 - Pre-bias. */
+1:
+	USER(13f, stp A_l, A_h, [dst, #16])
+	USER(13f, ldp A_l, A_h, [src, #16])
+	USER(13f, stp B_l, B_h, [dst, #32])
+	USER(13f, ldp B_l, B_h, [src, #32])
+	USER(13f, stp C_l, C_h, [dst, #48])
+	USER(13f, ldp C_l, C_h, [src, #48])
+	USER(14f, stp D_l, D_h, [dst, #64]!)
+	USER(14f, ldp D_l, D_h, [src, #64]!)
+	subs	count, count, #64
+	b.ge	1b
+	USER(13f, stp A_l, A_h, [dst, #16])
+	USER(13f, stp B_l, B_h, [dst, #32])
+	USER(13f, stp C_l, C_h, [dst, #48])
+	USER(13f, stp D_l, D_h, [dst, #64])
+	add	src, src, #16
+	add	dst, dst, #80		/* 64 bytes + 16 prebias */
+	adds	count, count, #64
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lsuccess:
+	/* Nothing left to copy */
+	mov	x0, #0
+	ret
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a0aeeb9..2ce36de 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -15,7 +15,6 @@ 
  */
 
 #include <linux/linkage.h>
-#include <asm/assembler.h>
 
 /*
  * Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -28,34 +27,37 @@ 
  *	x0 - bytes not copied
  */
 ENTRY(__copy_to_user)
-	add	x4, x0, x2			// upper user buffer boundary
-	subs	x2, x2, #8
-	b.mi	2f
-1:
-	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-USER(9f, str	x3, [x0], #8	)
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-USER(9f, str	w3, [x0], #4	)
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-USER(9f, strh	w3, [x0], #2	)
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-USER(9f, strb	w3, [x0]	)
-5:	mov	x0, #0
-	ret
+#include "copy_template.S"
 ENDPROC(__copy_to_user)
 
 	.section .fixup,"ax"
-	.align	2
-9:	sub	x0, x4, x0			// bytes not copied
+	.align    2
+9:
+10:
+	/*
+	 * count is accurate
+	 */
+	mov	x0, count
+	b	.Lfinalize
+11:
+	/*
+	 * count is over accounted by tmp2
+	 */
+	add	x0, count, tmp2
+	b	.Lfinalize
+12:
+14:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is accurate
+	 */
+	adds	x0, count, #64
+	b	.Lfinalize
+13:
+	/*
+	 * (count + 128) bytes remain
+	 */
+	add	x0, count, #128
+.Lfinalize:
 	ret
 	.previous