[1/4] arm64: optimized copy_to_user assembly code

Message ID 1376591397-1776-1-git-send-email-fkan@apm.com (mailing list archive)
State New, archived

Commit Message

Feng Kan Aug. 15, 2013, 6:29 p.m. UTC
With the optimized copy_to_user code, iperf performance improved by 20%.

---
 arch/arm64/lib/copy_to_user.S |  249 +++++++++++++++++++++++++++++++++--------
 1 files changed, 204 insertions(+), 45 deletions(-)

Comments

Catalin Marinas Aug. 19, 2013, 5:22 p.m. UTC | #1
On Thu, Aug 15, 2013 at 07:29:57PM +0100, Feng Kan wrote:
> With the optimized copy_to_user code, iperf performance improved by 20%.

I have some questions first:

Was this code written from scratch or derived from something else? I can
see some Linaro copyright in there.

Since it is dual-licensed, have you replaced the original code entirely?

Is this patch part of a series? I can't seem to find the other 3 patches
(unless I deleted them by mistake).

> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index a0aeeb9..2a07b13 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -1,61 +1,220 @@
>  /*
> - * Copyright (C) 2012 ARM Ltd.
> + * Copyright (c) 2013, Applied Micro Circuits Corporation
> + * Copyright (c) 2012-2013, Linaro Limited
>   *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> + * Author: Feng Kan <fkan@apm.com>
> + * Author: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
>   *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> - * GNU General Public License for more details.
> + * The code is adopted from the memcpy routine by Linaro Limited.
>   *
> - * You should have received a copy of the GNU General Public License
> - * along with this program.  If not, see <http://www.gnu.org/licenses/>.
> + * This file is dual licensed: you can use it either under the terms of
> + * the GPL, or the BSD license, at your option.
> + *
> + *  a) This library is free software; you can redistribute it and/or
> + *     modify it under the terms of the GNU General Public License as
> + *     published by the Free Software Foundation; either version 2 of the
> + *     License, or (at your option) any later version.
> + *
> + *     This library is distributed in the hope that it will be useful,
> + *     but WITHOUT ANY WARRANTY; without even the implied warranty of
> + *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + *     GNU General Public License for more details.
> + *
> + *     You should have received a copy of the GNU General Public
> + *     License along with this library; if not, write to the Free
> + *     Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
> + *     MA 02110-1301 USA

This comes up regularly: please don't include the FSF postal address,
as they may relocate. Just point at the web address
(e.g. <http://www.gnu.org/licenses/>).

> -/*
> - * Copy to user space from a kernel buffer (alignment handled by the hardware)
> - *
> - * Parameters:
> - *	x0 - to
> - *	x1 - from
> - *	x2 - n
> - * Returns:
> - *	x0 - bytes not copied
> - */

What's the point of dropping some of the documentation (not that there
was much to begin with)?

> -ENTRY(__copy_to_user)
> -	add	x4, x0, x2			// upper user buffer boundary
> -	subs	x2, x2, #8
> -	b.mi	2f
> +#define dstin	x0
> +#define src	x1
> +#define count	x2
> +#define tmp1	x3
> +#define tmp1w	w3
> +#define tmp2	x4
> +#define tmp2w	w4
> +#define tmp3	x5
> +#define tmp3w	w5
> +#define dst	x6
> +
> +#define A_l	x7
> +#define A_h	x8
> +#define B_l	x9
> +#define B_h	x10
> +#define C_l	x11
> +#define C_h	x12
> +#define D_l	x13
> +#define D_h	x14

Could we use .req instead of #define?
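
Something like this, for example (untested sketch):

	dstin	.req	x0
	src	.req	x1
	count	.req	x2
	A_l	.req	x7
	A_h	.req	x8

.req aliases are handled by the assembler itself rather than the C
preprocessor, and can be released again with .unreq.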

> +
> +	/*
> +	 * Prototype: size_t copy_to_user (void *dst, const void *src, size_t nb)
> +	 */
> +	.text
> +	.align	2
> +	.p2align 6,,63

Do we need two align directives?
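
(For reference, .align 2 on arm64 takes a power of two, i.e. 4-byte
alignment, which .p2align 6 already implies. So, assuming the ",,63"
max-skip argument wasn't intentional, a single

	.p2align 6

would seem to be enough.)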

> +ENTRY(__copy_to_user)	
> +	mov	dst, dstin
> +	cmp	count, #64
> +	b.ge	.Lcpy_not_short
> +	cmp	count, #15
> +	b.le	.Ltail15tiny
> +
> +	/* Deal with small copies quickly by dropping straight into the
> +	 * exit block.  */
> +.Ltail63:
> +	/* Copy up to 48 bytes of data.  At this point we only need the
> +	 * bottom 6 bits of count to be accurate.  */
> +	ands	tmp1, count, #0x30
> +	b.eq	.Ltail15
> +	add	dst, dst, tmp1
> +	add	src, src, tmp1
> +	cmp	tmp1w, #0x20
> +	b.eq	1f
> +	b.lt	2f
> +	ldp	A_l, A_h, [src, #-48]
> +USER (9f, stp A_l, A_h, [dst, #-48])
> +1:
> +	ldp	A_l, A_h, [src, #-32]
> +USER (9f, stp A_l, A_h, [dst, #-32])
> +2:
> +	ldp	A_l, A_h, [src, #-16]
> +USER (9f, stp A_l, A_h, [dst, #-16])
> +
> +.Ltail15:
> +	ands	count, count, #15
> +	beq	1f
> +	add	src, src, count
> +	ldp	A_l, A_h, [src, #-16]
> +	add	dst, dst, count
> +USER (9f, stp A_l, A_h, [dst, #-16])
> +1:
> +	b	.Lsuccess
> +
> +.Ltail15tiny:
> +	/* Copy up to 15 bytes of data.  Does not assume additional data
> +	   being copied.  */
> +	tbz	count, #3, 1f
> +	ldr	tmp1, [src], #8
> +USER (9f, str tmp1, [dst], #8)
> +1:
> +	tbz	count, #2, 1f
> +	ldr	tmp1w, [src], #4
> +USER (9f, str tmp1w, [dst], #4)
>  1:
> -	ldr	x3, [x1], #8
> -	subs	x2, x2, #8
> -USER(9f, str	x3, [x0], #8	)
> -	b.pl	1b
> -2:	adds	x2, x2, #4
> -	b.mi	3f
> -	ldr	w3, [x1], #4
> -	sub	x2, x2, #4
> -USER(9f, str	w3, [x0], #4	)
> -3:	adds	x2, x2, #2
> -	b.mi	4f
> -	ldrh	w3, [x1], #2
> -	sub	x2, x2, #2
> -USER(9f, strh	w3, [x0], #2	)
> -4:	adds	x2, x2, #1
> -	b.mi	5f
> -	ldrb	w3, [x1]
> -USER(9f, strb	w3, [x0]	)
> -5:	mov	x0, #0
> +	tbz	count, #1, 1f
> +	ldrh	tmp1w, [src], #2
> +USER (9f, strh tmp1w, [dst], #2)
> +1:
> +	tbz	count, #0, 1f
> +	ldrb	tmp1w, [src]
> +USER (9f, strb tmp1w, [dst])
> +1:
> +	b	.Lsuccess
> +
> +.Lcpy_not_short:
> +	/* We don't much care about the alignment of DST, but we want SRC
> +	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
> +	 * boundaries on both loads and stores.  */
> +	neg	tmp2, src
> +	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
> +	b.eq	2f
> +	sub	count, count, tmp2
> +	/* Copy more data than needed; it's faster than jumping
> +	 * around copying sub-Quadword quantities.  We know that
> +	 * it can't overrun.  */
> +	ldp	A_l, A_h, [src]
> +	add	src, src, tmp2
> +USER (9f, stp A_l, A_h, [dst])
> +	add	dst, dst, tmp2
> +	/* There may be less than 63 bytes to go now.  */
> +	cmp	count, #63
> +	b.le	.Ltail63
> +2:
> +	subs	count, count, #128
> +	b.ge	.Lcpy_body_large
> +	/* Less than 128 bytes to copy, so handle 64 here and then jump
> +	 * to the tail.  */
> +	ldp	A_l, A_h, [src]
> +	ldp	B_l, B_h, [src, #16]
> +	ldp	C_l, C_h, [src, #32]
> +	ldp	D_l, D_h, [src, #48]
> +USER (9f, stp A_l, A_h, [dst])
> +USER (9f, stp B_l, B_h, [dst, #16])
> +USER (9f, stp C_l, C_h, [dst, #32])
> +USER (9f, stp D_l, D_h, [dst, #48])
> +	tst	count, #0x3f
> +	add	src, src, #64
> +	add	dst, dst, #64
> +	b.ne	.Ltail63
> +	b	.Lsuccess
> +
> +	/* Critical loop.  Start at a new cache line boundary.  Assuming
> +	 * 64 bytes per line this ensures the entire loop is in one line.  */
> +	.p2align 6
> +.Lcpy_body_large:
> +	/* There are at least 128 bytes to copy.  */
> +	ldp	A_l, A_h, [src, #0]
> +	sub	dst, dst, #16		/* Pre-bias.  */
> +	ldp	B_l, B_h, [src, #16]
> +	ldp	C_l, C_h, [src, #32]
> +	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
> +1:
> +USER (9f, stp A_l, A_h, [dst, #16])
> +	ldp	A_l, A_h, [src, #16]
> +USER (9f, stp B_l, B_h, [dst, #32])
> +	ldp	B_l, B_h, [src, #32]
> +USER (9f, stp C_l, C_h, [dst, #48])
> +	ldp	C_l, C_h, [src, #48]
> +USER (9f, stp D_l, D_h, [dst, #64]!)
> +	ldp	D_l, D_h, [src, #64]!
> +	subs	count, count, #64
> +	b.ge	1b
> +USER (9f, stp A_l, A_h, [dst, #16])
> +USER (9f, stp B_l, B_h, [dst, #32])
> +USER (9f, stp C_l, C_h, [dst, #48])
> +USER (9f, stp D_l, D_h, [dst, #64])
> +	add	src, src, #16
> +	add	dst, dst, #64 + 16
> +	tst	count, #0x3f
> +	b.ne	.Ltail63
> +.Lsuccess:
> +	mov	x0, #0		    // nothing left to copy
>  	ret
>  ENDPROC(__copy_to_user)
>  
>  	.section .fixup,"ax"
> -	.align	2
> -9:	sub	x0, x4, x0			// bytes not copied
> +	.align    2

The .align change is just whitespace.

> +9:	mov    x0, count            // approximate the number of bytes not copied

Is this an approximation or the actual number of bytes not copied?
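
(From a quick look it can only be an approximation: count is biased at
several points, e.g. it has already had 128 subtracted when the
sub-128-byte block after the "2:" label is reached, so a fault in one
of those stores would put a negative value in x0.)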

I haven't done a proper review of the algorithm yet, so I'll have more
comments later. For the next version there are also a few coding style
issues that need cleaning up (comment style, whitespace).
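
For example, the multi-line comments should follow the usual kernel
style:

	/*
	 * Copy up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate.
	 */

rather than the mixed styles inherited from the original memcpy.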

Patch

diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a0aeeb9..2a07b13 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -1,61 +1,220 @@ 
 /*
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (c) 2013, Applied Micro Circuits Corporation
+ * Copyright (c) 2012-2013, Linaro Limited
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Author: Feng Kan <fkan@apm.com>
+ * Author: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * The code is adopted from the memcpy routine by Linaro Limited.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ * This file is dual licensed: you can use it either under the terms of
+ * the GPL, or the BSD license, at your option.
+ *
+ *  a) This library is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License as
+ *     published by the Free Software Foundation; either version 2 of the
+ *     License, or (at your option) any later version.
+ *
+ *     This library is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public
+ *     License along with this library; if not, write to the Free
+ *     Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ *     MA 02110-1301 USA
+ *
+ * Alternatively,
+ *
+ *  b) Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *     1. Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *     2. Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ *     CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ *     INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *     MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *     DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ *     CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ *     NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ *     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ *     OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ *     EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
- * Copy to user space from a kernel buffer (alignment handled by the hardware)
- *
- * Parameters:
- *	x0 - to
- *	x1 - from
- *	x2 - n
- * Returns:
- *	x0 - bytes not copied
- */
-ENTRY(__copy_to_user)
-	add	x4, x0, x2			// upper user buffer boundary
-	subs	x2, x2, #8
-	b.mi	2f
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
+	/*
+	 * Prototype: size_t copy_to_user (void *dst, const void *src, size_t nb)
+	 */
+	.text
+	.align	2
+	.p2align 6,,63
+ENTRY(__copy_to_user)	
+	mov	dst, dstin
+	cmp	count, #64
+	b.ge	.Lcpy_not_short
+	cmp	count, #15
+	b.le	.Ltail15tiny
+
+	/* Deal with small copies quickly by dropping straight into the
+	 * exit block.  */
+.Ltail63:
+	/* Copy up to 48 bytes of data.  At this point we only need the
+	 * bottom 6 bits of count to be accurate.  */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src, #-48]
+USER (9f, stp A_l, A_h, [dst, #-48])
+1:
+	ldp	A_l, A_h, [src, #-32]
+USER (9f, stp A_l, A_h, [dst, #-32])
+2:
+	ldp	A_l, A_h, [src, #-16]
+USER (9f, stp A_l, A_h, [dst, #-16])
+
+.Ltail15:
+	ands	count, count, #15
+	beq	1f
+	add	src, src, count
+	ldp	A_l, A_h, [src, #-16]
+	add	dst, dst, count
+USER (9f, stp A_l, A_h, [dst, #-16])
+1:
+	b	.Lsuccess
+
+.Ltail15tiny:
+	/* Copy up to 15 bytes of data.  Does not assume additional data
+	   being copied.  */
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+USER (9f, str tmp1, [dst], #8)
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+USER (9f, str tmp1w, [dst], #4)
 1:
-	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-USER(9f, str	x3, [x0], #8	)
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-USER(9f, str	w3, [x0], #4	)
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-USER(9f, strh	w3, [x0], #2	)
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-USER(9f, strb	w3, [x0]	)
-5:	mov	x0, #0
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+USER (9f, strh tmp1w, [dst], #2)
+1:
+	tbz	count, #0, 1f
+	ldrb	tmp1w, [src]
+USER (9f, strb tmp1w, [dst])
+1:
+	b	.Lsuccess
+
+.Lcpy_not_short:
+	/* We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.  */
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15		/* Bytes to reach alignment.  */
+	b.eq	2f
+	sub	count, count, tmp2
+	/* Copy more data than needed; it's faster than jumping
+	 * around copying sub-Quadword quantities.  We know that
+	 * it can't overrun.  */
+	ldp	A_l, A_h, [src]
+	add	src, src, tmp2
+USER (9f, stp A_l, A_h, [dst])
+	add	dst, dst, tmp2
+	/* There may be less than 63 bytes to go now.  */
+	cmp	count, #63
+	b.le	.Ltail63
+2:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/* Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.  */
+	ldp	A_l, A_h, [src]
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]
+USER (9f, stp A_l, A_h, [dst])
+USER (9f, stp B_l, B_h, [dst, #16])
+USER (9f, stp C_l, C_h, [dst, #32])
+USER (9f, stp D_l, D_h, [dst, #48])
+	tst	count, #0x3f
+	add	src, src, #64
+	add	dst, dst, #64
+	b.ne	.Ltail63
+	b	.Lsuccess
+
+	/* Critical loop.  Start at a new cache line boundary.  Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.  */
+	.p2align 6
+.Lcpy_body_large:
+	/* There are at least 128 bytes to copy.  */
+	ldp	A_l, A_h, [src, #0]
+	sub	dst, dst, #16		/* Pre-bias.  */
+	ldp	B_l, B_h, [src, #16]
+	ldp	C_l, C_h, [src, #32]
+	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias.  */
+1:
+USER (9f, stp A_l, A_h, [dst, #16])
+	ldp	A_l, A_h, [src, #16]
+USER (9f, stp B_l, B_h, [dst, #32])
+	ldp	B_l, B_h, [src, #32]
+USER (9f, stp C_l, C_h, [dst, #48])
+	ldp	C_l, C_h, [src, #48]
+USER (9f, stp D_l, D_h, [dst, #64]!)
+	ldp	D_l, D_h, [src, #64]!
+	subs	count, count, #64
+	b.ge	1b
+USER (9f, stp A_l, A_h, [dst, #16])
+USER (9f, stp B_l, B_h, [dst, #32])
+USER (9f, stp C_l, C_h, [dst, #48])
+USER (9f, stp D_l, D_h, [dst, #64])
+	add	src, src, #16
+	add	dst, dst, #64 + 16
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lsuccess:
+	mov	x0, #0		    // nothing left to copy
 	ret
 ENDPROC(__copy_to_user)
 
 	.section .fixup,"ax"
-	.align	2
-9:	sub	x0, x4, x0			// bytes not copied
+	.align    2
+9:	mov    x0, count            // approximate the number of bytes not copied
 	ret
 	.previous