From patchwork Fri Aug 21 22:01:33 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Feng Kan
X-Patchwork-Id: 7054211
From: Feng Kan
To: patches@apm.com, linux-arm-kernel@lists.infradead.org,
 linux-kernel@vger.kernel.org, philipp.tomsich@theobroma-systems.com,
 dann.frazier@canonical.com, tim.gardner@canonical.com,
 craig.magina@canonical.com, soni.trilok.oss@gmail.com
Cc: Feng Kan, Balamurugan Shanmugam
Subject: [PATCH V4 1/2] arm64: copy_to-from-in_user optimization using copy template
Date: Fri, 21 Aug 2015 15:01:33 -0700
Message-Id: <1440194493-4907-1-git-send-email-fkan@apm.com>
X-Mailer: git-send-email 1.9.1

This patch optimizes copy_to_user, copy_from_user and copy_in_user for
the arm64 architecture. The copy template uses memcpy.S as its base,
which allows the same copy routine to be shared by all of the copy*.S
files.

Signed-off-by: Feng Kan
Signed-off-by: Balamurugan Shanmugam
---
 arch/arm64/lib/copy_from_user.S |  78 +++++++++-------
 arch/arm64/lib/copy_in_user.S   |  66 ++++++++------
 arch/arm64/lib/copy_template.S  | 196 ++++++++++++++++++++++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S   |  66 ++++++++------
 4 files changed, 314 insertions(+), 92 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template.S

diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 1be9ef2..cb085cf 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -18,6 +18,7 @@
 
 #include <asm/alternative.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 #include <asm/cpufeature.h>
 #include <asm/sysreg.h>
@@ -31,49 +32,58 @@
  * Returns:
  *     x0 - bytes not copied
  */
+
+       .macro ldrb1 label, ptr, regB, val
+       USER(\label, ldrb \ptr, [\regB], \val)
+       .endm
+
+       .macro strb1 label, ptr, regB, val
+       strb \ptr, [\regB], \val
+       .endm
+
+       .macro ldrh1 label, ptr, regB, val
+       USER(\label, ldrh \ptr, [\regB], \val)
+       .endm
+
+       .macro strh1 label, ptr, regB, val
+       strh \ptr, [\regB], \val
+       .endm
+
+       .macro ldr1 label, ptr, regB, val
+       USER(\label, ldr \ptr, [\regB], \val)
+       .endm
+
+       .macro str1 label, ptr, regB, val
+       str \ptr, [\regB], \val
+       .endm
+
+       .macro ldp1 label, ptr, regB, regC, val
+       USER(\label, ldp \ptr, \regB, [\regC], \val)
+       .endm
+
+       .macro stp1 label, ptr, regB, regC, val
+       stp \ptr, \regB, [\regC], \val
+       .endm
+
 ENTRY(__copy_from_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
-       add     x5, x1, x2                      // upper user buffer boundary
-       subs    x2, x2, #16
-       b.mi    1f
-0:
-USER(9f, ldp   x3, x4, [x1], #16)
-       subs    x2, x2, #16
-       stp     x3, x4, [x0], #16
-       b.pl    0b
-1:     adds    x2, x2, #8
-       b.mi    2f
-USER(9f, ldr   x3, [x1], #8    )
-       sub     x2, x2, #8
-       str     x3, [x0], #8
-2:     adds    x2, x2, #4
-       b.mi    3f
-USER(9f, ldr   w3, [x1], #4    )
-       sub     x2, x2, #4
-       str     w3, [x0], #4
-3:     adds    x2, x2, #2
-       b.mi    4f
-USER(9f, ldrh  w3, [x1], #2    )
-       sub     x2, x2, #2
-       strh    w3, [x0], #2
-4:     adds    x2, x2, #1
-       b.mi    5f
-USER(9f, ldrb  w3, [x1]        )
-       strb    w3, [x0]
-5:     mov     x0, #0
+#include "copy_template.S"
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
+       mov     x0, #0                          // Nothing to copy
        ret
 ENDPROC(__copy_from_user)
 
        .section .fixup,"ax"
        .align  2
-9:     sub     x2, x5, x1
-       mov     x3, x2
-10:    strb    wzr, [x0], #1                   // zero remaining buffer space
-       subs    x3, x3, #1
-       b.ne    10b
-       mov     x0, x2                          // bytes not copied
+11:
+       sub     x4, tmp3, dst
+       mov     x0, x4
+       sub     dst, tmp3, x4
+
+20:    strb    wzr, [dst], #1                  // zero remaining buffer space
+       subs    x4, x4, #1
+       b.ne    20b
        ret
        .previous

diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S
index 1b94661e..b54d44e 100644
--- a/arch/arm64/lib/copy_in_user.S
+++ b/arch/arm64/lib/copy_in_user.S
@@ -20,6 +20,7 @@
 
 #include <asm/alternative.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 #include <asm/cpufeature.h>
 #include <asm/sysreg.h>
@@ -33,44 +34,51 @@
  * Returns:
  *     x0 - bytes not copied
  */
+       .macro ldrb1 label, ptr, regB, val
+       USER(\label, ldrb \ptr, [\regB], \val)
+       .endm
+
+       .macro strb1 label, ptr, regB, val
+       USER(\label, strb \ptr, [\regB], \val)
+       .endm
+
+       .macro ldrh1 label, ptr, regB, val
+       USER(\label, ldrh \ptr, [\regB], \val)
+       .endm
+
+       .macro strh1 label, ptr, regB, val
+       USER(\label, strh \ptr, [\regB], \val)
+       .endm
+
+       .macro ldr1 label, ptr, regB, val
+       USER(\label, ldr \ptr, [\regB], \val)
+       .endm
+
+       .macro str1 label, ptr, regB, val
+       USER(\label, str \ptr, [\regB], \val)
+       .endm
+
+       .macro ldp1 label, ptr, regB, regC, val
+       USER(\label, ldp \ptr, \regB, [\regC], \val)
+       .endm
+
+       .macro stp1 label, ptr, regB, regC, val
+       USER(\label, stp \ptr, \regB, [\regC], \val)
+       .endm
+
 ENTRY(__copy_in_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
-       add     x5, x0, x2                      // upper user buffer boundary
-       subs    x2, x2, #16
-       b.mi    1f
-0:
-USER(9f, ldp   x3, x4, [x1], #16)
-       subs    x2, x2, #16
-USER(9f, stp   x3, x4, [x0], #16)
-       b.pl    0b
-1:     adds    x2, x2, #8
-       b.mi    2f
-USER(9f, ldr   x3, [x1], #8    )
-       sub     x2, x2, #8
-USER(9f, str   x3, [x0], #8    )
-2:     adds    x2, x2, #4
-       b.mi    3f
-USER(9f, ldr   w3, [x1], #4    )
-       sub     x2, x2, #4
-USER(9f, str   w3, [x0], #4    )
-3:     adds    x2, x2, #2
-       b.mi    4f
-USER(9f, ldrh  w3, [x1], #2    )
-       sub     x2, x2, #2
-USER(9f, strh  w3, [x0], #2    )
-4:     adds    x2, x2, #1
-       b.mi    5f
-USER(9f, ldrb  w3, [x1]        )
-USER(9f, strb  w3, [x0]        )
-5:     mov     x0, #0
+#include "copy_template.S"
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
+       mov     x0, #0
        ret
 ENDPROC(__copy_in_user)
 
        .section .fixup,"ax"
        .align  2
-9:     sub     x0, x5, x0                      // bytes not copied
+11:    sub     tmp3, tmp3, dst                 // bytes not copied
+       mov     x0, tmp3
        ret
        .previous

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
new file mode 100644
index 0000000..c9ece2f
--- /dev/null
+++ b/arch/arm64/lib/copy_template.S
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+/*
+ * Copy a buffer from src to dest (alignment handled by the hardware)
+ *
+ * Parameters:
+ *     x0 - dest
+ *     x1 - src
+ *     x2 - n
+ * Returns:
+ *     x0 - dest
+ */
+dstin  .req    x0
+src    .req    x1
+count  .req    x2
+tmp1   .req    x3
+tmp1w  .req    w3
+tmp2   .req    x4
+tmp2w  .req    w4
+tmp3   .req    x5
+tmp3w  .req    w5
+dst    .req    x6
+
+A_l    .req    x7
+A_h    .req    x8
+B_l    .req    x9
+B_h    .req    x10
+C_l    .req    x11
+C_h    .req    x12
+D_l    .req    x13
+D_h    .req    x14
+
+       mov     dst, dstin
+       add     tmp3, dst, count
+       cmp     count, #16
+       /*When memory length is less than 16, the accessed are not aligned.*/
+       b.lo    .Ltiny15
+
+       neg     tmp2, src
+       ands    tmp2, tmp2, #15/* Bytes to reach alignment. */
+       b.eq    .LSrcAligned
+       sub     count, count, tmp2
+       /*
+        * Copy the leading memory data from src to dst in an increasing
+        * address order.By this way,the risk of overwritting the source
+        * memory data is eliminated when the distance between src and
+        * dst is less than 16. The memory accesses here are alignment.
+        */
+       tbz     tmp2, #0, 1f
+       ldrb1   11f, tmp1w, src, #1
+       strb1   11f, tmp1w, dst, #1
+1:
+       tbz     tmp2, #1, 2f
+       ldrh1   11f, tmp1w, src, #2
+       strh1   11f, tmp1w, dst, #2
+2:
+       tbz     tmp2, #2, 3f
+       ldr1    11f, tmp1w, src, #4
+       str1    11f, tmp1w, dst, #4
+3:
+       tbz     tmp2, #3, .LSrcAligned
+       ldr1    11f, tmp1, src, #8
+       str1    11f, tmp1, dst, #8
+
+.LSrcAligned:
+       cmp     count, #64
+       b.ge    .Lcpy_over64
+       /*
+        * Deal with small copies quickly by dropping straight into the
+        * exit block.
+        */
+.Ltail63:
+       /*
+        * Copy up to 48 bytes of data. At this point we only need the
+        * bottom 6 bits of count to be accurate.
+        */
+       ands    tmp1, count, #0x30
+       b.eq    .Ltiny15
+       cmp     tmp1w, #0x20
+       b.eq    1f
+       b.lt    2f
+       ldp1    11f, A_l, A_h, src, #16
+       stp1    11f, A_l, A_h, dst, #16
+1:
+       ldp1    11f, A_l, A_h, src, #16
+       stp1    11f, A_l, A_h, dst, #16
+2:
+       ldp1    11f, A_l, A_h, src, #16
+       stp1    11f, A_l, A_h, dst, #16
+.Ltiny15:
+       /*
+        * Prefer to break one ldp/stp into several load/store to access
+        * memory in an increasing address order,rather than to load/store 16
+        * bytes from (src-16) to (dst-16) and to backward the src to aligned
+        * address,which way is used in original cortex memcpy. If keeping
+        * the original memcpy process here, memmove need to satisfy the
+        * precondition that src address is at least 16 bytes bigger than dst
+        * address,otherwise some source data will be overwritten when memove
+        * call memcpy directly. To make memmove simpler and decouple the
+        * memcpy's dependency on memmove, withdrew the original process.
+        */
+       tbz     count, #3, 1f
+       ldr1    11f, tmp1, src, #8
+       str1    11f, tmp1, dst, #8
+1:
+       tbz     count, #2, 2f
+       ldr1    11f, tmp1w, src, #4
+       str1    11f, tmp1w, dst, #4
+2:
+       tbz     count, #1, 3f
+       ldrh1   11f, tmp1w, src, #2
+       strh1   11f, tmp1w, dst, #2
+3:
+       tbz     count, #0, .Lexitfunc
+       ldrb1   11f, tmp1w, src, #1
+       strb1   11f, tmp1w, dst, #1
+
+       b       .Lexitfunc
+
+.Lcpy_over64:
+       subs    count, count, #128
+       b.ge    .Lcpy_body_large
+       /*
+        * Less than 128 bytes to copy, so handle 64 here and then jump
+        * to the tail.
+        */
+       ldp1    11f, A_l, A_h, src, #16
+       stp1    11f, A_l, A_h, dst, #16
+       ldp1    11f, B_l, B_h, src, #16
+       ldp1    11f, C_l, C_h, src, #16
+       stp1    11f, B_l, B_h, dst, #16
+       stp1    11f, C_l, C_h, dst, #16
+       ldp1    11f, D_l, D_h, src, #16
+       stp1    11f, D_l, D_h, dst, #16
+
+       tst     count, #0x3f
+       b.ne    .Ltail63
+       b       .Lexitfunc
+
+       /*
+        * Critical loop.  Start at a new cache line boundary.  Assuming
+        * 64 bytes per line this ensures the entire loop is in one line.
+        */
+       .p2align        L1_CACHE_SHIFT
+.Lcpy_body_large:
+       /* pre-get 64 bytes data. */
+       ldp1    11f, A_l, A_h, src, #16
+       ldp1    11f, B_l, B_h, src, #16
+       ldp1    11f, C_l, C_h, src, #16
+       ldp1    11f, D_l, D_h, src, #16
+1:
+       /*
+        * interlace the load of next 64 bytes data block with store of the last
+        * loaded 64 bytes data.
+        */
+       stp1    11f, A_l, A_h, dst, #16
+       ldp1    11f, A_l, A_h, src, #16
+       stp1    11f, B_l, B_h, dst, #16
+       ldp1    11f, B_l, B_h, src, #16
+       stp1    11f, C_l, C_h, dst, #16
+       ldp1    11f, C_l, C_h, src, #16
+       stp1    11f, D_l, D_h, dst, #16
+       ldp1    11f, D_l, D_h, src, #16
+       subs    count, count, #64
+       b.ge    1b
+       stp1    11f, A_l, A_h, dst, #16
+       stp1    11f, B_l, B_h, dst, #16
+       stp1    11f, C_l, C_h, dst, #16
+       stp1    11f, D_l, D_h, dst, #16
+
+       tst     count, #0x3f
+       b.ne    .Ltail63
+.Lexitfunc:
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a257b47..0ef3eb2 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -18,6 +18,7 @@
 
 #include <asm/alternative.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 #include <asm/cpufeature.h>
 #include <asm/sysreg.h>
@@ -31,44 +32,51 @@
  * Returns:
  *     x0 - bytes not copied
  */
+       .macro ldrb1 label, ptr, regB, val
+       ldrb \ptr, [\regB], \val
+       .endm
+
+       .macro strb1 label, ptr, regB, val
+       USER(\label, strb \ptr, [\regB], \val)
+       .endm
+
+       .macro ldrh1 label, ptr, regB, val
+       ldrh \ptr, [\regB], \val
+       .endm
+
+       .macro strh1 label, ptr, regB, val
+       USER(\label, strh \ptr, [\regB], \val)
+       .endm
+
+       .macro ldr1 label, ptr, regB, val
+       ldr \ptr, [\regB], \val
+       .endm
+
+       .macro str1 label, ptr, regB, val
+       USER(\label, str \ptr, [\regB], \val)
+       .endm
+
+       .macro ldp1 label, ptr, regB, regC, val
+       ldp \ptr, \regB, [\regC], \val
+       .endm
+
+       .macro stp1 label, ptr, regB, regC, val
+       USER(\label, stp \ptr, \regB, [\regC], \val)
+       .endm
+
 ENTRY(__copy_to_user)
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
-       add     x5, x0, x2                      // upper user buffer boundary
-       subs    x2, x2, #16
-       b.mi    1f
-0:
-       ldp     x3, x4, [x1], #16
-       subs    x2, x2, #16
-USER(9f, stp   x3, x4, [x0], #16)
-       b.pl    0b
-1:     adds    x2, x2, #8
-       b.mi    2f
-       ldr     x3, [x1], #8
-       sub     x2, x2, #8
-USER(9f, str   x3, [x0], #8    )
-2:     adds    x2, x2, #4
-       b.mi    3f
-       ldr     w3, [x1], #4
-       sub     x2, x2, #4
-USER(9f, str   w3, [x0], #4    )
-3:     adds    x2, x2, #2
-       b.mi    4f
-       ldrh    w3, [x1], #2
-       sub     x2, x2, #2
-USER(9f, strh  w3, [x0], #2    )
-4:     adds    x2, x2, #1
-       b.mi    5f
-       ldrb    w3, [x1]
-USER(9f, strb  w3, [x0]        )
-5:     mov     x0, #0
+#include "copy_template.S"
 ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \
            CONFIG_ARM64_PAN)
+       mov     x0, #0
        ret
 ENDPROC(__copy_to_user)
 
        .section .fixup,"ax"
        .align  2
-9:     sub     x0, x5, x0                      // bytes not copied
+11:    sub     tmp3, tmp3, dst                 // bytes not copied
+       mov     x0, tmp3
        ret
        .previous
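
For readers new to the template trick used above: copy_template.S is assembled once
per including file, and each of copy_to_user.S, copy_from_user.S and copy_in_user.S
defines the eight ldrb1/strb1/ldrh1/strh1/ldr1/str1/ldp1/stp1 macros before pulling
the template in, wrapping only the accesses that may fault in USER() so that a fault
branches to that file's local fixup at label 11. A rough user-space C analogy of the
same "one body, per-includer primitives" idea is sketched below; it is not part of
the patch, and every name in it is invented for illustration.

    #include <stddef.h>
    #include <stdio.h>

    /* The "template": one copy body, parameterised by LOAD1/STORE1. */
    #define DEFINE_BYTE_COPY(fn)                                        \
    static size_t fn(unsigned char *dst, const unsigned char *src,      \
                     size_t n)                                          \
    {                                                                   \
            size_t i;                                                   \
                                                                        \
            for (i = 0; i < n; i++)                                     \
                    STORE1(dst + i, LOAD1(src + i));                    \
            return n - i;   /* bytes not copied, like x0 on return */   \
    }

    /* One "including file": plain loads and stores, no fault handling. */
    #define LOAD1(p)        (*(p))
    #define STORE1(p, v)    (*(p) = (v))
    DEFINE_BYTE_COPY(demo_plain_copy)
    #undef LOAD1
    #undef STORE1

    int main(void)
    {
            unsigned char s[4] = { 1, 2, 3, 4 }, d[4] = { 0 };
            size_t left = demo_plain_copy(d, s, sizeof(s));

            printf("not copied: %zu, d[3] = %u\n", left, (unsigned int)d[3]);
            return 0;
    }

A second "including file" could instead route LOAD1/STORE1 through fault-checking
helpers, which is the role the USER()-wrapped macro variants play in the three
uaccess files in this patch.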
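
The fixup code keeps the existing uaccess contract: on a fault, x0 is set to the
number of bytes that were not copied (tmp3 holds the end of the destination buffer,
so tmp3 - dst is the remainder), and __copy_from_user additionally zeroes the part
of the kernel buffer it never filled. A minimal caller-side sketch of that contract
follows; it is ordinary kernel C, not part of this patch, and the struct/function
names are invented.

    /*
     * copy_from_user() returns the number of bytes NOT copied (0 on
     * success), and the uncopied tail of the kernel buffer is zeroed.
     */
    #include <linux/errno.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct demo_arg {
            __u32 flags;
            __u64 addr;
    };

    static long demo_fetch_arg(struct demo_arg *dst, const void __user *src)
    {
            /* A non-zero return means a fault in the user buffer. */
            if (copy_from_user(dst, src, sizeof(*dst)))
                    return -EFAULT;

            return 0;
    }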