From patchwork Wed Dec 11 06:24:39 2013
X-Patchwork-Submitter: zhichang.yuan@linaro.org
X-Patchwork-Id: 3322431
From: zhichang.yuan@linaro.org
To: linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will.deacon@arm.com
Cc: Deepak Saxena, liguozhu@huawei.com, "zhichang.yuan"
Subject: [PATCH 3/6] arm64: lib: Implement optimized memset routine
Date: Wed, 11 Dec 2013 14:24:39 +0800
Message-Id: <1386743082-5231-4-git-send-email-zhichang.yuan@linaro.org>
In-Reply-To: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>
References: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>

From: "zhichang.yuan" <zhichang.yuan@linaro.org>

This patch, based on Linaro's Cortex Strings library, improves the
performance of the assembly-optimized memset() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena
---
 arch/arm64/lib/memset.S | 227 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 201 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68..90b973e 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,13 +1,21 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
@@ -18,7 +26,7 @@
 #include <asm/assembler.h>
 
 /*
- * Fill in the buffer with character c (alignment handled by the hardware)
+ * Fill in the buffer with character c
  *
  * Parameters:
  *	x0 - buf
@@ -27,27 +35,194 @@
  * Returns:
  *	x0 - buf
  */
+
+/* By default we assume that the DC instruction can be used to zero
+* data blocks more efficiently. In some circumstances this might be
+* unsafe, for example in an asymmetric multiprocessor environment with
+* different DC clear lengths (neither the upper nor lower lengths are
+* safe to use). The feature can be disabled by defining DONT_USE_DC.
+*/
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp3		x9
+
 ENTRY(memset)
-	mov	x4, x0
-	and	w1, w1, #0xff
-	orr	w1, w1, w1, lsl #8
-	orr	w1, w1, w1, lsl #16
-	orr	x1, x1, x1, lsl #32
-	subs	x2, x2, #8
-	b.mi	2f
-1:	str	x1, [x4], #8
-	subs	x2, x2, #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	sub	x2, x2, #4
-	str	w1, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	sub	x2, x2, #2
-	strh	w1, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	strb	w1, [x4]
-5:	ret
+	mov	dst, dstin	/* Preserve return value.  */
+	and	A_lw, val, #255
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+
+	/* First align dst to 16 bytes. */
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	.Laligned
+
+	cmp	count, #15
+	b.le	.Ltail15tiny
+	/*
+	* The count is not less than 16, so we can use stp to set 16 bytes
+	* at once. This way is more efficient, although the store is unaligned.
+	*/
+	stp	A_l, A_l, [dst]
+	/* Make dst 16-byte aligned. */
+	sub	count, count, tmp2
+	add	dst, dst, tmp2
+
+	/* Here, dst is 16-byte aligned. */
+.Laligned:
+#ifndef DONT_USE_DC
+	cbz	A_l, .Lzero_mem
+#endif
+
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15tiny
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst], #16
+1:
+	stp	A_l, A_l, [dst], #16
+2:
+	stp	A_l, A_l, [dst], #16
+/*
+* The stores below are unaligned, but this is more efficient than
+* .Ltail15tiny. This code could be removed, at a small cost in
+* performance.
+*/
+	ands	count, count, #15
+	cbz	count, 1f
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+1:
+	ret
+
+.Ltail15tiny:
+	/* Set up to 15 bytes.  Does not assume earlier memory
+	being set.  */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line, this ensures the entire loop is in one line.
+	*/
+	.p2align	6
+.Lnot_short: /* count must be at least 64 */
+	sub	dst, dst, #16	/* Pre-bias.  */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+.Lexitfunc:
+	ret
+
+#ifndef DONT_USE_DC
+	/*
+	* For zeroing memory, check to see if we can use the ZVA feature to
+	* zero entire 'cache' lines.
+	*/
+.Lzero_mem:
+	cmp	count, #63
+	b.le	.Ltail63
+	/*
+	* For zeroing small amounts of memory, it's not worth setting up
+	* the line-clear code.
+	*/
+	cmp	count, #128
+	b.lt	.Lnot_short	/* count is at least 128 bytes */
+
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len
+
+	ands	tmp3w, zva_len, #63
+	/*
+	* Ensure that zva_len is at least 64.
+	* It is not meaningful to use ZVA if the block size is less than 64.
+	*/
+	b.ne	.Lnot_short
+.Lzero_by_line:
+	/*
+	* Compute how far we need to go to become suitably aligned. We're
+	* already at quad-word alignment.
+	*/
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	1f			/* Already aligned.  */
+	/* Not aligned, check that there's enough to set after alignment. */
+	sub	tmp1, count, tmp2
+	/*
+	* Guarantee that the remaining length to be zeroed is at least 64,
+	* so the store loop at 2f cannot run past the end of the buffer. */
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/*
+	* We know that there's at least 64 bytes to zero and that it's safe
+	* to overrun by 64 bytes.
+	*/
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards. */
+	add	dst, dst, tmp2
+1:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
+#endif	/* DONT_USE_DC */
 ENDPROC(memset)
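
Reviewer note (not part of the patch): the .Lzero_mem path reads DCZID_EL0 to
size its DC ZVA step. Bit 4 (DZP) set means DC ZVA is prohibited, and bits
[3:0] (BS) give log2 of the block size in 4-byte words, so the block size in
bytes is 4 << BS; the code then bails out to .Lnot_short unless that size is a
multiple of 64. A minimal C sketch of the same computation is shown below; the
helper names read_dczid_el0() and dczva_block_size() are invented here purely
for illustration.

#include <stdint.h>

/* Read the DCZID_EL0 system register (accessible from EL0 under Linux). */
static inline uint64_t read_dczid_el0(void)
{
	uint64_t val;

	asm volatile("mrs %0, dczid_el0" : "=r" (val));
	return val;
}

/*
 * Illustrative helper: DC ZVA block size in bytes, or 0 if the DZP bit
 * says DC ZVA must not be used.
 */
static inline unsigned int dczva_block_size(void)
{
	uint64_t dczid = read_dczid_el0();

	if (dczid & (1UL << 4))		/* DZP: DC ZVA prohibited. */
		return 0;

	/* BS (bits [3:0]) is log2 of the block size in 4-byte words. */
	return 4U << (dczid & 0xf);
}

On a CPU with a 64-byte ZVA block, dczva_block_size() returns 64, which is the
smallest block size the assembly accepts before taking the DC ZVA path.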