From patchwork Wed Dec 11 06:24:39 2013
X-Patchwork-Submitter: zhichang.yuan@linaro.org
X-Patchwork-Id: 3322431
From: zhichang.yuan@linaro.org
To: linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will.deacon@arm.com
Cc: Deepak Saxena, liguozhu@huawei.com, "zhichang.yuan"
Subject: [PATCH 3/6] arm64: lib: Implement optimized memset routine
Date: Wed, 11 Dec 2013 14:24:39 +0800
Message-Id: <1386743082-5231-4-git-send-email-zhichang.yuan@linaro.org>
In-Reply-To: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>
References: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>

From: "zhichang.yuan" <zhichang.yuan@linaro.org>

This patch, based on Linaro's Cortex Strings library, improves the
performance of the assembly-optimized memset() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena
---
 arch/arm64/lib/memset.S | 227 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 201 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68..90b973e 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,13 +1,21 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  *
  * You should have received a copy of the GNU General Public License
@@ -18,7 +26,7 @@
 #include <asm/assembler.h>
 
 /*
- * Fill in the buffer with character c (alignment handled by the hardware)
+ * Fill in the buffer with character c
  *
  * Parameters:
  *	x0 - buf
@@ -27,27 +35,194 @@
  * Returns:
  *	x0 - buf
  */
+
+/* By default we assume that the DC instruction can be used to zero
+* data blocks more efficiently. In some circumstances this might be
+* unsafe, for example in an asymmetric multiprocessor environment with
+* different DC clear lengths (neither the upper nor lower lengths are
+* safe to use). The feature can be disabled by defining DONT_USE_DC.
+*/
+
+#define dstin		x0
+#define val		w1
+#define count		x2
+#define tmp1		x3
+#define tmp1w		w3
+#define tmp2		x4
+#define tmp2w		w4
+#define zva_len_x	x5
+#define zva_len		w5
+#define zva_bits_x	x6
+
+#define A_l		x7
+#define A_lw		w7
+#define dst		x8
+#define tmp3w		w9
+#define tmp3		x9
+
 ENTRY(memset)
-	mov	x4, x0
-	and	w1, w1, #0xff
-	orr	w1, w1, w1, lsl #8
-	orr	w1, w1, w1, lsl #16
-	orr	x1, x1, x1, lsl #32
-	subs	x2, x2, #8
-	b.mi	2f
-1:	str	x1, [x4], #8
-	subs	x2, x2, #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	sub	x2, x2, #4
-	str	w1, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	sub	x2, x2, #2
-	strh	w1, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	strb	w1, [x4]
-5:	ret
+	mov	dst, dstin	/* Preserve return value.  */
+	and	A_lw, val, #255
+	orr	A_lw, A_lw, A_lw, lsl #8
+	orr	A_lw, A_lw, A_lw, lsl #16
+	orr	A_l, A_l, A_l, lsl #32
+
+	/* First align dst to 16 bytes. */
+	neg	tmp2, dst
+	ands	tmp2, tmp2, #15
+	b.eq	.Laligned
+
+	cmp	count, #15
+	b.le	.Ltail15tiny
+	/*
+	* The count is not less than 16, so we can use stp to set 16 bytes
+	* at once. This way is more efficient, although the store is unaligned.
+	*/
+	stp	A_l, A_l, [dst]
+	/* Make dst 16-byte aligned. */
+	sub	count, count, tmp2
+	add	dst, dst, tmp2
+
+	/* Here, dst is 16-byte aligned. */
+.Laligned:
+#ifndef DONT_USE_DC
+	cbz	A_l, .Lzero_mem
+#endif
+
+.Ltail_maybe_long:
+	cmp	count, #64
+	b.ge	.Lnot_short
+.Ltail63:
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15tiny
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	stp	A_l, A_l, [dst], #16
+1:
+	stp	A_l, A_l, [dst], #16
+2:
+	stp	A_l, A_l, [dst], #16
+/*
+* The stores below are unaligned, but this is more efficient than
+* .Ltail15tiny. This code could be removed, at a small cost in
+* performance.
+*/
+	ands	count, count, #15
+	cbz	count, 1f
+	add	dst, dst, count
+	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
+1:
+	ret
+
+.Ltail15tiny:
+	/* Set up to 15 bytes.  Does not assume earlier memory
+	being set.  */
+	tbz	count, #3, 1f
+	str	A_l, [dst], #8
+1:
+	tbz	count, #2, 1f
+	str	A_lw, [dst], #4
+1:
+	tbz	count, #1, 1f
+	strh	A_lw, [dst], #2
+1:
+	tbz	count, #0, 1f
+	strb	A_lw, [dst]
+1:
+	ret
+
+	/*
+	* Critical loop. Start at a new cache line boundary. Assuming
+	* 64 bytes per line, this ensures the entire loop is in one line.
+	*/
+	.p2align	6
+.Lnot_short: /* count must be at least 64 */
+	sub	dst, dst, #16	/* Pre-bias.  */
+	sub	count, count, #64
+1:
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	stp	A_l, A_l, [dst, #48]
+	stp	A_l, A_l, [dst, #64]!
+	subs	count, count, #64
+	b.ge	1b
+	tst	count, #0x3f
+	add	dst, dst, #16
+	b.ne	.Ltail63
+.Lexitfunc:
+	ret
+
+#ifndef DONT_USE_DC
+	/*
+	* For zeroing memory, check to see if we can use the ZVA feature to
+	* zero entire 'cache' lines.
+	*/
+.Lzero_mem:
+	cmp	count, #63
+	b.le	.Ltail63
+	/*
+	* For zeroing small amounts of memory, it's not worth setting up
+	* the line-clear code.
+	*/
+	cmp	count, #128
+	b.lt	.Lnot_short	/* count is at least 128 bytes */
+
+	mrs	tmp1, dczid_el0
+	tbnz	tmp1, #4, .Lnot_short
+	mov	tmp3w, #4
+	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
+	lsl	zva_len, tmp3w, zva_len
+
+	ands	tmp3w, zva_len, #63
+	/*
+	* Ensure that zva_len is at least 64.
+	* It is not meaningful to use ZVA if the block size is less than 64.
+	*/
+	b.ne	.Lnot_short
+.Lzero_by_line:
+	/*
+	* Compute how far we need to go to become suitably aligned. We're
+	* already at quad-word alignment.
+	*/
+	cmp	count, zva_len_x
+	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
+	sub	zva_bits_x, zva_len_x, #1
+	neg	tmp2, dst
+	ands	tmp2, tmp2, zva_bits_x
+	b.eq	1f			/* Already aligned.  */
+	/* Not aligned, check that there's enough to set after alignment. */
+	sub	tmp1, count, tmp2
+	/*
+	* Guarantee that the remaining length to be zeroed is at least 64,
+	* so the store loop at 2f cannot run past the end of the buffer. */
+	cmp	tmp1, #64
+	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
+	b.lt	.Lnot_short
+	/*
+	* We know that there's at least 64 bytes to zero and that it's safe
+	* to overrun by 64 bytes.
+	*/
+	mov	count, tmp1
+2:
+	stp	A_l, A_l, [dst]
+	stp	A_l, A_l, [dst, #16]
+	stp	A_l, A_l, [dst, #32]
+	subs	tmp2, tmp2, #64
+	stp	A_l, A_l, [dst, #48]
+	add	dst, dst, #64
+	b.ge	2b
+	/* We've overrun a bit, so adjust dst downwards. */
+	add	dst, dst, tmp2
+1:
+	sub	count, count, zva_len_x
+3:
+	dc	zva, dst
+	add	dst, dst, zva_len_x
+	subs	count, count, zva_len_x
+	b.ge	3b
+	ands	count, count, zva_bits_x
+	b.ne	.Ltail_maybe_long
+	ret
+#endif	/* DONT_USE_DC */
 ENDPROC(memset)
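
Reviewer note (not part of the patch): the .Lzero_mem path reads DCZID_EL0 to
size its DC ZVA step. Bit 4 (DZP) set means DC ZVA is prohibited, and bits
[3:0] (BS) give log2 of the block size in 4-byte words, so the block size in
bytes is 4 << BS; the code then bails out to .Lnot_short unless that size is a
multiple of 64. A minimal C sketch of the same computation is shown below; the
helper names read_dczid_el0() and dczva_block_size() are invented here purely
for illustration.

#include <stdint.h>

/* Read the DCZID_EL0 system register (accessible from EL0 under Linux). */
static inline uint64_t read_dczid_el0(void)
{
	uint64_t val;

	asm volatile("mrs %0, dczid_el0" : "=r" (val));
	return val;
}

/*
 * Illustrative helper: DC ZVA block size in bytes, or 0 if the DZP bit
 * says DC ZVA must not be used.
 */
static inline unsigned int dczva_block_size(void)
{
	uint64_t dczid = read_dczid_el0();

	if (dczid & (1UL << 4))		/* DZP: DC ZVA prohibited. */
		return 0;

	/* BS (bits [3:0]) is log2 of the block size in 4-byte words. */
	return 4U << (dczid & 0xf);
}

On a CPU with a 64-byte ZVA block, dczva_block_size() returns 64, which is the
smallest block size the assembly accepts before taking the DC ZVA path.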