From patchwork Wed Dec 11 06:24:40 2013
From: zhichang.yuan@linaro.org
To: linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com,
	will.deacon@arm.com
Cc: Deepak Saxena, liguozhu@huawei.com, "zhichang.yuan"
Subject: [PATCH 4/6] arm64: lib: Implement optimized memcmp routine
Date: Wed, 11 Dec 2013 14:24:40 +0800
Message-Id: <1386743082-5231-5-git-send-email-zhichang.yuan@linaro.org>
X-Mailer: git-send-email 1.7.9.5
In-Reply-To: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>
References: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>

From: "zhichang.yuan"

This patch, based on Linaro's Cortex Strings library, adds an
assembly-optimized memcmp() function.

Signed-off-by: Zhichang Yuan
Signed-off-by: Deepak Saxena
---
 arch/arm64/include/asm/string.h |    3 +
 arch/arm64/kernel/arm64ksyms.c  |    1 +
 arch/arm64/lib/Makefile         |    2 +-
 arch/arm64/lib/memcmp.S         |  258 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/memcmp.S

diff --git a/arch/arm64/include/asm/string.h b/arch/arm64/include/asm/string.h
index 3ee8b30..3a43305 100644
--- a/arch/arm64/include/asm/string.h
+++ b/arch/arm64/include/asm/string.h
@@ -34,4 +34,7 @@ extern void *memchr(const void *, int, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void *memset(void *, int, __kernel_size_t);
 
+#define __HAVE_ARCH_MEMCMP
+extern int memcmp(const void *, const void *, size_t);
+
 #endif
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index e7ee770..af02a25 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -51,6 +51,7 @@ EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memchr);
+EXPORT_SYMBOL(memcmp);
 
 /* atomic bitops */
 EXPORT_SYMBOL(set_bit);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 59acc0e..a6a8d3d 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -2,5 +2,5 @@ lib-y		:= bitops.o delay.o					\
 	   strncpy_from_user.o strnlen_user.o clear_user.o	\
 	   copy_from_user.o copy_to_user.o copy_in_user.o	\
 	   copy_page.o clear_page.o				\
-	   memchr.o memcpy.o memmove.o memset.o			\
+	   memchr.o memcpy.o memmove.o memset.o memcmp.o	\
 	   strchr.o strrchr.o
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
new file mode 100644
index 0000000..97e8431
--- /dev/null
+++ b/arch/arm64/lib/memcmp.S
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+* compare memory areas (when the two areas have different alignment
+* offsets, the unaligned accesses are handled by the hardware)
+*
+* Parameters:
+*  x0 - const memory area 1 pointer
+*  x1 - const memory area 2 pointer
+*  x2 - the maximal compare byte length
+* Returns:
+*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
+*/
+
+/* Parameters and result.  */
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		x0
+
+/* Internal variables.  */
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define has_nul		x5
+#define diff		x6
+#define endloop		x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define pos		x11
+#define limit_wd	x12
+#define mask		x13
+
+ENTRY(memcmp)
+	cbz	limit, .Lret0
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	b.ne	.Lmisaligned8
+	ands	tmp1, src1, #7
+	b.ne	.Lmutual_align
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */
+.Lloop_aligned:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned:
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
+	cbz	endloop, .Lloop_aligned
+
+	/* We have not reached the limit, so we must have found a diff.  */
+	tbz	limit_wd, #63, .Lnot_limit
+
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lnot_limit
+
+	lsl	limit, limit, #3	/* Bytes -> bits.  */
+	mov	mask, #~0
+#ifdef __ARM64EB__
+	lsr	mask, mask, limit
+#else
+	lsl	mask, mask, limit
+#endif
+	bic	data1, data1, mask
+	bic	data2, data2, mask
+
+	orr	diff, diff, mask
+	b	.Lnot_limit
+
+.Lmutual_align:
+	/*
+	 * Sources are mutually aligned, but are not currently at an
+	 * alignment boundary. Round down the addresses and then mask off
+	 * the bytes that precede the start point.
+	 */
+	bic	src1, src1, #7
+	bic	src2, src2, #7
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
+	and	tmp3, limit_wd, #7
+	lsr	limit_wd, limit_wd, #3
+	add	tmp3, tmp3, tmp1
+	add	limit_wd, limit_wd, tmp3, lsr #3
+	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
+	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
+	neg	tmp1, tmp1		/* Bits to alignment -64.  */
+	mov	tmp2, #~0
+#ifdef __ARM64EB__
+	/* Big-endian. Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#else
+	/* Little-endian. Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
+#endif
+	orr	data1, data1, tmp2
+	orr	data2, data2, tmp2
+	b	.Lstart_realigned
+
+.Lmisaligned8:
+	cmp	limit, #8
+	b.lo	.Ltiny8proc	/* limit < 8...  */
+	/*
+	 * Get the align offset length to compare per byte first.
+	 * After this process, one of the source addresses will be aligned.
+	 */
+	and	tmp1, src1, #7
+	neg	tmp1, tmp1
+	add	tmp1, tmp1, #8
+	and	tmp2, src2, #7
+	neg	tmp2, tmp2
+	add	tmp2, tmp2, #8
+	subs	tmp3, tmp1, tmp2
+	csel	pos, tmp1, tmp2, hi	/* Choose the maximum.  */
+	/*
+	 * Here, limit is not less than 8, so directly run .Ltinycmp
+	 * without checking the limit first.
+	 */
+	sub	limit, limit, pos
+.Ltinycmp:
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	pos, pos, #1
+	ccmp	data1w, data2w, #0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Ltinycmp
+	cbnz	pos, 1f			/* Found an unequal byte.  */
+	cmp	data1w, data2w
+	b.eq	.Lstart_align		/* The last bytes are equal.  */
+1:
+	sub	result, data1, data2
+	ret
+
+.Lstart_align:
+	lsr	limit_wd, limit, #3
+	cbz	limit_wd, .Lremain8
+	ands	xzr, src1, #7
+	/*
+	 * eq means the .Ltinycmp loop compared tmp1 bytes, so src1 is now
+	 * aligned and tmp3 is positive here.
+	 */
+	b.eq	.Lrecal_offset
+	add	src1, src1, tmp3
+	add	src2, src2, tmp3
+	sub	limit, limit, tmp3
+	lsr	limit_wd, limit, #3
+	cbz	limit_wd, .Lremain8
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+
+	subs	limit_wd, limit_wd, #1
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	csinv	endloop, diff, xzr, ne
+	cbnz	endloop, .Lunequal_proc
+	and	tmp3, tmp3, #7	/* tmp3 = 8 + tmp3 (old tmp3 is negative).  */
+	/*
+	 * src1 is aligned and lies to the right of src2.
+	 * The remaining count is not less than 8 here.
+	 */
+.Lrecal_offset:
+	neg	pos, tmp3
+.Lloopcmp_proc:
+	/*
+	 * Go back pos bytes to fetch the leading segment of one Dword of
+	 * src1; pos is negative here. We could instead use:
+	 *	ldr	data1, [src1]
+	 *	ldr	data2, [src2, pos]
+	 * These two loads would use aligned addresses, but we would then
+	 * have to shift and mask bits out of both Dwords to construct the
+	 * values to compare, which adds instructions and, more importantly,
+	 * costs more time.
+	 */
+	ldr	data1, [src1, pos]
+	ldr	data2, [src2, pos]
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	cbnz	diff, .Lnot_limit
+
+	/* The second part: compare the next whole Dword of each source.  */
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	subs	limit_wd, limit_wd, #1
+	csinv	endloop, diff, xzr, ne	/* If limit_wd is 0, finish the cmp.  */
+	cbz	endloop, .Lloopcmp_proc
+.Lunequal_proc:
+	/* Was a difference found?  */
+	cbz	diff, .Lremain8
+.Lnot_limit:
+#ifndef __ARM64EB__
+	rev	diff, diff
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	/*
+	 * The MS-non-zero bit of DIFF marks either the first bit
+	 * that is different, or the end of the significant data.
+	 * Shifting left now will bring the critical information into the
+	 * top bits.
+	 */
+	clz	pos, diff
+	lsl	data1, data1, pos
+	lsl	data2, data2, pos
+	/*
+	 * But we need to zero-extend (char is unsigned) the value and then
+	 * perform a signed 32-bit subtraction.
+	 */
+	lsr	data1, data1, #56
+	sub	result, data1, data2, lsr #56
+	ret
+
+	.p2align 6
+.Lremain8:
+	/* Limit % 8 == 0 => all bytes significant.  */
+	ands	limit, limit, #7
+	b.eq	.Lret0
+
+.Ltiny8proc:
+	/* Perhaps we can do better than this.  */
+	ldrb	data1w, [src1], #1
+	ldrb	data2w, [src2], #1
+	subs	limit, limit, #1
+	/*
+	 * ne means the decremented limit is still non-zero, so compare the
+	 * two bytes. Once Z=1 the ccmp sets NZCV to 0b0000 instead, which
+	 * makes the following b.eq fail and breaks the loop.
+	 */
+	ccmp	data1w, data2w, #0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Ltiny8proc
+	sub	result, data1, data2
+	ret
+.Lret0:
+	mov	result, #0
+	ret
+ENDPROC(memcmp)
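
As a rough illustration of what the routine computes (this is not part of
the patch), the C sketch below models the Dword-at-a-time idea used on the
mutually aligned fast path. The helper name memcmp_sketch() is hypothetical;
the real assembly additionally handles mutually misaligned sources, the
partial-word tail mask and big-endian layouts.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustrative sketch only: check equality 8 bytes (one Dword) at a time,
 * then locate the differing byte (or finish the tail) with a plain byte
 * loop. The assembly instead extracts the differing byte with rev/clz.
 * memcpy() is used for the 8-byte loads so the C version stays
 * well-defined regardless of alignment.
 */
static int memcmp_sketch(const void *p1, const void *p2, size_t n)
{
	const unsigned char *s1 = p1, *s2 = p2;

	/* Compare one Dword at a time while the data matches. */
	while (n >= 8) {
		uint64_t d1, d2;

		memcpy(&d1, s1, 8);
		memcpy(&d2, s2, 8);
		if (d1 != d2)
			break;	/* the differing byte is inside this Dword */
		s1 += 8;
		s2 += 8;
		n -= 8;
	}

	/* Byte-wise compare for the differing Dword or the short tail. */
	while (n--) {
		int diff = *s1++ - *s2++;

		if (diff)
			return diff;
	}
	return 0;
}

For example, comparing "abcd1234" with "abcd1235" over 8 bytes leaves the
Dword loop on the first iteration, and the byte loop then returns a negative
value, matching the less-than/equal/greater-than contract described in the
header comment of memcmp.S.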