From patchwork Mon Apr 28 05:11:29 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: zhichang.yuan@linaro.org X-Patchwork-Id: 4075171 Return-Path: X-Original-To: patchwork-linux-arm@patchwork.kernel.org Delivered-To: patchwork-parsemail@patchwork1.web.kernel.org Received: from mail.kernel.org (mail.kernel.org [198.145.19.201]) by patchwork1.web.kernel.org (Postfix) with ESMTP id 6F2F69FA15 for ; Mon, 28 Apr 2014 05:15:59 +0000 (UTC) Received: from mail.kernel.org (localhost [127.0.0.1]) by mail.kernel.org (Postfix) with ESMTP id 644852021B for ; Mon, 28 Apr 2014 05:15:58 +0000 (UTC) Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.9]) (using TLSv1.2 with cipher DHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.kernel.org (Postfix) with ESMTPS id EBFF520219 for ; Mon, 28 Apr 2014 05:15:56 +0000 (UTC) Received: from localhost ([127.0.0.1] helo=bombadil.infradead.org) by bombadil.infradead.org with esmtp (Exim 4.80.1 #2 (Red Hat Linux)) id 1Wedrp-0004IL-1k; Mon, 28 Apr 2014 05:12:37 +0000 Received: from mail-pd0-f169.google.com ([209.85.192.169]) by bombadil.infradead.org with esmtps (Exim 4.80.1 #2 (Red Hat Linux)) id 1Wedrl-0004F3-Bp for linux-arm-kernel@lists.infradead.org; Mon, 28 Apr 2014 05:12:35 +0000 Received: by mail-pd0-f169.google.com with SMTP id y13so4362092pdi.14 for ; Sun, 27 Apr 2014 22:12:09 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20130820; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references; bh=s0sMYuRsYykQ3Qbwb5Oq6Zm46Evj6qvHHToVh1pHY+A=; b=RpnFl5hmKDciyBRstobRDteie3iYcUl4zlc+6i1ph2NerkUk54H5gARX2gBxlpTWYc dVO3E0QPaydlBmMDp7gT3yTHMzfd0kswh1jpvGId6ALISU0/BGp0kQ7cpZwCegcBjZQe 5EarkNftFZqL2N4bc1DzATOkgCRFFuKfkowI2o1Y2XwsZjNSOPemA7JZelYsvARTM6V5 lr1Jsv/qMiX0d3JUDl7HoxzOjTmRdHHPOXgnPibVJngktFaqoP+fCcP/Gbh5qe9xOCn0 i+WeSwc4K+v8nUGJGHkux3EdmCiHQ4W0cOMVo/Ehq+GBtkqwgu427qnMFkwUO4RrxgXF QVlQ== X-Gm-Message-State: ALoCoQkoaw1HYbZByb4EdjIogyk9DobmYpkfETTrIbVj1suITtjwvnjFYNDTOUjfhNyFQ7PFOLFS X-Received: by 10.66.150.228 with SMTP id ul4mr23241141pab.16.1398661928909; Sun, 27 Apr 2014 22:12:08 -0700 (PDT) Received: from localhost ([58.251.159.252]) by mx.google.com with ESMTPSA id vm3sm32027835pbc.45.2014.04.27.22.12.05 for (version=TLSv1.2 cipher=RC4-SHA bits=128/128); Sun, 27 Apr 2014 22:12:08 -0700 (PDT) From: zhichang.yuan@linaro.org To: linux-arm-kernel@lists.infradead.org, catalin.marinas@arm.com, will.deacon@arm.com Subject: [PATCHv2 1/6] arm64: lib: Implement optimized memcpy routine Date: Mon, 28 Apr 2014 13:11:29 +0800 Message-Id: <1398661895-5559-2-git-send-email-zhichang.yuan@linaro.org> X-Mailer: git-send-email 1.7.9.5 In-Reply-To: <1398661895-5559-1-git-send-email-zhichang.yuan@linaro.org> References: <1398661895-5559-1-git-send-email-zhichang.yuan@linaro.org> X-CRM114-Version: 20100106-BlameMichelson ( TRE 0.8.0 (BSD) ) MR-646709E3 X-CRM114-CacheID: sfid-20140427_221233_477128_BEC83F5E X-CRM114-Status: GOOD ( 14.65 ) X-Spam-Score: 0.0 (/) Cc: dsaxena@linaro.org, liguozhu@huawei.com, "zhichang.yuan" X-BeenThere: linux-arm-kernel@lists.infradead.org X-Mailman-Version: 2.1.15 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: "linux-arm-kernel" Errors-To: linux-arm-kernel-bounces+patchwork-linux-arm=patchwork.kernel.org@lists.infradead.org X-Spam-Status: No, score=-2.5 required=5.0 tests=BAYES_00,RP_MATCHES_RCVD, UNPARSEABLE_RELAY autolearn=unavailable version=3.3.1 X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on mail.kernel.org X-Virus-Scanned: ClamAV using ClamSMTP From: "zhichang.yuan" This patch, based on Linaro's Cortex Strings library, improves the performance of the assembly optimized memcpy() function. Signed-off-by: Zhichang Yuan Signed-off-by: Deepak Saxena --- arch/arm64/lib/memcpy.S | 192 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 170 insertions(+), 22 deletions(-) diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 27b5003..8a9a96d 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -1,5 +1,13 @@ /* * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +24,7 @@ #include #include +#include /* * Copy a buffer from src to dest (alignment handled by the hardware) @@ -27,27 +36,166 @@ * Returns: * x0 - dest */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +tmp3 .req x5 +tmp3w .req w5 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + ENTRY(memcpy) - mov x4, x0 - subs x2, x2, #8 - b.mi 2f -1: ldr x3, [x1], #8 - subs x2, x2, #8 - str x3, [x4], #8 - b.pl 1b -2: adds x2, x2, #4 - b.mi 3f - ldr w3, [x1], #4 - sub x2, x2, #4 - str w3, [x4], #4 -3: adds x2, x2, #2 - b.mi 4f - ldrh w3, [x1], #2 - sub x2, x2, #2 - strh w3, [x4], #2 -4: adds x2, x2, #1 - b.mi 5f - ldrb w3, [x1] - strb w3, [x4] -5: ret + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwritting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb tmp1w, [src], #1 + strb tmp1w, [dst], #1 +1: + tbz tmp2, #1, 2f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +2: + tbz tmp2, #2, 3f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr tmp1, [src],#8 + str tmp1, [dst],#8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +1: + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +2: + ldp A_l, A_h, [src], #16 + stp A_l, A_h, [dst], #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr tmp1, [src], #8 + str tmp1, [dst], #8 +1: + tbz count, #2, 2f + ldr tmp1w, [src], #4 + str tmp1w, [dst], #4 +2: + tbz count, #1, 3f + ldrh tmp1w, [src], #2 + strh tmp1w, [dst], #2 +3: + tbz count, #0, .Lexitfunc + ldrb tmp1w, [src] + strb tmp1w, [dst] + +.Lexitfunc: + ret + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp A_l, A_h, [src],#16 + stp A_l, A_h, [dst],#16 + ldp B_l, B_h, [src],#16 + ldp C_l, C_h, [src],#16 + stp B_l, B_h, [dst],#16 + stp C_l, C_h, [dst],#16 + ldp D_l, D_h, [src],#16 + stp D_l, D_h, [dst],#16 + + tst count, #0x3f + b.ne .Ltail63 + ret + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp A_l, A_h, [src],#16 + ldp B_l, B_h, [src],#16 + ldp C_l, C_h, [src],#16 + ldp D_l, D_h, [src],#16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp A_l, A_h, [dst],#16 + ldp A_l, A_h, [src],#16 + stp B_l, B_h, [dst],#16 + ldp B_l, B_h, [src],#16 + stp C_l, C_h, [dst],#16 + ldp C_l, C_h, [src],#16 + stp D_l, D_h, [dst],#16 + ldp D_l, D_h, [src],#16 + subs count, count, #64 + b.ge 1b + stp A_l, A_h, [dst],#16 + stp B_l, B_h, [dst],#16 + stp C_l, C_h, [dst],#16 + stp D_l, D_h, [dst],#16 + + tst count, #0x3f + b.ne .Ltail63 + ret ENDPROC(memcpy)