From patchwork Wed Apr 29 00:38:52 2015
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Feng Kan
X-Patchwork-Id: 6293081
From: Feng Kan <fkan@apm.com>
To: patches@apm.com, linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, philipp.tomsich@theobroma-systems.com,
	dann.frazier@canonical.com, tim.gardner@canonical.com,
	craig.magina@canonical.com
Cc: Feng Kan <fkan@apm.com>
Subject: [PATCH V3] arm64: optimized copy_to_user and copy_from_user assembly code
Date: Tue, 28 Apr 2015 17:38:52 -0700
Message-Id: <1430267932-17730-1-git-send-email-fkan@apm.com>
X-Mailer: git-send-email 1.9.1

Use the glibc cortex-strings work authored by Linaro as the base to create
new copy_to_user and copy_from_user kernel routines.

iperf performance increase:

	            -l (size)    1 core result
	Optimized       64B      44-51 Mb/s
	              1500B      4.9 Gb/s
	             30000B      16.2 Gb/s
	Original        64B      34-50.7 Mb/s
	              1500B      4.7 Gb/s
	             30000B      14.5 Gb/s

Signed-off-by: Feng Kan <fkan@apm.com>
---
 arch/arm64/lib/copy_from_user.S |  92 +++++++++++------
 arch/arm64/lib/copy_template.S  | 213 ++++++++++++++++++++++++++++++++++++++++
 arch/arm64/lib/copy_to_user.S   |  56 ++++++-----
 3 files changed, 302 insertions(+), 59 deletions(-)
 create mode 100644 arch/arm64/lib/copy_template.S

diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 5e27add..0e79ed9 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -15,7 +15,6 @@
  */

 #include <linux/linkage.h>
-#include <asm/assembler.h>

 /*
  * Copy from user space to a kernel buffer (alignment handled by the hardware)
@@ -28,39 +27,68 @@
  * x0 - bytes not copied
  */
 ENTRY(__copy_from_user)
-	add	x4, x1, x2			// upper user buffer boundary
-	subs	x2, x2, #8
-	b.mi	2f
-1:
-USER(9f, ldr	x3, [x1], #8	)
-	subs	x2, x2, #8
-	str	x3, [x0], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-USER(9f, ldr	w3, [x1], #4	)
-	sub	x2, x2, #4
-	str	w3, [x0], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-USER(9f, ldrh	w3, [x1], #2	)
-	sub	x2, x2, #2
-	strh	w3, [x0], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-USER(9f, ldrb	w3, [x1]	)
-	strb	w3, [x0]
-5:	mov	x0, #0
-	ret
+#include "copy_template.S"
 ENDPROC(__copy_from_user)

 	.section .fixup,"ax"
-	.align	2
-9:	sub	x2, x4, x1
-	mov	x3, x2
-10:	strb	wzr, [x0], #1			// zero remaining buffer space
-	subs	x3, x3, #1
-	b.ne	10b
-	mov	x0, x2				// bytes not copied
+	.align	2
+9:
+	/*
+	 * count is accurate
+	 * dst is accurate
+	 */
+	mov	x0, count
+	sub	dst, dst, tmp1
+	b	.Lfinalize
+
+10:
+	/*
+	 * count is in the last 15 bytes
+	 * dst is somewhere in there
+	 */
+	mov	x0, count
+	sub	dst, limit, count
+	b	.Lfinalize
+11:
+	/*
+	 * count was over-counted by tmp2
+	 * dst could be anywhere in there
+	 */
+	add	x0, count, tmp2
+	sub	dst, limit, x0
+	b	.Lfinalize
+12:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is accurate
+	 */
+	adds	x0, count, #64
+	b	.Lfinalize
+13:
+	/*
+	 * (count + 128) bytes remain
+	 * dst is pre-biased to (dst + 16)
+	 */
+	adds	x0, count, #128
+	add	dst, dst, #16
+	b	.Lfinalize
+14:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is pre-biased to (dst + 16)
+	 */
+	adds	x0, count, #64
+	add	dst, dst, #16
+
+.Lfinalize:
+	/*
+	 * Zero the remaining destination buffer
+	 */
+	mov	count, x0
+20:	/* zero remaining buffer space */
+	strb	wzr, [dst], #1
+	subs	count, count, #1
+	b.ne	20b
 	ret
 	.previous
diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
new file mode 100644
index 0000000..7a22c6b
--- /dev/null
+++ b/arch/arm64/lib/copy_template.S
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2013, Applied Micro Circuits Corporation
+ * Copyright (c) 2012-2013, Linaro Limited
+ *
+ * Author: Feng Kan <fkan@apm.com>
+ * Author: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
+ *
+ * The code is adapted from the memcpy routine by Linaro Limited.
+ *
+ * This file is free software: you may copy, redistribute and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This file incorporates work covered by the following copyright and
+ * permission notice:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * 1 Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * 2 Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * 3 Neither the name of the Linaro nor the
+ *   names of its contributors may be used to endorse or promote products
+ *   derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <asm/assembler.h>
+#include <asm/cache.h>
+
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+limit	.req	x5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
+	mov	dst, dstin
+	add	limit, dst, count
+	cmp	count, #16
+	b.lo	.Ltail15
+
+	/*
+	 * We don't much care about the alignment of DST, but we want SRC
+	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
+	 * boundaries on both loads and stores.
+	 */
+	ands	tmp2, src, #15
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+
+	tbz	tmp2, #0, 1f
+	USER(11f, ldrb tmp1w, [src], #1)
+	USER(11f, strb tmp1w, [dst], #1)
+1:
+	tbz	tmp2, #1, 2f
+	USER(11f, ldrh tmp1w, [src], #2)
+	USER(11f, strh tmp1w, [dst], #2)
+2:
+	tbz	tmp2, #2, 3f
+	USER(11f, ldr tmp1w, [src], #4)
+	USER(11f, str tmp1w, [dst], #4)
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	USER(11f, ldr tmp1, [src], #8)
+	USER(11f, str tmp1, [dst], #8)
+
+.LSrcAligned:
+	/* There may be less than 63 bytes to go now. */
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+
+	/*
+	 * Deal with small copies quickly by dropping straight into the
+	 * exit block.
+	 */
+.Ltail63:
+	/*
+	 * Copy up to 48 bytes of data. At this point we only need the
+	 * bottom 6 bits of count to be accurate.
+	 */
+	ands	tmp1, count, #0x30
+	b.eq	.Ltail15
+	add	dst, dst, tmp1
+	add	src, src, tmp1
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	USER(9f, ldp A_l, A_h, [src, #-48])
+	USER(9f, stp A_l, A_h, [dst, #-48])
+1:
+	USER(9f, ldp A_l, A_h, [src, #-32])
+	USER(9f, stp A_l, A_h, [dst, #-32])
+2:
+	USER(9f, ldp A_l, A_h, [src, #-16])
+	USER(9f, stp A_l, A_h, [dst, #-16])
+
+.Ltail15:
+	ands	count, count, #15
+	b.eq	.Lsuccess		/* quick exit if we are done */
+	/*
+	 * Copy up to 15 bytes of data. Does not assume additional data
+	 * being copied.
+	 */
+	tbz	count, #3, 1f
+	USER(10f, ldr tmp1, [src], #8)
+	USER(10f, str tmp1, [dst], #8)
+1:
+	tbz	count, #2, 1f
+	USER(10f, ldr tmp1w, [src], #4)
+	USER(10f, str tmp1w, [dst], #4)
+1:
+	tbz	count, #1, 1f
+	USER(10f, ldrh tmp1w, [src], #2)
+	USER(10f, strh tmp1w, [dst], #2)
+1:
+	tbz	count, #0, 1f
+	USER(10f, ldrb tmp1w, [src], #1)
+	USER(10f, strb tmp1w, [dst], #1)
+1:
+	b	.Lsuccess
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	 * Less than 128 bytes to copy, so handle 64 here and then jump
+	 * to the tail.
+	 */
+	adds	count, count, #64
+	USER(12f, ldp A_l, A_h, [src])
+	USER(12f, ldp B_l, B_h, [src, #16])
+	USER(12f, ldp C_l, C_h, [src, #32])
+	USER(12f, ldp D_l, D_h, [src, #48])
+	USER(12f, stp A_l, A_h, [dst])
+	USER(12f, stp B_l, B_h, [dst, #16])
+	USER(12f, stp C_l, C_h, [dst, #32])
+	USER(12f, stp D_l, D_h, [dst, #48])
+	add	src, src, #64
+	add	dst, dst, #64
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	b	.Lsuccess
+
+	/*
+	 * Critical loop. Start at a new cache line boundary. Assuming
+	 * 64 bytes per line this ensures the entire loop is in one line.
+	 */
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* There are at least 128 bytes to copy. */
+	sub	dst, dst, #16			/* Pre-bias. */
+	USER(13f, ldp A_l, A_h, [src, #0])
+	USER(13f, ldp B_l, B_h, [src, #16])
+	USER(13f, ldp C_l, C_h, [src, #32])
+	USER(13f, ldp D_l, D_h, [src, #48]!)	/* src += 64 - Pre-bias. */
+1:
+	USER(13f, stp A_l, A_h, [dst, #16])
+	USER(13f, ldp A_l, A_h, [src, #16])
+	USER(13f, stp B_l, B_h, [dst, #32])
+	USER(13f, ldp B_l, B_h, [src, #32])
+	USER(13f, stp C_l, C_h, [dst, #48])
+	USER(13f, ldp C_l, C_h, [src, #48])
+	USER(14f, stp D_l, D_h, [dst, #64]!)
+	USER(14f, ldp D_l, D_h, [src, #64]!)
+	subs	count, count, #64
+	b.ge	1b
+	USER(13f, stp A_l, A_h, [dst, #16])
+	USER(13f, stp B_l, B_h, [dst, #32])
+	USER(13f, stp C_l, C_h, [dst, #48])
+	USER(13f, stp D_l, D_h, [dst, #64])
+	add	src, src, #16
+	add	dst, dst, #80		/* 64 bytes + 16 prebias */
+	adds	count, count, #64
+	tst	count, #0x3f
+	b.ne	.Ltail63
+.Lsuccess:
+	/* Nothing left to copy */
+	mov	x0, #0
+	ret
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a0aeeb9..2ce36de 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -15,7 +15,6 @@
  */

 #include <linux/linkage.h>
-#include <asm/assembler.h>

 /*
  * Copy to user space from a kernel buffer (alignment handled by the hardware)
@@ -28,34 +27,37 @@
  * x0 - bytes not copied
  */
 ENTRY(__copy_to_user)
-	add	x4, x0, x2			// upper user buffer boundary
-	subs	x2, x2, #8
-	b.mi	2f
-1:
-	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-USER(9f, str	x3, [x0], #8	)
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-USER(9f, str	w3, [x0], #4	)
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-USER(9f, strh	w3, [x0], #2	)
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-USER(9f, strb	w3, [x0]	)
-5:	mov	x0, #0
-	ret
+#include "copy_template.S"
 ENDPROC(__copy_to_user)

 	.section .fixup,"ax"
-	.align	2
-9:	sub	x0, x4, x0		// bytes not copied
+	.align	2
+9:
+10:
+	/*
+	 * count is accurate
+	 */
+	mov	x0, count
+	b	.Lfinalize
+11:
+	/*
+	 * count was over-counted by tmp2
+	 */
+	add	x0, count, tmp2
+	b	.Lfinalize
+12:
+14:
+	/*
+	 * (count + 64) bytes remain
+	 * dst is accurate
+	 */
+	adds	x0, count, #64
+	b	.Lfinalize
+13:
+	/*
+	 * (count + 128) bytes remain
+	 */
+	add	x0, count, #128
+.Lfinalize:
 	ret
 	.previous
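
Note for reviewers (illustration only, not part of the patch): both entry points keep
the existing contract of returning the number of bytes not copied in x0 (zero on
success), and the copy_from_user fixup additionally zeroes the uncopied tail of the
kernel buffer. A minimal, hypothetical C caller showing how that return value is
consumed; the struct and function names below are made up for illustration:

	#include <linux/errno.h>
	#include <linux/types.h>
	#include <linux/uaccess.h>

	/* Hypothetical argument block copied in from user space. */
	struct demo_args {
		__u64 addr;
		__u64 len;
	};

	/* Hypothetical ioctl-style handler; 'uarg' points into user space. */
	static long demo_copy_args(void __user *uarg)
	{
		struct demo_args args;

		/*
		 * copy_from_user() returns the number of bytes it could NOT
		 * copy, i.e. 0 on success. On a partial fault the fixup code
		 * above has already zeroed the remainder of 'args', so no
		 * stale kernel stack data is exposed.
		 */
		if (copy_from_user(&args, uarg, sizeof(args)))
			return -EFAULT;

		return 0;
	}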