Message ID | 20220906115359.173660-1-zouyipeng@huawei.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v3] riscv: lib: optimize memcmp with ld insn | expand |
On Tue, Sep 06, 2022 at 07:53:59PM +0800, Yipeng Zou wrote: > Currently memcmp was implemented in c code(lib/string.c), which compare > memory per byte. > > This patch use ld insn compare memory per word to improve. From the test > Results, this will take several times optimized. > > Alloc 8,4,1KB buffer to compare, each loop 10k times: > > Size(B) Min(ns) AVG(ns) //before > > 8k 40800 46316 > 4k 26500 32302 > 1k 15600 17965 > > Size(B) Min(ns) AVG(ns) //after > > 8k 16100 21281 > 4k 14200 16446 > 1k 12400 14316 > > Signed-off-by: Yipeng Zou <zouyipeng@huawei.com> > Reviewed-by: Conor Dooley <conor.dooley@microchip.com> > --- > V2: Patch test data into the commit message,and collect Reviewed-by > Tags. > V3: Fix some spelling mistakes. Improve register naming and coding style. > > arch/riscv/include/asm/string.h | 3 ++ > arch/riscv/lib/Makefile | 1 + > arch/riscv/lib/memcmp.S | 58 +++++++++++++++++++++++++++++++++ > 3 files changed, 62 insertions(+) > create mode 100644 arch/riscv/lib/memcmp.S > > diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h > index 909049366555..3337b43d3803 100644 > --- a/arch/riscv/include/asm/string.h > +++ b/arch/riscv/include/asm/string.h > @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t); > #define __HAVE_ARCH_MEMMOVE > extern asmlinkage void *memmove(void *, const void *, size_t); > extern asmlinkage void *__memmove(void *, const void *, size_t); > +#define __HAVE_ARCH_MEMCMP > +extern int memcmp(const void *, const void *, size_t); > + > /* For those files which don't want to check by kasan. */ > #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) > #define memcpy(dst, src, len) __memcpy(dst, src, len) > diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile > index 25d5c9664e57..70773bf0c471 100644 > --- a/arch/riscv/lib/Makefile > +++ b/arch/riscv/lib/Makefile > @@ -3,6 +3,7 @@ lib-y += delay.o > lib-y += memcpy.o > lib-y += memset.o > lib-y += memmove.o > +lib-y += memcmp.o > lib-$(CONFIG_MMU) += uaccess.o > lib-$(CONFIG_64BIT) += tishift.o > > diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S > new file mode 100644 > index 000000000000..eea5cc40e081 > --- /dev/null > +++ b/arch/riscv/lib/memcmp.S > @@ -0,0 +1,58 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2022 zouyipeng@huawei.com > + */ > +#include <linux/linkage.h> > +#include <asm-generic/export.h> > +#include <asm/asm.h> > + > +/* > + Input Arguments: > + a0: addr0 > + a1: addr1 > + a2: buffer size > + > + Output: > + a0: return value > +*/ > +#define data0 a3 > +#define data1 a4 > +#define tmp t3 > +#define tail t4 > + > +/* load and compare */ > +.macro LD_CMP op d0 d1 a0 a1 t1 offset > + \op \d0, 0(\a0) > + \op \d1, 0(\a1) > + addi \a0, \a0, \offset > + addi \a1, \a1, \offset > + sub \t1, \d0, \d1 > +.endm > + > +ENTRY(memcmp) > + /* test size aligned with SZREG */ > + andi tmp, a2, SZREG - 1 > + /* load tail */ > + add tail, a0, a2 > + sub tail, tail, tmp > + add a2, a0, a2 > + > +.LloopWord: > + sltu tmp, a0, tail > + beqz tmp, .LloopByte > + > + LD_CMP REG_L data0 data1 a0 a1 tmp SZREG > + beqz tmp, .LloopWord > + j .Lreturn > + > +.LloopByte: > + sltu tmp, a0, a2 > + beqz tmp, .Lreturn > + > + LD_CMP lbu data0 data1 a0 a1 tmp 1 > + beqz tmp, .LloopByte > +.Lreturn: > + mv a0, tmp > + ret > +END(memcmp) > +EXPORT_SYMBOL(memcmp); > -- > 2.17.1 > Thanks, looks nice. Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
On Tue, 06 Sep 2022 05:38:14 PDT (-0700), ajones@ventanamicro.com wrote: > On Tue, Sep 06, 2022 at 07:53:59PM +0800, Yipeng Zou wrote: >> Currently memcmp was implemented in c code(lib/string.c), which compare >> memory per byte. >> >> This patch use ld insn compare memory per word to improve. From the test >> Results, this will take several times optimized. >> >> Alloc 8,4,1KB buffer to compare, each loop 10k times: >> >> Size(B) Min(ns) AVG(ns) //before >> >> 8k 40800 46316 >> 4k 26500 32302 >> 1k 15600 17965 >> >> Size(B) Min(ns) AVG(ns) //after >> >> 8k 16100 21281 >> 4k 14200 16446 >> 1k 12400 14316 >> >> Signed-off-by: Yipeng Zou <zouyipeng@huawei.com> >> Reviewed-by: Conor Dooley <conor.dooley@microchip.com> >> --- >> V2: Patch test data into the commit message,and collect Reviewed-by >> Tags. >> V3: Fix some spelling mistakes. Improve register naming and coding style. >> >> arch/riscv/include/asm/string.h | 3 ++ >> arch/riscv/lib/Makefile | 1 + >> arch/riscv/lib/memcmp.S | 58 +++++++++++++++++++++++++++++++++ >> 3 files changed, 62 insertions(+) >> create mode 100644 arch/riscv/lib/memcmp.S >> >> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h >> index 909049366555..3337b43d3803 100644 >> --- a/arch/riscv/include/asm/string.h >> +++ b/arch/riscv/include/asm/string.h >> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t); >> #define __HAVE_ARCH_MEMMOVE >> extern asmlinkage void *memmove(void *, const void *, size_t); >> extern asmlinkage void *__memmove(void *, const void *, size_t); >> +#define __HAVE_ARCH_MEMCMP >> +extern int memcmp(const void *, const void *, size_t); >> + >> /* For those files which don't want to check by kasan. */ >> #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) >> #define memcpy(dst, src, len) __memcpy(dst, src, len) >> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile >> index 25d5c9664e57..70773bf0c471 100644 >> --- a/arch/riscv/lib/Makefile >> +++ b/arch/riscv/lib/Makefile >> @@ -3,6 +3,7 @@ lib-y += delay.o >> lib-y += memcpy.o >> lib-y += memset.o >> lib-y += memmove.o >> +lib-y += memcmp.o >> lib-$(CONFIG_MMU) += uaccess.o >> lib-$(CONFIG_64BIT) += tishift.o >> >> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S >> new file mode 100644 >> index 000000000000..eea5cc40e081 >> --- /dev/null >> +++ b/arch/riscv/lib/memcmp.S >> @@ -0,0 +1,58 @@ >> +/* SPDX-License-Identifier: GPL-2.0-only */ >> +/* >> + * Copyright (C) 2022 zouyipeng@huawei.com >> + */ >> +#include <linux/linkage.h> >> +#include <asm-generic/export.h> >> +#include <asm/asm.h> >> + >> +/* >> + Input Arguments: >> + a0: addr0 >> + a1: addr1 >> + a2: buffer size >> + >> + Output: >> + a0: return value >> +*/ >> +#define data0 a3 >> +#define data1 a4 >> +#define tmp t3 >> +#define tail t4 >> + >> +/* load and compare */ >> +.macro LD_CMP op d0 d1 a0 a1 t1 offset >> + \op \d0, 0(\a0) >> + \op \d1, 0(\a1) >> + addi \a0, \a0, \offset >> + addi \a1, \a1, \offset >> + sub \t1, \d0, \d1 >> +.endm >> + >> +ENTRY(memcmp) >> + /* test size aligned with SZREG */ >> + andi tmp, a2, SZREG - 1 >> + /* load tail */ >> + add tail, a0, a2 >> + sub tail, tail, tmp >> + add a2, a0, a2 >> + >> +.LloopWord: >> + sltu tmp, a0, tail >> + beqz tmp, .LloopByte >> + >> + LD_CMP REG_L data0 data1 a0 a1 tmp SZREG >> + beqz tmp, .LloopWord >> + j .Lreturn >> + >> +.LloopByte: >> + sltu tmp, a0, a2 >> + beqz tmp, .Lreturn >> + >> + LD_CMP lbu data0 data1 a0 a1 tmp 1 >> + beqz tmp, .LloopByte >> +.Lreturn: >> + mv a0, tmp >> + ret >> +END(memcmp) >> +EXPORT_SYMBOL(memcmp); >> -- >> 2.17.1 >> > > Thanks, looks nice. Ya, I think normally for performance improvements I'd want a bit more (like "what was this benchmarked on", for example). This one seems straight-forward enough to just handle open-loop, though, so I think it's OK this time. It's also something we could probably do in C, but IIUC we're meant to have arch-specific routines for these anyway and we're going to end up with assembly at some point so we might as well just start now. It does trigger a bunch of build failures, though. This fixes most of the RISC-V related issues: diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h index 3337b43d3803..33bd88e17469 100644 --- a/arch/riscv/include/asm/string.h +++ b/arch/riscv/include/asm/string.h @@ -19,13 +19,15 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t); extern asmlinkage void *memmove(void *, const void *, size_t); extern asmlinkage void *__memmove(void *, const void *, size_t); #define __HAVE_ARCH_MEMCMP -extern int memcmp(const void *, const void *, size_t); +extern asmlinkage int memcmp(const void *, const void *, size_t); +extern asmlinkage int __memcmp(const void *, const void *, size_t); /* For those files which don't want to check by kasan. */ #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) #define memcpy(dst, src, len) __memcpy(dst, src, len) #define memset(s, c, n) __memset(s, c, n) #define memmove(dst, src, len) __memmove(dst, src, len) +#define memcmp(a, b, len) __memcmp(a, b, len) #ifndef __NO_FORTIFY #define __NO_FORTIFY /* FORTIFY_SOURCE uses __builtin_memcpy, etc. */ diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S index eea5cc40e081..518797cd6dda 100644 --- a/arch/riscv/lib/memcmp.S +++ b/arch/riscv/lib/memcmp.S @@ -29,7 +29,8 @@ sub \t1, \d0, \d1 .endm -ENTRY(memcmp) +SYM_FUNC_START(__memcmp) +SYM_FUNC_START_WEAK(memcmp) /* test size aligned with SZREG */ andi tmp, a2, SZREG - 1 /* load tail */ @@ -54,5 +55,9 @@ ENTRY(memcmp) .Lreturn: mv a0, tmp ret -END(memcmp) -EXPORT_SYMBOL(memcmp); + +SYM_FUNC_END(memcmp) +SYM_FUNC_END(__memcmp) + +EXPORT_SYMBOL(memcmp) +EXPORT_SYMBOL(__memcmp) diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index dd58e1d99397..359227e392b2 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 OBJECT_FILES_NON_STANDARD := y -purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o +purgatory-y := purgatory.o sha256.o entry.o string.o ctype.o memcpy.o memset.o memcmp.o targets += $(purgatory-y) PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) @@ -18,6 +18,9 @@ $(obj)/memcpy.o: $(srctree)/arch/riscv/lib/memcpy.S FORCE $(obj)/memset.o: $(srctree)/arch/riscv/lib/memset.S FORCE $(call if_changed_rule,as_o_S) +$(obj)/memcmp.o: $(srctree)/arch/riscv/lib/memcmp.S FORCE + $(call if_changed_rule,as_o_S) + $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) but I'm still getting some EFI stub failures riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt.stub.o: in function `__efistub_.L130': __efistub_fdt.c:(.init.text+0x686): undefined reference to `__efistub___memcmp' riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L87': __efistub_fdt_ro.c:(.init.text+0x554): undefined reference to `__efistub___memcmp' riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L108': __efistub_fdt_ro.c:(.init.text+0x6fc): undefined reference to `__efistub___memcmp' riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L263': __efistub_fdt_ro.c:(.init.text+0xfe0): undefined reference to `__efistub___memcmp' riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o: in function `__efistub_.L284': __efistub_fdt_ro.c:(.init.text+0x10c0): undefined reference to `__efistub___memcmp' riscv64-unknown-linux-gnu-ld: ./drivers/firmware/efi/libstub/lib-fdt_ro.stub.o:__efistub_fdt_ro.c:(.init.text+0x11fc): more undefined references to `__efistub___memcmp' follow riscv64-unknown-linux-gnu-ld: .tmp_vmlinux.kallsyms1: hidden symbol `__efistub___memcmp' isn't defined riscv64-unknown-linux-gnu-ld: final link failed: bad value make[2]: *** [/scratch/merges/ko-linux-next/linux/Makefile:1240: vmlinux] Error 1 make[2]: Leaving directory '/scratch/merges/ko-linux-next/kernel/rv64gc/allyesconfig' so I think it's going to be too late for 6.1 for this one, sorry! > > Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h index 909049366555..3337b43d3803 100644 --- a/arch/riscv/include/asm/string.h +++ b/arch/riscv/include/asm/string.h @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t); #define __HAVE_ARCH_MEMMOVE extern asmlinkage void *memmove(void *, const void *, size_t); extern asmlinkage void *__memmove(void *, const void *, size_t); +#define __HAVE_ARCH_MEMCMP +extern int memcmp(const void *, const void *, size_t); + /* For those files which don't want to check by kasan. */ #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) #define memcpy(dst, src, len) __memcpy(dst, src, len) diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile index 25d5c9664e57..70773bf0c471 100644 --- a/arch/riscv/lib/Makefile +++ b/arch/riscv/lib/Makefile @@ -3,6 +3,7 @@ lib-y += delay.o lib-y += memcpy.o lib-y += memset.o lib-y += memmove.o +lib-y += memcmp.o lib-$(CONFIG_MMU) += uaccess.o lib-$(CONFIG_64BIT) += tishift.o diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S new file mode 100644 index 000000000000..eea5cc40e081 --- /dev/null +++ b/arch/riscv/lib/memcmp.S @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 zouyipeng@huawei.com + */ +#include <linux/linkage.h> +#include <asm-generic/export.h> +#include <asm/asm.h> + +/* + Input Arguments: + a0: addr0 + a1: addr1 + a2: buffer size + + Output: + a0: return value +*/ +#define data0 a3 +#define data1 a4 +#define tmp t3 +#define tail t4 + +/* load and compare */ +.macro LD_CMP op d0 d1 a0 a1 t1 offset + \op \d0, 0(\a0) + \op \d1, 0(\a1) + addi \a0, \a0, \offset + addi \a1, \a1, \offset + sub \t1, \d0, \d1 +.endm + +ENTRY(memcmp) + /* test size aligned with SZREG */ + andi tmp, a2, SZREG - 1 + /* load tail */ + add tail, a0, a2 + sub tail, tail, tmp + add a2, a0, a2 + +.LloopWord: + sltu tmp, a0, tail + beqz tmp, .LloopByte + + LD_CMP REG_L data0 data1 a0 a1 tmp SZREG + beqz tmp, .LloopWord + j .Lreturn + +.LloopByte: + sltu tmp, a0, a2 + beqz tmp, .Lreturn + + LD_CMP lbu data0 data1 a0 a1 tmp 1 + beqz tmp, .LloopByte +.Lreturn: + mv a0, tmp + ret +END(memcmp) +EXPORT_SYMBOL(memcmp);