
[101/147] lib/string: optimized memcpy

Message ID 20210908025839.81TnA0vU3%akpm@linux-foundation.org (mailing list archive)
State New
Series [001/147] mm, slub: don't call flush_all() from slab_debug_trace_open()

Commit Message

Andrew Morton Sept. 8, 2021, 2:58 a.m. UTC
From: Matteo Croce <mcroce@microsoft.com>
Subject: lib/string: optimized memcpy

Patch series "lib/string: optimized mem* functions", v2.

Rewrite the generic mem{cpy,move,set} so that memory is accessed with the
widest size possible, but without doing unaligned accesses.

This was originally posted as C string functions for RISC-V[1], but as
there was no specific RISC-V code, it was proposed for the generic
lib/string.c implementation.

Tested on RISC-V and on x86_64 by undefining __HAVE_ARCH_MEM{CPY,SET,MOVE}
and HAVE_EFFICIENT_UNALIGNED_ACCESS.

These are the memcpy() and memset() throughput figures measured on a
RISC-V machine with a 32 Mbyte buffer (a sketch of such a measurement
follows the numbers):

memcpy:
original aligned:	 75 Mb/s
original unaligned:	 75 Mb/s
new aligned:		114 Mb/s
new unaligned:		107 Mb/s

memset:
original aligned:	140 Mb/s
original unaligned:	140 Mb/s
new aligned:		241 Mb/s
new unaligned:		241 Mb/s
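
The benchmark harness isn't included in the posting; purely for
illustration, a minimal userspace sketch of this kind of throughput
measurement (hypothetical iteration count, and it times whatever
memcpy() the program links against rather than the kernel routine)
could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE	(32UL << 20)	/* 32 Mbyte */
#define ITERS		16

/* Copy BUF_SIZE bytes ITERS times, from a source shifted by 'offset'. */
static double bench_memcpy(size_t offset)
{
	char *src = malloc(BUF_SIZE + offset);
	char *dst = malloc(BUF_SIZE + offset);
	struct timespec a, b;
	double secs;
	int i;

	memset(src, 0x5a, BUF_SIZE + offset);

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < ITERS; i++)
		memcpy(dst, src + offset, BUF_SIZE);
	clock_gettime(CLOCK_MONOTONIC, &b);

	secs = (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;

	free(src);
	free(dst);

	return BUF_SIZE * ITERS / secs / (1 << 20);	/* Mbyte/s */
}

int main(void)
{
	printf("aligned:   %.0f Mbyte/s\n", bench_memcpy(0));
	printf("unaligned: %.0f Mbyte/s\n", bench_memcpy(1));
	return 0;
}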

The size increase is negligible:

$ scripts/bloat-o-meter vmlinux.orig vmlinux
add/remove: 0/0 grow/shrink: 4/1 up/down: 427/-6 (421)
Function                                     old     new   delta
memcpy                                        29     351    +322
memset                                        29     117     +88
strlcat                                       68      78     +10
strlcpy                                       50      57      +7
memmove                                       56      50      -6
Total: Before=8556964, After=8557385, chg +0.00%

These functions will be used for RISC-V initially.

[1] https://lore.kernel.org/linux-riscv/20210617152754.17960-1-mcroce@linux.microsoft.com/

The only architecture that will use all three functions is riscv, while
memmove() will also be used by arc, h8300, hexagon, ia64, openrisc and
parisc.

Keep in mind that memmove() isn't anything special: it just calls
memcpy() when possible (e.g. when the buffers don't overlap) and falls
back to a byte-by-byte copy otherwise.
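
A rough sketch of that dispatch (illustration only, in the style of
lib/string.c, not the exact code from this series):

void *memmove(void *dest, const void *src, size_t count)
{
	/*
	 * If dest is below src, or the buffers don't overlap at all,
	 * a forward copy is safe (the generic memcpy() copies forward),
	 * so just reuse it.
	 */
	if (dest < src || (const char *)src + count <= (char *)dest)
		return memcpy(dest, src, count);

	/* Overlapping with dest above src: copy backward, byte by byte. */
	while (count--)
		((char *)dest)[count] = ((const char *)src)[count];

	return dest;
}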

In the future we could write two functions, one that copies forward and
one that copies backward, and call the right one depending on the
relative position of the buffers.  Then we could alias memcpy() and
memmove(), as proposed by Linus:
https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132
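
A sketch of that idea, with hypothetical helper names that are not part
of this series:

/* Hypothetical direction-specific helpers. */
void *__memcpy_fwd(void *dest, const void *src, size_t count);
void *__memcpy_bwd(void *dest, const void *src, size_t count);

void *memmove(void *dest, const void *src, size_t count)
{
	if (dest <= src || (const char *)src + count <= (char *)dest)
		return __memcpy_fwd(dest, src, count);

	return __memcpy_bwd(dest, src, count);
}

/*
 * With both directions covered, memcpy() would not need a separate
 * implementation any more and could simply be an alias of memmove().
 */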



This patch (of 3):

Rewrite the generic memcpy() to copy a word at a time, without
generating unaligned accesses.

The procedure consists of three steps: first, copy data one byte at a
time until the destination buffer is aligned to a long boundary; then
copy the data one long at a time, shifting the current and the next
long to compose an aligned long at every cycle; finally, copy the
remainder one byte at a time.
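
As a concrete illustration of the middle step (not part of the patch,
using the variable names and the MERGE_UL() macro from the code below),
consider a 64-bit little-endian machine where the source sits 2 bytes
past a long boundary once the destination has been aligned:

/*
 * distance = 2, BYTES_LONG = 8, little-endian:
 *
 *   last = 0x1716151413121110   first aligned source word; the data we
 *                               actually want starts at its byte 2 (0x12)
 *   next = 0x1f1e1d1c1b1a1918   following aligned source word
 *
 *   MERGE_UL(last, next, 2)
 *     = (last >> 16) | (next << 48)
 *     = 0x0000171615141312 | 0x1918000000000000
 *     = 0x1918171615141312
 *
 * i.e. the eight unaligned source bytes 0x12..0x19 are written to the
 * destination with a single aligned store.
 */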

This is the improvement on RISC-V:

original aligned:	 75 Mb/s
original unaligned:	 75 Mb/s
new aligned:		114 Mb/s
new unaligned:		107 Mb/s

and this is the binary size increase according to bloat-o-meter:

Function     old     new   delta
memcpy        36     324    +288

Link: https://lkml.kernel.org/r/20210702123153.14093-1-mcroce@linux.microsoft.com
Link: https://lkml.kernel.org/r/20210702123153.14093-2-mcroce@linux.microsoft.com
Signed-off-by: Matteo Croce <mcroce@microsoft.com>
Cc: Nick Kossifidis <mick@ics.forth.gr>
Cc: Guo Ren <guoren@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Laight <David.Laight@aculab.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Emil Renner Berthing <kernel@esmil.dk>
Cc: Drew Fustini <drew@beagleboard.org>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 lib/string.c |   80 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

Comments

Linus Torvalds Sept. 8, 2021, 6:26 p.m. UTC | #1
I'm going to skip this one too.

On Tue, Sep 7, 2021 at 7:58 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> From: Matteo Croce <mcroce@microsoft.com>
> Subject: lib/string: optimized memcpy
>
> Patch series "lib/string: optimized mem* functions", v2.

Honestly, if we change the fallback memcpy(), I think the change
should be to remove it.

This is a core architecture thing, and every architecture does their
own. And pretty much every architecture has their own optimizations
for memcpy.

Yes, the byte-at-a-time default implementation is bad. But it's
_intentionally_ bad. It's only meant for initial bringup. No
architecture should actually end up using this in the long run, and if
you see it in profiles it should make you go "Ahh" instead.

             Linus

Patch

--- a/lib/string.c~lib-string-optimized-memcpy
+++ a/lib/string.c
@@ -33,6 +33,23 @@ 
 #include <asm/word-at-a-time.h>
 #include <asm/page.h>
 
+#define BYTES_LONG	sizeof(long)
+#define WORD_MASK	(BYTES_LONG - 1)
+#define MIN_THRESHOLD	(BYTES_LONG * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+	u8 *as_u8;
+	unsigned long *as_ulong;
+	uintptr_t as_uptr;
+};
+
+union const_types {
+	const u8 *as_u8;
+	const unsigned long *as_ulong;
+	uintptr_t as_uptr;
+};
+
 #ifndef __HAVE_ARCH_STRNCASECMP
 /**
  * strncasecmp - Case insensitive, length-limited string comparison
@@ -869,6 +886,13 @@  EXPORT_SYMBOL(memset64);
 #endif
 
 #ifndef __HAVE_ARCH_MEMCPY
+
+#ifdef __BIG_ENDIAN
+#define MERGE_UL(h, l, d) ((h) << ((d) * 8) | (l) >> ((BYTES_LONG - (d)) * 8))
+#else
+#define MERGE_UL(h, l, d) ((h) >> ((d) * 8) | (l) << ((BYTES_LONG - (d)) * 8))
+#endif
+
 /**
  * memcpy - Copy one area of memory to another
  * @dest: Where to copy to
@@ -880,14 +904,64 @@  EXPORT_SYMBOL(memset64);
  */
 void *memcpy(void *dest, const void *src, size_t count)
 {
-	char *tmp = dest;
-	const char *s = src;
+	union const_types s = { .as_u8 = src };
+	union types d = { .as_u8 = dest };
+	int distance = 0;
+
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
+		if (count < MIN_THRESHOLD)
+			goto copy_remainder;
+
+		/* Copy a byte at time until destination is aligned. */
+		for (; d.as_uptr & WORD_MASK; count--)
+			*d.as_u8++ = *s.as_u8++;
+
+		distance = s.as_uptr & WORD_MASK;
+	}
+
+	if (distance) {
+		unsigned long last, next;
 
+		/*
+		 * s is distance bytes ahead of d, and d just reached
+		 * the alignment boundary. Move s backward to word align it
+		 * and shift data to compensate for distance, in order to do
+		 * word-by-word copy.
+		 */
+		s.as_u8 -= distance;
+
+		next = s.as_ulong[0];
+		for (; count >= BYTES_LONG; count -= BYTES_LONG) {
+			last = next;
+			next = s.as_ulong[1];
+
+			d.as_ulong[0] = MERGE_UL(last, next, distance);
+
+			d.as_ulong++;
+			s.as_ulong++;
+		}
+
+		/* Restore s with the original offset. */
+		s.as_u8 += distance;
+	} else {
+		/*
+		 * If the source and dest lower bits are the same, do a simple
+		 * 32/64 bit wide copy.
+		 */
+		for (; count >= BYTES_LONG; count -= BYTES_LONG)
+			*d.as_ulong++ = *s.as_ulong++;
+	}
+
+copy_remainder:
 	while (count--)
-		*tmp++ = *s++;
+		*d.as_u8++ = *s.as_u8++;
+
 	return dest;
 }
 EXPORT_SYMBOL(memcpy);
+
+#undef MERGE_UL
+
 #endif
 
 #ifndef __HAVE_ARCH_MEMMOVE