diff mbox series

[RFC,bpf-next,05/20] trait: Replace memcpy calls with inline copies

Message ID 20250305-afabre-traits-010-rfc2-v1-5-d0ecfb869797@cloudflare.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series traits: Per packet metadata KV store | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-42 fail Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 fail Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 161 this patch: 135
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 4 maintainers not CCed: kuba@kernel.org pabeni@redhat.com horms@kernel.org edumazet@google.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 242 this patch: 242
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 55 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Arthur Fabre March 5, 2025, 2:32 p.m. UTC
From: Arthur Fabre <afabre@cloudflare.com>

When copying trait values to or from the caller, the size isn't a
constant so memcpy() ends up being a function call.

Replace it with an inline implementation that only handles the sizes we
support.

We store values "packed", so they won't necessarily be 4 or 8 byte
aligned.

Setting and getting traits is roughly 40% faster.

Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
---
 include/net/trait.h | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

Comments

Lorenzo Bianconi March 10, 2025, 10:50 a.m. UTC | #1
> From: Arthur Fabre <afabre@cloudflare.com>
> 
> When copying trait values to or from the caller, the size isn't a
> constant so memcpy() ends up being a function call.
> 
> Replace it with an inline implementation that only handles the sizes we
> support.
> 
> We store values "packed", so they won't necessarily be 4 or 8 byte
> aligned.
> 
> Setting and getting traits is roughly ~40% faster.

Nice! I guess in a formal series this patch can be squashed with patch 1/20
(adding some comments).

Regards,
Lorenzo

> 
> Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> ---
>  include/net/trait.h | 25 +++++++++++++++++++------
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/trait.h b/include/net/trait.h
> index 536b8a17dbbc091b4d1a4d7b4b21c1e36adea86a..d4581a877bd57a32e2ad032147c906764d6d37f8 100644
> --- a/include/net/trait.h
> +++ b/include/net/trait.h
> @@ -7,6 +7,7 @@
>  #include <linux/errno.h>
>  #include <linux/string.h>
>  #include <linux/bitops.h>
> +#include <linux/unaligned.h>
>  
>  /* Traits are a very limited KV store, with:
>   * - 64 keys (0-63).
> @@ -145,23 +146,23 @@ int trait_set(void *traits, void *hard_end, u64 key, const void *val, u64 len, u
>  			memmove(traits + off + len, traits + off, traits_size(traits) - off);
>  	}
>  
> -	/* Set our value. */
> -	memcpy(traits + off, val, len);
> -
> -	/* Store our length in header. */
>  	u64 encode_len = 0;
> -
>  	switch (len) {
>  	case 2:
> +		/* Values are least two bytes, so they'll be two byte aligned */
> +		*(u16 *)(traits + off) = *(u16 *)val;
>  		encode_len = 1;
>  		break;
>  	case 4:
> +		put_unaligned(*(u32 *)val, (u32 *)(traits + off));
>  		encode_len = 2;
>  		break;
>  	case 8:
> +		put_unaligned(*(u64 *)val, (u64 *)(traits + off));
>  		encode_len = 3;
>  		break;
>  	}
> +
>  	h->high |= (encode_len >> 1) << key;
>  	h->low |= (encode_len & 1) << key;
>  	return 0;
> @@ -201,7 +202,19 @@ int trait_get(void *traits, u64 key, void *val, u64 val_len)
>  	if (real_len > val_len)
>  		return -ENOSPC;
>  
> -	memcpy(val, traits + off, real_len);
> +	switch (real_len) {
> +	case 2:
> +		/* Values are least two bytes, so they'll be two byte aligned */
> +		*(u16 *)val = *(u16 *)(traits + off);
> +		break;
> +	case 4:
> +		*(u32 *)val = get_unaligned((u32 *)(traits + off));
> +		break;
> +	case 8:
> +		*(u64 *)val = get_unaligned((u64 *)(traits + off));
> +		break;
> +	}
> +
>  	return real_len;
>  }
>  
> 
> -- 
> 2.43.0
> 
>
Arthur Fabre March 10, 2025, 3:52 p.m. UTC | #2
On Mon Mar 10, 2025 at 11:50 AM CET, Lorenzo Bianconi wrote:
> > From: Arthur Fabre <afabre@cloudflare.com>
> > 
> > When copying trait values to or from the caller, the size isn't a
> > constant so memcpy() ends up being a function call.
> > 
> > Replace it with an inline implementation that only handles the sizes we
> > support.
> > 
> > We store values "packed", so they won't necessarily be 4 or 8 byte
> > aligned.
> > 
> > Setting and getting traits is roughly ~40% faster.
>
> Nice! I guess in a formal series this patch can be squashed with patch 1/20
> (adding some comments).

Happy to squash and add comments instead if that's better :)

>
> Regards,
> Lorenzo
>
> > 
> > Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> > ---
> >  include/net/trait.h | 25 +++++++++++++++++++------
> >  1 file changed, 19 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/net/trait.h b/include/net/trait.h
> > index 536b8a17dbbc091b4d1a4d7b4b21c1e36adea86a..d4581a877bd57a32e2ad032147c906764d6d37f8 100644
> > --- a/include/net/trait.h
> > +++ b/include/net/trait.h
> > @@ -7,6 +7,7 @@
> >  #include <linux/errno.h>
> >  #include <linux/string.h>
> >  #include <linux/bitops.h>
> > +#include <linux/unaligned.h>
> >  
> >  /* Traits are a very limited KV store, with:
> >   * - 64 keys (0-63).
> > @@ -145,23 +146,23 @@ int trait_set(void *traits, void *hard_end, u64 key, const void *val, u64 len, u
> >  			memmove(traits + off + len, traits + off, traits_size(traits) - off);
> >  	}
> >  
> > -	/* Set our value. */
> > -	memcpy(traits + off, val, len);
> > -
> > -	/* Store our length in header. */
> >  	u64 encode_len = 0;
> > -
> >  	switch (len) {
> >  	case 2:
> > +		/* Values are least two bytes, so they'll be two byte aligned */
> > +		*(u16 *)(traits + off) = *(u16 *)val;
> >  		encode_len = 1;
> >  		break;
> >  	case 4:
> > +		put_unaligned(*(u32 *)val, (u32 *)(traits + off));
> >  		encode_len = 2;
> >  		break;
> >  	case 8:
> > +		put_unaligned(*(u64 *)val, (u64 *)(traits + off));
> >  		encode_len = 3;
> >  		break;
> >  	}
> > +
> >  	h->high |= (encode_len >> 1) << key;
> >  	h->low |= (encode_len & 1) << key;
> >  	return 0;
> > @@ -201,7 +202,19 @@ int trait_get(void *traits, u64 key, void *val, u64 val_len)
> >  	if (real_len > val_len)
> >  		return -ENOSPC;
> >  
> > -	memcpy(val, traits + off, real_len);
> > +	switch (real_len) {
> > +	case 2:
> > +		/* Values are least two bytes, so they'll be two byte aligned */
> > +		*(u16 *)val = *(u16 *)(traits + off);
> > +		break;
> > +	case 4:
> > +		*(u32 *)val = get_unaligned((u32 *)(traits + off));
> > +		break;
> > +	case 8:
> > +		*(u64 *)val = get_unaligned((u64 *)(traits + off));
> > +		break;
> > +	}
> > +
> >  	return real_len;
> >  }
> >  
> > 
> > -- 
> > 2.43.0
> > 
> >
David Laight March 10, 2025, 10:15 p.m. UTC | #3
On Wed, 05 Mar 2025 15:32:02 +0100
arthur@arthurfabre.com wrote:

> From: Arthur Fabre <afabre@cloudflare.com>
> 
> When copying trait values to or from the caller, the size isn't a
> constant so memcpy() ends up being a function call.
> 
> Replace it with an inline implementation that only handles the sizes we
> support.
> 
> We store values "packed", so they won't necessarily be 4 or 8 byte
> aligned.
> 
> Setting and getting traits is roughly ~40% faster.
> 
> Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> ---
>  include/net/trait.h | 25 +++++++++++++++++++------
>  1 file changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/include/net/trait.h b/include/net/trait.h
> index 536b8a17dbbc091b4d1a4d7b4b21c1e36adea86a..d4581a877bd57a32e2ad032147c906764d6d37f8 100644
> --- a/include/net/trait.h
> +++ b/include/net/trait.h
> @@ -7,6 +7,7 @@
>  #include <linux/errno.h>
>  #include <linux/string.h>
>  #include <linux/bitops.h>
> +#include <linux/unaligned.h>
>  
>  /* Traits are a very limited KV store, with:
>   * - 64 keys (0-63).
> @@ -145,23 +146,23 @@ int trait_set(void *traits, void *hard_end, u64 key, const void *val, u64 len, u
>  			memmove(traits + off + len, traits + off, traits_size(traits) - off);
>  	}
>  
> -	/* Set our value. */
> -	memcpy(traits + off, val, len);
> -
> -	/* Store our length in header. */
>  	u64 encode_len = 0;
> -
>  	switch (len) {
>  	case 2:
> +		/* Values are least two bytes, so they'll be two byte aligned */
> +		*(u16 *)(traits + off) = *(u16 *)val;
>  		encode_len = 1;
>  		break;
>  	case 4:
> +		put_unaligned(*(u32 *)val, (u32 *)(traits + off));
>  		encode_len = 2;
>  		break;
>  	case 8:
> +		put_unaligned(*(u64 *)val, (u64 *)(traits + off));
>  		encode_len = 3;
>  		break;
>  	}
> +
>  	h->high |= (encode_len >> 1) << key;
>  	h->low |= (encode_len & 1) << key;
>  	return 0;
> @@ -201,7 +202,19 @@ int trait_get(void *traits, u64 key, void *val, u64 val_len)
>  	if (real_len > val_len)
>  		return -ENOSPC;
>  
> -	memcpy(val, traits + off, real_len);
> +	switch (real_len) {
> +	case 2:
> +		/* Values are least two bytes, so they'll be two byte aligned */
> +		*(u16 *)val = *(u16 *)(traits + off);
> +		break;
> +	case 4:
> +		*(u32 *)val = get_unaligned((u32 *)(traits + off));
> +		break;
> +	case 8:
> +		*(u64 *)val = get_unaligned((u64 *)(traits + off));
> +		break;

Should there be a 'default' in here?
Possibly just 'return 0'?

> +	}
> +
>  	return real_len;
>  }
>  
>
diff mbox series

Patch

diff --git a/include/net/trait.h b/include/net/trait.h
index 536b8a17dbbc091b4d1a4d7b4b21c1e36adea86a..d4581a877bd57a32e2ad032147c906764d6d37f8 100644
--- a/include/net/trait.h
+++ b/include/net/trait.h
@@ -7,6 +7,7 @@ 
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/bitops.h>
+#include <linux/unaligned.h>
 
 /* Traits are a very limited KV store, with:
  * - 64 keys (0-63).
@@ -145,23 +146,23 @@  int trait_set(void *traits, void *hard_end, u64 key, const void *val, u64 len, u
 			memmove(traits + off + len, traits + off, traits_size(traits) - off);
 	}
 
-	/* Set our value. */
-	memcpy(traits + off, val, len);
-
-	/* Store our length in header. */
 	u64 encode_len = 0;
-
 	switch (len) {
 	case 2:
+		/* Values are at least two bytes, so they'll be two byte aligned */
+		*(u16 *)(traits + off) = *(u16 *)val;
 		encode_len = 1;
 		break;
 	case 4:
+		put_unaligned(*(u32 *)val, (u32 *)(traits + off));
 		encode_len = 2;
 		break;
 	case 8:
+		put_unaligned(*(u64 *)val, (u64 *)(traits + off));
 		encode_len = 3;
 		break;
 	}
+
 	h->high |= (encode_len >> 1) << key;
 	h->low |= (encode_len & 1) << key;
 	return 0;
@@ -201,7 +202,19 @@  int trait_get(void *traits, u64 key, void *val, u64 val_len)
 	if (real_len > val_len)
 		return -ENOSPC;
 
-	memcpy(val, traits + off, real_len);
+	switch (real_len) {
+	case 2:
+		/* Values are at least two bytes, so they'll be two byte aligned */
+		*(u16 *)val = *(u16 *)(traits + off);
+		break;
+	case 4:
+		*(u32 *)val = get_unaligned((u32 *)(traits + off));
+		break;
+	case 8:
+		*(u64 *)val = get_unaligned((u64 *)(traits + off));
+		break;
+	}
+
 	return real_len;
 }