
[RFC,bpf-next,01/20] trait: limited KV store for packet metadata

Message ID 20250305-afabre-traits-010-rfc2-v1-1-d0ecfb869797@cloudflare.com (mailing list archive)
State RFC
Delegated to: BPF
Series traits: Per packet metadata KV store

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-42 fail Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-46 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-33 fail Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 4 maintainers not CCed: kuba@kernel.org pabeni@redhat.com horms@kernel.org edumazet@google.com
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 100 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Arthur Fabre March 5, 2025, 2:31 p.m. UTC
From: Arthur Fabre <afabre@cloudflare.com>

A (very limited) KV store to support storing KVs in the packet headroom,
with:
- 64 keys (0-63).
- 2, 4 or 8 byte values.
- O(1) lookup
- O(n) insertion
- A fixed 16 byte header.

Values are stored ordered by key, immediately following the fixed
header.

This could be extended in the future, for now it implements the smallest
possible API. The API intentionally uses u64 keys to not impose
restrictions on the implementation in the future.

I picked 2, 4, and 8 bytes arbitrarily. We could also support 0 sized
values for use as flags.
A 16 byte value could be useful to store UUIDs and IPv6 addresses.
If we want more than 3 sizes, we can add a word to the header (for a
total of 24 bytes) to support 7 sizes.

We could also allow users to set several consecutive keys in one
trait_set() call to support storing larger values.

Implemented in the header file so functions are always inlinable.
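
For illustration only, a minimal usage sketch of the API added by this
patch, assuming a caller with a pointer to free headroom and a hard end it
must not write past (the function name, pointers and key number are made
up):

#include <net/trait.h>

static int example_traits(void *head, void *hard_end)
{
        u32 mark = 42;
        u64 out = 0;
        int err, len;

        err = traits_init(head, hard_end);
        if (err)
                return err;

        /* Store a 4 byte value under an arbitrary key. */
        err = trait_set(head, hard_end, 5, &mark, sizeof(mark), 0);
        if (err)
                return err;

        /* Read it back; returns the stored length (4) on success. */
        len = trait_get(head, 5, &out, sizeof(out));
        if (len < 0)
                return len;

        return trait_del(head, 5);
}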

Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
---
 include/net/trait.h | 243 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 243 insertions(+)

Comments

Alexei Starovoitov March 7, 2025, 6:36 a.m. UTC | #1
On Wed, Mar 5, 2025 at 6:33 AM <arthur@arthurfabre.com> wrote:
>
> +struct __trait_hdr {
> +       /* Values are stored ordered by key, immediately after the header.
> +        *
> +        * The size of each value set is stored in the header as two bits:
> +        *  - 00: Not set.
> +        *  - 01: 2 bytes.
> +        *  - 10: 4 bytes.
> +        *  - 11: 8 bytes.

...

> +        *  - hweight(low) + hweight(high)<<1 is offset.

the comment doesn't match the code

> +        */
> +       u64 high;
> +       u64 low;

...

> +static __always_inline int __trait_total_length(struct __trait_hdr h)
> +{
> +       return (hweight64(h.low) << 1) + (hweight64(h.high) << 2)
> +               // For size 8, we only get 4+2=6. Add another 2 in.
> +               + (hweight64(h.high & h.low) << 1);
> +}

This is really cool idea, but 2 byte size doesn't feel that useful.
How about:
- 00: Not set.
- 01: 4 bytes.
- 10: 8 bytes.
- 11: 16 bytes.

4 byte may be useful for ipv4, 16 for ipv6, and 8 is just a good number.
And compute the same way with 3 popcount with extra +1 to shifts.
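
For reference, a sketch of what __trait_total_length() could look like
under that 4/8/16 encoding, i.e. the existing computation with each shift
bumped by one (not part of the patch, just the suggestion above):

static __always_inline int __trait_total_length(struct __trait_hdr h)
{
        /* 01 -> 4, 10 -> 8, 11 -> 8 + 4 + 4 = 16. */
        return (hweight64(h.low) << 2) + (hweight64(h.high) << 3)
                + (hweight64(h.high & h.low) << 2);
}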
Arthur Fabre March 7, 2025, 11:14 a.m. UTC | #2
On Fri Mar 7, 2025 at 7:36 AM CET, Alexei Starovoitov wrote:
> On Wed, Mar 5, 2025 at 6:33 AM <arthur@arthurfabre.com> wrote:
> >
> > +struct __trait_hdr {
> > +       /* Values are stored ordered by key, immediately after the header.
> > +        *
> > +        * The size of each value set is stored in the header as two bits:
> > +        *  - 00: Not set.
> > +        *  - 01: 2 bytes.
> > +        *  - 10: 4 bytes.
> > +        *  - 11: 8 bytes.
>
> ...
>
> > +        *  - hweight(low) + hweight(high)<<1 is offset.
>
> the comment doesn't match the code
>
> > +        */
> > +       u64 high;
> > +       u64 low;
>
> ...
>
> > +static __always_inline int __trait_total_length(struct __trait_hdr h)
> > +{
> > +       return (hweight64(h.low) << 1) + (hweight64(h.high) << 2)
> > +               // For size 8, we only get 4+2=6. Add another 2 in.
> > +               + (hweight64(h.high & h.low) << 1);
> > +}
>
> This is really cool idea, but 2 byte size doesn't feel that useful.
> How about:
> - 00: Not set.
> - 01: 4 bytes.
> - 10: 8 bytes.
> - 11: 16 bytes.
>
> 4 byte may be useful for ipv4, 16 for ipv6, and 8 is just a good number.
> And compute the same way with 3 popcount with extra +1 to shifts.

I chose the sizes arbitrarily, happy to change them.

16 is also useful for UUIDs, for tracing.

Size 0 could store bools / flags. Keys could be set without a value, 
and users could check if the key is set or not.
That replaces single bits of the mark today, for example a
"route locally" key.

That only leaves one other size, maybe 4 for smaller values?

If we want more sizes, we could also:

- Add another u64 word to the header, so we have 3 bits per key. It
  uses more room, and we need more popcnts, but most modern x86 CPUs can
  do 3 popcnts in parallel so it could be ok.

- Let users set consecutive keys to one big value. Instead of supporting
  size 16, we let them set two 8 byte KVs in one trait_set() call and
  provide a 16 byte value. Eg:

	trait_set_batch(u64 key_from, u64 key_to, size, ...);

  It's easy to implement, but it makes the API more complicated.
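
For the sake of discussion, one possible shape for such a helper, sketched
as if it lived next to trait_set() in include/net/trait.h with 8 bytes per
key (the name comes from the example above; the semantics and error
handling are only a sketch):

static __always_inline
int trait_set_batch(void *traits, void *hard_end, u64 key_from, u64 key_to,
		    const void *val, u64 len, u64 flags)
{
        u64 key;
        int err;

        if (key_to < key_from || len != (key_to - key_from + 1) * 8)
                return -EINVAL;

        for (key = key_from; key <= key_to; key++) {
                /* A real version would check space up front or undo on failure. */
                err = trait_set(traits, hard_end, key, val, 8, flags);
                if (err)
                        return err;
                val += 8;
        }
        return 0;
}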
Alexei Starovoitov March 7, 2025, 5:29 p.m. UTC | #3
On Fri, Mar 7, 2025 at 3:14 AM Arthur Fabre <arthur@arthurfabre.com> wrote:
>
> On Fri Mar 7, 2025 at 7:36 AM CET, Alexei Starovoitov wrote:
> > On Wed, Mar 5, 2025 at 6:33 AM <arthur@arthurfabre.com> wrote:
> > >
> > > +struct __trait_hdr {
> > > +       /* Values are stored ordered by key, immediately after the header.
> > > +        *
> > > +        * The size of each value set is stored in the header as two bits:
> > > +        *  - 00: Not set.
> > > +        *  - 01: 2 bytes.
> > > +        *  - 10: 4 bytes.
> > > +        *  - 11: 8 bytes.
> >
> > ...
> >
> > > +        *  - hweight(low) + hweight(high)<<1 is offset.
> >
> > the comment doesn't match the code
> >
> > > +        */
> > > +       u64 high;
> > > +       u64 low;
> >
> > ...
> >
> > > +static __always_inline int __trait_total_length(struct __trait_hdr h)
> > > +{
> > > +       return (hweight64(h.low) << 1) + (hweight64(h.high) << 2)
> > > +               // For size 8, we only get 4+2=6. Add another 2 in.
> > > +               + (hweight64(h.high & h.low) << 1);
> > > +}
> >
> > This is really cool idea, but 2 byte size doesn't feel that useful.
> > How about:
> > - 00: Not set.
> > - 01: 4 bytes.
> > - 10: 8 bytes.
> > - 11: 16 bytes.
> >
> > 4 byte may be useful for ipv4, 16 for ipv6, and 8 is just a good number.
> > And compute the same way with 3 popcount with extra +1 to shifts.
>
> I chose the sizes arbitrarily, happy to change them.
>
> 16 is also useful for UUIDs, for tracing.
>
> Size 0 could store bools / flags. Keys could be set without a value,
> and users could check if the key is set or not.
> That replaces single bits of the mark today, for example a
> "route locally" key.

I don't understand how that would work.
If I'm reading the code correctly 00 means that key is not present.
How one would differentiate key present/not with zero size in value?

>
> That only leaves one other size, maybe 4 for smaller values?
>
> If we want more sizes, we could also:
>
> - Add another u64 word to the header, so we have 3 bits per key. It
>   uses more room, and we need more popcnts, but most modern x86 CPUs can
>   do 3 popcnts in parallel so it could be ok.

Two u64s already need 3 pop counts, so it's a good compromise as-is.

> - Let users set consecutive keys to one big value. Instead of supporting
>   size 16, we let them set two 8 byte KVs in one trait_set() call and
>   provide a 16 byte value. Eg:
>
>         trait_set_batch(u64 key_from, u64_key_to, size, ...);
>
>   It's easy to implement, but it makes the API more complicated.

I don't think it complicates the api.
With max size 16 the user can put two consecutive keys of, say, 16 and 8
to make 24 bytes of info,
or use 4 keys of 16 byte each to form 64-bytes of info.
The bit manipulations are too tricky for compilers to optimize.
So even with full inlining the two trait_set() of consecutive keys
will still be largely separate blobs of code.
So trait_[gs]et_batch() makes sense to me.

Also let's not over focus on networking use cases.
This mini KV will be useful in all bpf maps including local storage.
For example the user process can add information about itself into
task local storage while sched-ext can use that to make scheduling decisions.
Jakub Sitnicki March 7, 2025, 7:24 p.m. UTC | #4
On Wed, Mar 05, 2025 at 03:31 PM +01, arthur@arthurfabre.com wrote:
> From: Arthur Fabre <afabre@cloudflare.com>
>
> A (very limited) KV store to support storing KVs in the packet headroom,
> with:
> - 64 keys (0-63).
> - 2, 4 or 8 byte values.
> - O(1) lookup
> - O(n) insertion
> - A fixed 16 byte header.
>
> Values are stored ordered by key, immediately following the fixed
> header.
>
> This could be extended in the future, for now it implements the smallest
> possible API. The API intentionally uses u64 keys to not impose
> restrictions on the implementation in the future.
>
> I picked 2, 4, and 8 bytes arbitrarily. We could also support 0 sized
> values for use as flags.
> A 16 byte value could be useful to store UUIDs and IPv6 addresses.
> If we want more than 3 sizes, we can add a word to the header (for a
> total of 24 bytes) to support 7 sizes.
>
> We could also allow users to set several consecutive keys in one
> trait_set() call to support storing larger values.
>
> Implemented in the header file so functions are always inlinable.
>
> Signed-off-by: Arthur Fabre <afabre@cloudflare.com>
> ---

[...]

> +/**
> + * trait_get() - Get a trait key.
> + * @traits: Start of trait store area.
> + * @key: The key to get.
> + * @val: Where to put stored value.
> + * @val_len: The length of val.
> + *
> + * Return:
> + * * %>0      - Actual size of value.
> + * * %-EINVAL - Key or length invalid.
> + * * %-ENOENT - Key has not been set with trait_set() previously.
> + * * %-ENOSPC - Val is not big enough to hold stored value.
> + */
> +static __always_inline
> +int trait_get(void *traits, u64 key, void *val, u64 val_len)

Hmm. I think that passing void *val will bite us on big endian.
Seems I can't pass a u64 * if I don't know the value size up front.
For values under 8 bytes I would end up with bytes shifted to the left.
No?

Take u64 *val instead and copy it in at the right offset?

> +{
> +	if (!__trait_valid_key(key))
> +		return -EINVAL;
> +
> +	struct __trait_hdr h = *(struct __trait_hdr *)traits;
> +
> +	/* Check key is set */
> +	if (!((h.high & (1ull << key)) || (h.low & (1ull << key))))
> +		return -ENOENT;
> +
> +	/* Offset of value of this key */
> +	int off = __trait_offset(h, key);
> +
> +	/* Figure out our length */
> +	int real_len = __trait_total_length(__trait_and(h, (1ull << key)));
> +
> +	if (real_len > val_len)
> +		return -ENOSPC;
> +
> +	memcpy(val, traits + off, real_len);
> +	return real_len;
> +}
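
To spell out the concern with a sketch (the traits pointer and key are
placeholders, not code from the patch): if a 4 byte value was stored and
the caller passes a u64 buffer,

        u32 stored = 42;        /* previously trait_set() with len 4 */
        u64 out = 0;

        /* trait_get() memcpy()s the 4 stored bytes to the start of &out.
         * Little endian: out == 42. Big endian: those are the most
         * significant bytes, so out == (u64)42 << 32.
         */
        trait_get(traits, key, &out, sizeof(out));

Taking a u64 *val and copying into its low-order end, as suggested, would
sidestep that.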
Arthur Fabre March 10, 2025, 2:45 p.m. UTC | #5
On Fri Mar 7, 2025 at 6:29 PM CET, Alexei Starovoitov wrote:
> On Fri, Mar 7, 2025 at 3:14 AM Arthur Fabre <arthur@arthurfabre.com> wrote:
> >
> > On Fri Mar 7, 2025 at 7:36 AM CET, Alexei Starovoitov wrote:
> > > On Wed, Mar 5, 2025 at 6:33 AM <arthur@arthurfabre.com> wrote:
> > > >
> > > > +struct __trait_hdr {
> > > > +       /* Values are stored ordered by key, immediately after the header.
> > > > +        *
> > > > +        * The size of each value set is stored in the header as two bits:
> > > > +        *  - 00: Not set.
> > > > +        *  - 01: 2 bytes.
> > > > +        *  - 10: 4 bytes.
> > > > +        *  - 11: 8 bytes.
> > >
> > > ...
> > >
> > > > +        *  - hweight(low) + hweight(high)<<1 is offset.
> > >
> > > the comment doesn't match the code
> > >
> > > > +        */
> > > > +       u64 high;
> > > > +       u64 low;
> > >
> > > ...
> > >
> > > > +static __always_inline int __trait_total_length(struct __trait_hdr h)
> > > > +{
> > > > +       return (hweight64(h.low) << 1) + (hweight64(h.high) << 2)
> > > > +               // For size 8, we only get 4+2=6. Add another 2 in.
> > > > +               + (hweight64(h.high & h.low) << 1);
> > > > +}
> > >
> > > This is really cool idea, but 2 byte size doesn't feel that useful.
> > > How about:
> > > - 00: Not set.
> > > - 01: 4 bytes.
> > > - 10: 8 bytes.
> > > - 11: 16 bytes.
> > >
> > > 4 byte may be useful for ipv4, 16 for ipv6, and 8 is just a good number.
> > > And compute the same way with 3 popcount with extra +1 to shifts.
> >
> > I chose the sizes arbitrarily, happy to change them.
> >
> > 16 is also useful for UUIDs, for tracing.
> >
> > Size 0 could store bools / flags. Keys could be set without a value,
> > and users could check if the key is set or not.
> > That replaces single bits of the mark today, for example a
> > "route locally" key.
>
> I don't understand how that would work.
> If I'm reading the code correctly 00 means that key is not present.
> How one would differentiate key present/not with zero size in value?

It isn't implemented right now; 0 could just be one of the three sizes.
Eg:

- 00: key not set
- 01: key set, no value
- 10: key set, 4 bytes
- 11: key set, 16 bytes

We wouldn't popcnt the low bit, just the high bit, and high & low.
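
Concretely, the length computation for that encoding might look like this
(a sketch of the variant described above, not what the patch does):

static __always_inline int __trait_total_length(struct __trait_hdr h)
{
        /* 01 -> 0 bytes, 10 -> 4 bytes, 11 -> 4 + 12 = 16 bytes.
         * Only the high word and high & low need a popcount.
         */
        return (hweight64(h.high) << 2) + hweight64(h.high & h.low) * 12;
}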

>
> >
> > That only leaves one other size, maybe 4 for smaller values?
> >
> > If we want more sizes, we could also:
> >
> > - Add another u64 word to the header, so we have 3 bits per key. It
> >   uses more room, and we need more popcnts, but most modern x86 CPUs can
> >   do 3 popcnts in parallel so it could be ok.
>
> Two u64s already need 3 pop counts, so it's a good compromise as-is.
>
> > - Let users set consecutive keys to one big value. Instead of supporting
> >   size 16, we let them set two 8 byte KVs in one trait_set() call and
> >   provide a 16 byte value. Eg:
> >
> >         trait_set_batch(u64 key_from, u64 key_to, size, ...);
> >
> >   It's easy to implement, but it makes the API more complicated.
>
> I don't think it complicates the api.
> With max size 16 the user can put two consecutive keys of, say, 16 and 8
> to make 24 bytes of info,
> or use 4 keys of 16 byte each to form 64-bytes of info.
> The bit manipulations are too tricky for compilers to optimize.
> So even with full inlining the two trait_set() of consecutive keys
> will still be largely separate blobs of code.
> So trait_[gs]et_batch() makes sense to me.

Ok! I'll respin this with a batch API too.

>
> Also let's not over focus on networking use cases.
> This mini KV will be useful in all bpf maps including local storage.
> For example the user process can add information about itself into
> task local storage while sched-ext can use that to make scheduling decisions.

Good point, we've been focusing on networking only. Are you thinking
of the value sizes, or is there something else that would help for other
use cases?

Patch

diff --git a/include/net/trait.h b/include/net/trait.h
new file mode 100644
index 0000000000000000000000000000000000000000..536b8a17dbbc091b4d1a4d7b4b21c1e36adea86a
--- /dev/null
+++ b/include/net/trait.h
@@ -0,0 +1,243 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2025 Arthur Fabre, Cloudflare */
+#ifndef __LINUX_NET_TRAIT_H__
+#define __LINUX_NET_TRAIT_H__
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/bitops.h>
+
+/* Traits are a very limited KV store, with:
+ * - 64 keys (0-63).
+ * - 2, 4 or 8 byte values.
+ * - O(1) lookup
+ * - O(n) insertion
+ * - A fixed 16 byte header.
+ */
+struct __trait_hdr {
+	/* Values are stored ordered by key, immediately after the header.
+	 *
+	 * The size of each value set is stored in the header as two bits:
+	 *  - 00: Not set.
+	 *  - 01: 2 bytes.
+	 *  - 10: 4 bytes.
+	 *  - 11: 8 bytes.
+	 *
+	 * Naively storing the size of each value (eg packing 4 of them in a byte)
+	 * doesn't let us efficiently calculate the offset of the value of an arbitrary key:
+	 * we'd have to mask out and sum the sizes of all preceding values.
+	 *
+	 * Instead, store the high and low bits of the size in two separate words:
+	 *  - A high bit in the high word.
+	 *  - A low bit in the low word.
+	 * The bit position of each word, LSb 0, is the key.
+	 *
+	 * To calculate the offset of the value of a key:
+	 *  - Mask out subsequent keys from both words.
+	 *  - Calculate the hamming weight / population count of each word.
+	 *    Single instruction on modern amd64 CPUs.
+	 *  - hweight(low) + hweight(high)<<1 is offset.
+	 */
+	u64 high;
+	u64 low;
+};
+
+static __always_inline bool __trait_valid_len(u64 len)
+{
+	return len == 2 || len == 4 || len == 8;
+}
+
+static __always_inline bool __trait_valid_key(u64 key)
+{
+	return key < 64;
+}
+
+static __always_inline int __trait_total_length(struct __trait_hdr h)
+{
+	return (hweight64(h.low) << 1) + (hweight64(h.high) << 2)
+		// For size 8, we only get 4+2=6. Add another 2 in.
+		+ (hweight64(h.high & h.low) << 1);
+}
+
+static __always_inline struct __trait_hdr __trait_and(struct __trait_hdr h, u64 mask)
+{
+	return (struct __trait_hdr){
+		h.high & mask,
+		h.low & mask,
+	};
+}
+
+static __always_inline int __trait_offset(struct __trait_hdr h, u64 key)
+{
+	/* Calculate total length of previous keys by masking out keys after. */
+	return sizeof(struct __trait_hdr) + __trait_total_length(__trait_and(h, ~(~0llu << key)));
+}
+
+/**
+ * traits_init() - Initialize a trait store.
+ * @traits: Start of trait store area.
+ * @hard_end: Hard limit the trait store can currently grow up against.
+ *            Can change dynamically after initialization, as long as it
+ *            does not overwrite any area already used (see traits_size()).
+ *
+ * Return:
+ * * %0       - Success.
+ * * %-ENOMEM - Not enough room to store any traits.
+ */
+static __always_inline int traits_init(void *traits, void *hard_end)
+{
+	if (traits + sizeof(struct __trait_hdr) > hard_end)
+		return -ENOMEM;
+
+	memset(traits, 0, sizeof(struct __trait_hdr));
+	return 0;
+}
+
+/**
+ * traits_size() - Total size currently used by a trait store.
+ * @traits: Start of trait store area.
+ *
+ * Return: Size in bytes.
+ */
+static __always_inline int traits_size(void *traits)
+{
+	return sizeof(struct __trait_hdr) + __trait_total_length(*(struct __trait_hdr *)traits);
+}
+
+/**
+ * trait_set() - Set a trait key.
+ * @traits: Start of trait store area.
+ * @hard_end: Hard limit the trait store can currently grow up against.
+ * @key: The key to set.
+ * @val: The value to set.
+ * @len: The length of the value.
+ * @flags: Unused for now. Should be 0.
+ *
+ * Return:
+ * * %0       - Success.
+ * * %-EINVAL - Key or length invalid.
+ * * %-EBUSY  - Key previously set with different length.
+ * * %-ENOSPC - Not enough room left to store value.
+ */
+static __always_inline
+int trait_set(void *traits, void *hard_end, u64 key, const void *val, u64 len, u64 flags)
+{
+	if (!__trait_valid_key(key) || !__trait_valid_len(len))
+		return -EINVAL;
+
+	struct __trait_hdr *h = (struct __trait_hdr *)traits;
+
+	/* Offset of value of this key. */
+	int off = __trait_offset(*h, key);
+
+	if ((h->high & (1ull << key)) || (h->low & (1ull << key))) {
+		/* Key is already set, but with a different length */
+		if (__trait_total_length(__trait_and(*h, (1ull << key))) != len)
+			return -EBUSY;
+	} else {
+		/* Figure out if we have enough room left: total length of everything now. */
+		if (traits + sizeof(struct __trait_hdr) + __trait_total_length(*h) + len > hard_end)
+			return -ENOSPC;
+
+		/* Memmove all the kvs after us over. */
+		if (traits_size(traits) > off)
+			memmove(traits + off + len, traits + off, traits_size(traits) - off);
+	}
+
+	/* Set our value. */
+	memcpy(traits + off, val, len);
+
+	/* Store our length in header. */
+	u64 encode_len = 0;
+
+	switch (len) {
+	case 2:
+		encode_len = 1;
+		break;
+	case 4:
+		encode_len = 2;
+		break;
+	case 8:
+		encode_len = 3;
+		break;
+	}
+	h->high |= (encode_len >> 1) << key;
+	h->low |= (encode_len & 1) << key;
+	return 0;
+}
+
+/**
+ * trait_get() - Get a trait key.
+ * @traits: Start of trait store area.
+ * @key: The key to get.
+ * @val: Where to put stored value.
+ * @val_len: The length of val.
+ *
+ * Return:
+ * * %>0      - Actual size of value.
+ * * %-EINVAL - Key or length invalid.
+ * * %-ENOENT - Key has not been set with trait_set() previously.
+ * * %-ENOSPC - Val is not big enough to hold stored value.
+ */
+static __always_inline
+int trait_get(void *traits, u64 key, void *val, u64 val_len)
+{
+	if (!__trait_valid_key(key))
+		return -EINVAL;
+
+	struct __trait_hdr h = *(struct __trait_hdr *)traits;
+
+	/* Check key is set */
+	if (!((h.high & (1ull << key)) || (h.low & (1ull << key))))
+		return -ENOENT;
+
+	/* Offset of value of this key */
+	int off = __trait_offset(h, key);
+
+	/* Figure out our length */
+	int real_len = __trait_total_length(__trait_and(h, (1ull << key)));
+
+	if (real_len > val_len)
+		return -ENOSPC;
+
+	memcpy(val, traits + off, real_len);
+	return real_len;
+}
+
+/**
+ * trait_del() - Delete a trait key.
+ * @traits: Start of trait store area.
+ * @key: The key to delete.
+ *
+ * Return:
+ * * %0       - Success.
+ * * %-EINVAL - Key or length invalid.
+ * * %-ENOENT - Key has not been set with trait_set() previously.
+ */
+static __always_inline int trait_del(void *traits, u64 key)
+{
+	if (!__trait_valid_key(key))
+		return -EINVAL;
+
+	struct __trait_hdr *h = (struct __trait_hdr *)traits;
+
+	/* Check key is set */
+	if (!((h->high & (1ull << key)) || (h->low & (1ull << key))))
+		return -ENOENT;
+
+	/* Offset and length of value of this key */
+	int off = __trait_offset(*h, key);
+	int len = __trait_total_length(__trait_and(*h, (1ull << key)));
+
+	/* Memmove all the kvs after us over */
+	if (traits_size(traits) > off + len)
+		memmove(traits + off, traits + off + len, traits_size(traits) - off - len);
+
+	/* Clear our length in header */
+	h->high &= ~(1ull << key);
+	h->low &= ~(1ull << key);
+	return 0;
+}
+
+#endif /* __LINUX_NET_TRAIT_H__ */
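
As a worked example of the header arithmetic above (illustrative values,
not from the patch): suppose key 3 holds a 4 byte value (encoded as 10)
and key 10 holds an 8 byte value (encoded as 11), so h.high has bits 3 and
10 set and h.low has bit 10 set. __trait_offset() for key 10 masks out
keys 10 and above, leaving only key 3:

        prior length = (hweight(low) << 1) + (hweight(high) << 2) + (hweight(high & low) << 1)
                     = (0 << 1) + (1 << 2) + (0 << 1) = 4
        offset       = sizeof(struct __trait_hdr) + 4 = 20

The total length of all values is (1 << 1) + (2 << 2) + (1 << 1) = 12, so
traits_size() returns 16 + 12 = 28.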