diff mbox series

[bpf-next,v3,2/3] selftests/bpf: Introduce arena spin lock

Message ID 20250305011849.1168917-3-memxor@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series: Arena Spin Lock

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 11 maintainers not CCed: jolsa@kernel.org kpsingh@kernel.org linux-kselftest@vger.kernel.org sdf@fomichev.me john.fastabend@gmail.com haoluo@google.com mykolal@fb.com martin.lau@linux.dev shuah@kernel.org song@kernel.org yonghong.song@linux.dev
netdev/build_clang success Errors and warnings before: 1 this patch: 1
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch fail
    + bpf_printk("RUNTIME ERROR: %s idx=%u and cpu=%u are out-of-bounds!!!", __func__, idx, cpu);
    CHECK: Blank lines aren't necessary after an open brace '{'
    CHECK: Macro argument 'p' may be better as '(p)' to avoid precedence issues
    CHECK: Prefer using the BIT macro
    CHECK: Unnecessary parentheses around qnodes[bpf_get_smp_processor_id() + 1]
    CHECK: spaces preferred around that '*' (ctx:WxV)
    ERROR: spaces required around that ':' (ctx:VxW)
    ERROR: trailing statements should be on next line
    WARNING: Improper SPDX comment style for 'tools/testing/selftests/bpf/bpf_arena_spin_lock.h', please use '/*' instead
    WARNING: Improper SPDX comment style for 'tools/testing/selftests/bpf/bpf_atomic.h', please use '/*' instead
    WARNING: Missing a blank line after declarations
    WARNING: Missing or malformed SPDX-License-Identifier tag in line 1
    WARNING: Prefer 'signed int' to bare use of 'signed'
    WARNING: Prefer 'unsigned int' to bare use of 'unsigned'
    WARNING: Use of volatile is usually wrong: see Documentation/process/volatile-considered-harmful.rst
    WARNING: added, moved or deleted file(s), does MAINTAINERS need updating?
    WARNING: labels should not be indented
    WARNING: line length of 81 exceeds 80 columns
    WARNING: line length of 83 exceeds 80 columns
    WARNING: line length of 84 exceeds 80 columns
    WARNING: line length of 87 exceeds 80 columns
    WARNING: line length of 89 exceeds 80 columns
    WARNING: line length of 91 exceeds 80 columns
    WARNING: line length of 95 exceeds 80 columns
    WARNING: line length of 98 exceeds 80 columns
    WARNING: line length of 99 exceeds 80 columns
    WARNING: memory barrier without comment
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 0 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Kumar Kartikeya Dwivedi March 5, 2025, 1:18 a.m. UTC
Implement queued spin lock algorithm as BPF program for lock words
living in BPF arena.

The algorithm is copied from kernel/locking/qspinlock.c and adapted for
BPF use.

We first implement abstract helpers for portable atomics and
acquire/release loads, relying on the presence of X86_64 and on
implementation details of its JIT to elide expensive barriers, and
falling back to slow but correct implementations elsewhere. Once
support for acquire/release loads/stores lands, this state can be
improved.
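
As a rough sketch (the actual helpers are added in bpf_atomic.h below),
the acquire load boils down to a plain load plus a compiler barrier on
x86-64, and falls back to a full barrier built from an atomic RMW
elsewhere:

  extern bool CONFIG_X86_64 __kconfig __weak;

  /* Full barrier fallback: an atomic RMW purely for its ordering effect. */
  #define smp_mb() ({ unsigned long __val; __sync_fetch_and_add(&__val, 0); })

  /* Acquire load: plain load + compiler barrier on x86-64, smp_mb() elsewhere. */
  #define smp_load_acquire(p)                                  \
          ({                                                   \
                  __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \
                  if (!CONFIG_X86_64)                          \
                          smp_mb();                            \
                  barrier();                                   \
                  __v;                                         \
          })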

Then, the qspinlock algorithm is adapted to remove the dependence on
multi-word atomics, which are not supported by the BPF ISA. For
instance, xchg_tail cannot use a 16-bit xchg and needs to be
implemented as a 32-bit try_cmpxchg loop.
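
Concretely, the tail exchange preserves the locked and pending bits of
the 32-bit lock word and retries with try_cmpxchg (this is the core of
xchg_tail() in the patch below):

  old = atomic_read(&lock->val);
  do {
          /* Keep locked + pending bits, install the new tail. */
          new = (old & _Q_LOCKED_PENDING_MASK) | tail;
          cond_break_label(out);  /* bound the loop for the verifier */
  } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));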

Loops that are seemingly infinite from the verifier's point of view are
annotated with the cond_break_label macro to return an error instead.
Only up to 1024 CPUs (NR_CPUS) are supported.
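
For example, the wait-for-unlock spin in the slow path below passes a
label to jump to when the loop budget runs out, and the function then
returns the error (ret is initialized to -ETIMEDOUT):

  int ret = -ETIMEDOUT;
  ...
  /* We're pending, wait for the owner to go away. */
  if (val & _Q_LOCKED_MASK)
          smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
  ...
  release_err:
          return ret;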

We need to allocate 1025 qnode rows, one more than the 1024 supported
CPUs, since if libbpf maps the qnodes global variable starting at the
first page of the arena and the lower 32 bits of the base address are
zero, the first node for CPU 0 becomes indistinguishable from a NULL
pointer, leading to spurious timeouts and failures.
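
For reference, this is what the layout and indexing in the patch below
look like; the + 1 skips row 0 so that, even if qnodes is placed at the
very start of the arena, CPU 0's node does not coincide with lower
32-bit address 0:

  struct arena_qnode __arena qnodes[_Q_MAX_CPUS + 1][_Q_MAX_NODES];

  /* In the slow path: per-CPU base node, note the + 1. */
  node0 = &(qnodes[bpf_get_smp_processor_id() + 1])[0].mcs;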

Note that the slow path is a global function, hence the verifier does
not know its return value precisely. The recommended usage is to always
test the return value against zero for success, and not ret < 0 for
error, as the verifier would otherwise assume the ret > 0 case has not
been accounted for.
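
A minimal usage sketch following that rule (the surrounding function
and error handling here are illustrative, not part of the patch):

  /* Illustrative caller; lock points to a lock word in BPF arena memory. */
  static int do_work(arena_spinlock_t __arena *lock)
  {
          if (arena_spin_lock(lock))      /* test against zero, not ret < 0 */
                  return -1;              /* lock not taken, bail out */
          /* ... critical section ... */
          arena_spin_unlock(lock);
          return 0;
  }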

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 .../selftests/bpf/bpf_arena_spin_lock.h       | 505 ++++++++++++++++++
 tools/testing/selftests/bpf/bpf_atomic.h      | 132 +++++
 2 files changed, 637 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/bpf_arena_spin_lock.h
 create mode 100644 tools/testing/selftests/bpf/bpf_atomic.h

Comments

Alexei Starovoitov March 5, 2025, 2:27 a.m. UTC | #1
On Tue, Mar 4, 2025 at 5:19 PM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> Implement queued spin lock algorithm as BPF program for lock words
> living in BPF arena.
>
> The algorithm is copied from kernel/locking/qspinlock.c and adapted for
> BPF use.
>
> We first implement abstract helpers for portable atomics and
> acquire/release load instructions, by relying on X86_64 presence to
> elide expensive barriers and rely on implementation details of the JIT,
> and fall back to slow but correct implementations elsewhere. When
> support for acquire/release load/stores lands, we can improve this
> state.
>
> Then, the qspinlock algorithm is adapted to remove dependence on
> multi-word atomics due to lack of support in BPF ISA. For instance,
> xchg_tail cannot use a 16-bit xchg, and needs to be implemented as a
> 32-bit try_cmpxchg loop.
>
> Loops which are seemingly infinite from verifier PoV are annotated with
> cond_break_label macro to return an error. Only 1024 NR_CPUs are
> supported.
>
> We need to allocate 1025 qnodes, one more than NR_CPUs, since if libbpf
> maps the qnode global variable starting at the first page of the arena,
> and the lower 32-bits are zeroed for the base address, the first node
> for CPU 0 will become indistinguishable from a NULL pointer, leading to
> spurious timeouts and failures.
>
> Note that the slow path is a global function, hence the verifier doesn't
> know the return value's precision. The recommended way of usage is to
> always test against zero for success, and not ret < 0 for error, as the
> verifier would assume ret > 0 has not been accounted for.
>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>  .../selftests/bpf/bpf_arena_spin_lock.h       | 505 ++++++++++++++++++
>  tools/testing/selftests/bpf/bpf_atomic.h      | 132 +++++
>  2 files changed, 637 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/bpf_arena_spin_lock.h
>  create mode 100644 tools/testing/selftests/bpf/bpf_atomic.h
>
> diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
> new file mode 100644
> index 000000000000..cc7de78e0373
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
> @@ -0,0 +1,505 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
> +#ifndef BPF_ARENA_SPIN_LOCK_H
> +#define BPF_ARENA_SPIN_LOCK_H
> +
> +#include <vmlinux.h>
> +#include <bpf/bpf_helpers.h>
> +#include "bpf_atomic.h"
> +
> +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
> +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
> +
> +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
> +
> +#define EBUSY 16
> +#define EOPNOTSUPP 95
> +#define ETIMEDOUT 110
> +
> +#ifndef __arena
> +#define __arena __attribute__((address_space(1)))
> +#endif
> +
> +extern unsigned long CONFIG_NR_CPUS __kconfig;
> +
> +#define arena_spinlock_t struct qspinlock
> +/* FIXME: Using typedef causes CO-RE relocation error */
> +/* typedef struct qspinlock arena_spinlock_t; */
> +
> +struct arena_mcs_spinlock {
> +       struct arena_mcs_spinlock __arena *next;
> +       int locked;
> +       int count;
> +};
> +
> +struct arena_qnode {
> +       struct arena_mcs_spinlock mcs;
> +};
> +
> +#define _Q_MAX_NODES           4
> +#define _Q_PENDING_LOOPS       1
> +
> +/*
> + * Bitfields in the atomic value:
> + *
> + *  0- 7: locked byte
> + *     8: pending
> + *  9-15: not used
> + * 16-17: tail index
> + * 18-31: tail cpu (+1)
> + */
> +#define _Q_MAX_CPUS            1024
> +
> +#define        _Q_SET_MASK(type)       (((1U << _Q_ ## type ## _BITS) - 1)\
> +                                     << _Q_ ## type ## _OFFSET)
> +#define _Q_LOCKED_OFFSET       0
> +#define _Q_LOCKED_BITS         8
> +#define _Q_LOCKED_MASK         _Q_SET_MASK(LOCKED)
> +
> +#define _Q_PENDING_OFFSET      (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
> +#define _Q_PENDING_BITS                8
> +#define _Q_PENDING_MASK                _Q_SET_MASK(PENDING)
> +
> +#define _Q_TAIL_IDX_OFFSET     (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
> +#define _Q_TAIL_IDX_BITS       2
> +#define _Q_TAIL_IDX_MASK       _Q_SET_MASK(TAIL_IDX)
> +
> +#define _Q_TAIL_CPU_OFFSET     (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
> +#define _Q_TAIL_CPU_BITS       (32 - _Q_TAIL_CPU_OFFSET)
> +#define _Q_TAIL_CPU_MASK       _Q_SET_MASK(TAIL_CPU)
> +
> +#define _Q_TAIL_OFFSET         _Q_TAIL_IDX_OFFSET
> +#define _Q_TAIL_MASK           (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
> +
> +#define _Q_LOCKED_VAL          (1U << _Q_LOCKED_OFFSET)
> +#define _Q_PENDING_VAL         (1U << _Q_PENDING_OFFSET)
> +
> +#define likely(x) __builtin_expect(!!(x), 1)
> +#define unlikely(x) __builtin_expect(!!(x), 0)
> +
> +/*
> + * We always index with + 1, in case we unfortunately place the qnodes at
> + * pg_offset=0 and then CPU 0's qnodes is indistinguishable from NULL if lower
> + * 32-bits of the node pointer are 0.
> + */
> +struct arena_qnode __arena qnodes[_Q_MAX_CPUS + 1][_Q_MAX_NODES];
> +
> +static inline u32 encode_tail(int cpu, int idx)
> +{
> +       u32 tail;
> +
> +       tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
> +       tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
> +
> +       return tail;
> +}
> +
> +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
> +{
> +       u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
> +       u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
> +
> +       /* See comments over definition of qnodes for the + 1. */
> +       if (likely(idx < _Q_MAX_NODES && cpu < _Q_MAX_CPUS))
> +               return &qnodes[cpu + 1][idx].mcs;
> +       bpf_printk("RUNTIME ERROR: %s idx=%u and cpu=%u are out-of-bounds!!!", __func__, idx, cpu);

Is it really possible? The 1024 check is done early and
idx cannot be out of bounds.
So only to detect corruption?

> +       return NULL;
> +}
> +
> +static inline
> +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
> +{
> +       return &((struct arena_qnode __arena *)base + idx)->mcs;
> +}
> +
> +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
> +
> +/**
> + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> + * @lock : Pointer to queued spinlock structure
> + * @tail : The new queue tail code word
> + * Return: The previous queue tail code word
> + *
> + * xchg(lock, tail)
> + *
> + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> + */
> +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
> +{
> +       u32 old, new;
> +
> +       old = atomic_read(&lock->val);
> +       do {
> +               new = (old & _Q_LOCKED_PENDING_MASK) | tail;
> +               /*
> +                * We can use relaxed semantics since the caller ensures that
> +                * the MCS node is properly initialized before updating the
> +                * tail.
> +                */
> +               /* These loops are not expected to stall, but we still need to
> +                * prove to the verifier they will terminate eventually.
> +                */
> +               cond_break_label(out);
> +       } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
> +
> +       return old;
> +out:
> +       bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
> +       return old;
> +}
> +
> +/**
> + * clear_pending - clear the pending bit.
> + * @lock: Pointer to queued spinlock structure
> + *
> + * *,1,* -> *,0,*
> + */
> +static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
> +{
> +       WRITE_ONCE(lock->pending, 0);
> +}
> +
> +/**
> + * clear_pending_set_locked - take ownership and clear the pending bit.
> + * @lock: Pointer to queued spinlock structure
> + *
> + * *,1,0 -> *,0,1
> + *
> + * Lock stealing is not allowed if this function is used.
> + */
> +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
> +{
> +       WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
> +}
> +
> +/**
> + * set_locked - Set the lock bit and own the lock
> + * @lock: Pointer to queued spinlock structure
> + *
> + * *,*,0 -> *,0,1
> + */
> +static __always_inline void set_locked(arena_spinlock_t __arena *lock)
> +{
> +       WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
> +}
> +
> +static __always_inline
> +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
> +{
> +       u32 old, new;
> +
> +       old = atomic_read(&lock->val);
> +       do {
> +               new = old | _Q_PENDING_VAL;
> +               /*
> +                * These loops are not expected to stall, but we still need to
> +                * prove to the verifier they will terminate eventually.
> +                */
> +               cond_break_label(out);
> +       } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));
> +
> +       return old;
> +out:
> +       bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
> +       return old;
> +}
> +
> +/**
> + * arena_spin_trylock - try to acquire the queued spinlock
> + * @lock : Pointer to queued spinlock structure
> + * Return: 1 if lock acquired, 0 if failed
> + */
> +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
> +{
> +       int val = atomic_read(&lock->val);
> +
> +       if (unlikely(val))
> +               return 0;
> +
> +       return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
> +}
> +
> +__noinline
> +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
> +{
> +       struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
> +       int ret = -ETIMEDOUT;
> +       u32 old, tail;
> +       int idx;
> +
> +       /*
> +        * Wait for in-progress pending->locked hand-overs with a bounded
> +        * number of spins so that we guarantee forward progress.
> +        *
> +        * 0,1,0 -> 0,0,1
> +        */
> +       if (val == _Q_PENDING_VAL) {
> +               int cnt = _Q_PENDING_LOOPS;
> +               val = atomic_cond_read_relaxed_label(&lock->val,
> +                                                    (VAL != _Q_PENDING_VAL) || !cnt--,
> +                                                    release_err);
> +       }
> +
> +       /*
> +        * If we observe any contention; queue.
> +        */
> +       if (val & ~_Q_LOCKED_MASK)
> +               goto queue;
> +
> +       /*
> +        * trylock || pending
> +        *
> +        * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
> +        */
> +       val = arena_fetch_set_pending_acquire(lock);
> +
> +       /*
> +        * If we observe contention, there is a concurrent locker.
> +        *
> +        * Undo and queue; our setting of PENDING might have made the
> +        * n,0,0 -> 0,0,0 transition fail and it will now be waiting
> +        * on @next to become !NULL.
> +        */
> +       if (unlikely(val & ~_Q_LOCKED_MASK)) {
> +
> +               /* Undo PENDING if we set it. */
> +               if (!(val & _Q_PENDING_MASK))
> +                       clear_pending(lock);
> +
> +               goto queue;
> +       }
> +
> +       /*
> +        * We're pending, wait for the owner to go away.
> +        *
> +        * 0,1,1 -> *,1,0
> +        *
> +        * this wait loop must be a load-acquire such that we match the
> +        * store-release that clears the locked bit and create lock
> +        * sequentiality; this is because not all
> +        * clear_pending_set_locked() implementations imply full
> +        * barriers.
> +        */
> +       if (val & _Q_LOCKED_MASK)
> +               smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
> +
> +       /*
> +        * take ownership and clear the pending bit.
> +        *
> +        * 0,1,0 -> 0,0,1
> +        */
> +       clear_pending_set_locked(lock);
> +       return 0;
> +
> +       /*
> +        * End of pending bit optimistic spinning and beginning of MCS
> +        * queuing.
> +        */
> +queue:
> +       /* See comments over definition of qnodes for the + 1. */
> +       node0 = &(qnodes[bpf_get_smp_processor_id() + 1])[0].mcs;
> +       idx = node0->count++;
> +       tail = encode_tail(bpf_get_smp_processor_id(), idx);
> +
> +       /*
> +        * 4 nodes are allocated based on the assumption that there will not be
> +        * nested NMIs taking spinlocks. That may not be true in some
> +        * architectures even though the chance of needing more than 4 nodes
> +        * will still be extremely unlikely. When that happens, we simply return
> +        * an error. Original qspinlock has a trylock fallback in this case.
> +        */
> +       if (unlikely(idx >= _Q_MAX_NODES)) {
> +               ret = -EBUSY;
> +               goto release_node_err;
> +       }
> +
> +       node = grab_mcs_node(node0, idx);
> +
> +       /*
> +        * Ensure that we increment the head node->count before initialising
> +        * the actual node. If the compiler is kind enough to reorder these
> +        * stores, then an IRQ could overwrite our assignments.
> +        */
> +       barrier();
> +
> +       node->locked = 0;
> +       node->next = NULL;
> +
> +       /*
> +        * We touched a (possibly) cold cacheline in the per-cpu queue node;
> +        * attempt the trylock once more in the hope someone let go while we
> +        * weren't watching.
> +        */
> +       if (arena_spin_trylock(lock))
> +               goto release;
> +
> +       /*
> +        * Ensure that the initialisation of @node is complete before we
> +        * publish the updated tail via xchg_tail() and potentially link
> +        * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
> +        */
> +       smp_wmb();
> +
> +       /*
> +        * Publish the updated tail.
> +        * We have already touched the queueing cacheline; don't bother with
> +        * pending stuff.
> +        *
> +        * p,*,* -> n,*,*
> +        */
> +       old = xchg_tail(lock, tail);
> +       next = NULL;
> +
> +       /*
> +        * if there was a previous node; link it and wait until reaching the
> +        * head of the waitqueue.
> +        */
> +       if (old & _Q_TAIL_MASK) {
> +               prev = decode_tail(old);
> +
> +               /* Link @node into the waitqueue. */
> +               WRITE_ONCE(prev->next, node);
> +
> +               arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
> +
> +               /*
> +                * While waiting for the MCS lock, the next pointer may have
> +                * been set by another lock waiter. We cannot prefetch here
> +                * due to lack of equivalent instruction in BPF ISA.
> +                */
> +               next = READ_ONCE(node->next);
> +       }
> +
> +       /*
> +        * we're at the head of the waitqueue, wait for the owner & pending to
> +        * go away.
> +        *
> +        * *,x,y -> *,0,0
> +        *
> +        * this wait loop must use a load-acquire such that we match the
> +        * store-release that clears the locked bit and create lock
> +        * sequentiality; this is because the set_locked() function below
> +        * does not imply a full barrier.
> +        */
> +       val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
> +                                            release_node_err);
> +
> +       /*
> +        * claim the lock:
> +        *
> +        * n,0,0 -> 0,0,1 : lock, uncontended
> +        * *,*,0 -> *,*,1 : lock, contended
> +        *
> +        * If the queue head is the only one in the queue (lock value == tail)
> +        * and nobody is pending, clear the tail code and grab the lock.
> +        * Otherwise, we only need to grab the lock.
> +        */
> +
> +       /*
> +        * In the PV case we might already have _Q_LOCKED_VAL set, because
> +        * of lock stealing; therefore we must also allow:
> +        *
> +        * n,0,1 -> 0,0,1
> +        *
> +        * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
> +        *       above wait condition, therefore any concurrent setting of
> +        *       PENDING will make the uncontended transition fail.
> +        */
> +       if ((val & _Q_TAIL_MASK) == tail) {
> +               if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
> +                       goto release; /* No contention */
> +       }
> +
> +       /*
> +        * Either somebody is queued behind us or _Q_PENDING_VAL got set
> +        * which will then detect the remaining tail and queue behind us
> +        * ensuring we'll see a @next.
> +        */
> +       set_locked(lock);
> +
> +       /*
> +        * contended path; wait for next if not observed yet, release.
> +        */
> +       if (!next)
> +               next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);
> +
> +       arch_mcs_spin_unlock_contended(&next->locked);
> +
> +release:;
> +       /*
> +        * release the node
> +        *
> +        * Doing a normal dec vs this_cpu_dec is fine. An upper context always
> +        * decrements count it incremented before returning, thus we're fine.
> +        * For contexts interrupting us, they either observe our dec or not.
> +        * Just ensure the compiler doesn't reorder this statement, as a
> +        * this_cpu_dec implicitly implied that.
> +        */
> +       barrier();
> +       node0->count--;
> +       return 0;
> +release_node_err:
> +       barrier();
> +       node0->count--;
> +       goto release_err;
> +release_err:
> +       return ret;
> +}
> +
> +/**
> + * arena_spin_lock - acquire a queued spinlock
> + * @lock: Pointer to queued spinlock structure
> + *
> + * The return value _must_ be tested against zero for success.
> + * On error, returned value will be negative.
> + */
> +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
> +{
> +       int val = 0;
> +
> +       if (CONFIG_NR_CPUS > 1024)
> +               return -EOPNOTSUPP;
> +
> +       bpf_preempt_disable();
> +       if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
> +               return 0;
> +
> +       val = arena_spin_lock_slowpath(lock, val);
> +       /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
> +       if (val)
> +               bpf_preempt_enable();
> +       return val;
> +}
> +
> +/**
> + * arena_spin_unlock - release a queued spinlock
> + * @lock : Pointer to queued spinlock structure
> + */
> +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
> +{
> +       /*
> +        * unlock() needs release semantics:
> +        */
> +       smp_store_release(&lock->locked, 0);
> +       bpf_preempt_enable();
> +}
> +
> +#define arena_spin_lock_irqsave(lock, flags)             \
> +       ({                                               \
> +               int __ret;                               \
> +               bpf_local_irq_save(&(flags));            \
> +               __ret = arena_spin_lock((lock));         \
> +               if (__ret)                               \
> +                       bpf_local_irq_restore(&(flags)); \
> +               (__ret);                                 \
> +       })
> +
> +#define arena_spin_unlock_irqrestore(lock, flags) \
> +       ({                                        \
> +               arena_spin_unlock((lock));        \
> +               bpf_local_irq_restore(&(flags));  \
> +       })
> +
> +#endif
> +
> +#endif /* BPF_ARENA_SPIN_LOCK_H */
> diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h
> new file mode 100644
> index 000000000000..06defb9e050d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/bpf_atomic.h
> @@ -0,0 +1,132 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
> +#ifndef BPF_ATOMIC_H
> +#define BPF_ATOMIC_H
> +
> +#include <vmlinux.h>
> +#include <bpf/bpf_helpers.h>
> +#include "bpf_experimental.h"
> +
> +extern bool CONFIG_X86_64 __kconfig __weak;
> +
> +#define __scalar_type_to_expr_cases(type) \
> +       unsigned type : (unsigned type)0, signed type : (signed type)0
> +/*
> + * This is lifted from __unqual_scalar_typeof in the kernel (which is used to
> + * lose const qualifier etc.), but adapted to also cover pointers. It is
> + * necessary because we ascertain type to create local variables in macros
> + * below, but for pointers with __arena tag, we'll ascertain the underlying type
> + * with the tag, causing a compilation error (as local variables that are not
> + * pointers may not have __arena tag). This trick allows losing the qualifier.
> + */

I bet that's what's causing llvm to miss the cases where it needs
to emit cast_user(), or more likely it forces llvm to emit cast_kern()
where it shouldn't be doing it.
Not sure which pointer you saw as indistinguishable from NULL.
I suspect it's smp_cond_load_relaxed_label(&node->next, ..
which is doing:
__unqual_typeof(*(p)) VAL;
(__unqual_typeof(*(p)))READ_ONCE(*__ptr);

and llvm will insert cast_kern() there,
so if (VAL) always sees the upper 32 bits as zero.

So I suspect it's not a zero page issue.

> +#define __unqual_typeof(x)                              \
> +       typeof(_Generic((x),                            \
> +               char: (char)0,                          \
> +               __scalar_type_to_expr_cases(char),      \
> +               __scalar_type_to_expr_cases(short),     \
> +               __scalar_type_to_expr_cases(int),       \
> +               __scalar_type_to_expr_cases(long),      \
> +               __scalar_type_to_expr_cases(long long), \
> +               default: (void *)0))
> +
> +/* No-op for BPF */
> +#define cpu_relax() ({})
> +
> +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
> +
> +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val))
> +
> +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new)
> +
> +#define try_cmpxchg(p, pold, new)                                 \
> +       ({                                                        \
> +               __unqual_typeof(*(pold)) __o = *(pold);           \
> +               __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \
> +               if (__r != __o)                                   \
> +                       *(pold) = __r;                            \
> +               __r == __o;                                       \
> +       })
> +
> +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new)
> +
> +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new)
> +
> +#define smp_mb()                                 \
> +       ({                                       \
> +               unsigned long __val;             \
> +               __sync_fetch_and_add(&__val, 0); \
> +       })
> +
> +#define smp_rmb()                   \
> +       ({                          \
> +               if (!CONFIG_X86_64) \
> +                       smp_mb();   \
> +               else                \
> +                       barrier();  \
> +       })
> +
> +#define smp_wmb()                   \
> +       ({                          \
> +               if (!CONFIG_X86_64) \
> +                       smp_mb();   \
> +               else                \
> +                       barrier();  \
> +       })
> +
> +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */
> +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); })
> +
> +#define smp_load_acquire(p)                                  \
> +       ({                                                   \
> +               __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \
> +               if (!CONFIG_X86_64)                          \
> +                       smp_mb();                            \
> +               barrier();                                   \
> +               __v;                                         \
> +       })
> +
> +#define smp_store_release(p, val)      \
> +       ({                             \
> +               if (!CONFIG_X86_64)    \
> +                       smp_mb();      \
> +               barrier();             \
> +               WRITE_ONCE(*(p), val); \
> +       })
> +
> +#define smp_cond_load_relaxed_label(p, cond_expr, label)                \
> +       ({                                                              \
> +               typeof(p) __ptr = (p);                                  \
> +               __unqual_typeof(*(p)) VAL;                              \
> +               for (;;) {                                              \
> +                       VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \
> +                       if (cond_expr)                                  \
> +                               break;                                  \
> +                       cond_break_label(label);                        \
> +                       cpu_relax();                                    \
> +               }                                                       \
> +               (typeof(*(p)))VAL;                                      \
> +       })
> +
> +#define smp_cond_load_acquire_label(p, cond_expr, label)                  \
> +       ({                                                                \
> +               __unqual_typeof(*p) __val =                               \
> +                       smp_cond_load_relaxed_label(p, cond_expr, label); \
> +               smp_acquire__after_ctrl_dep();                            \
> +               (typeof(*(p)))__val;                                      \
> +       })
> +
> +#define atomic_read(p) READ_ONCE((p)->counter)
> +
> +#define atomic_cond_read_relaxed_label(p, cond_expr, label) \
> +       smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label)
> +
> +#define atomic_cond_read_acquire_label(p, cond_expr, label) \
> +       smp_cond_load_acquire_label(&(p)->counter, cond_expr, label)
> +
> +#define atomic_try_cmpxchg_relaxed(p, pold, new) \
> +       try_cmpxchg_relaxed(&(p)->counter, pold, new)
> +
> +#define atomic_try_cmpxchg_acquire(p, pold, new) \
> +       try_cmpxchg_acquire(&(p)->counter, pold, new)
> +
> +#endif /* BPF_ATOMIC_H */
> --
> 2.47.1
>
Kumar Kartikeya Dwivedi March 5, 2025, 3:14 a.m. UTC | #2
On Wed, 5 Mar 2025 at 03:27, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 4, 2025 at 5:19 PM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
> >
> > Implement queued spin lock algorithm as BPF program for lock words
> > living in BPF arena.
> >
> > The algorithm is copied from kernel/locking/qspinlock.c and adapted for
> > BPF use.
> >
> > We first implement abstract helpers for portable atomics and
> > acquire/release load instructions, by relying on X86_64 presence to
> > elide expensive barriers and rely on implementation details of the JIT,
> > and fall back to slow but correct implementations elsewhere. When
> > support for acquire/release load/stores lands, we can improve this
> > state.
> >
> > Then, the qspinlock algorithm is adapted to remove dependence on
> > multi-word atomics due to lack of support in BPF ISA. For instance,
> > xchg_tail cannot use a 16-bit xchg, and needs to be implemented as a
> > 32-bit try_cmpxchg loop.
> >
> > Loops which are seemingly infinite from verifier PoV are annotated with
> > cond_break_label macro to return an error. Only 1024 NR_CPUs are
> > supported.
> >
> > We need to allocate 1025 qnodes, one more than NR_CPUs, since if libbpf
> > maps the qnode global variable starting at the first page of the arena,
> > and the lower 32-bits are zeroed for the base address, the first node
> > for CPU 0 will become indistinguishable from a NULL pointer, leading to
> > spurious timeouts and failures.
> >
> > Note that the slow path is a global function, hence the verifier doesn't
> > know the return value's precision. The recommended way of usage is to
> > always test against zero for success, and not ret < 0 for error, as the
> > verifier would assume ret > 0 has not been accounted for.
> >
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > ---
> >  .../selftests/bpf/bpf_arena_spin_lock.h       | 505 ++++++++++++++++++
> >  tools/testing/selftests/bpf/bpf_atomic.h      | 132 +++++
> >  2 files changed, 637 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/bpf_arena_spin_lock.h
> >  create mode 100644 tools/testing/selftests/bpf/bpf_atomic.h
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
> > new file mode 100644
> > index 000000000000..cc7de78e0373
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
> > @@ -0,0 +1,505 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
> > +#ifndef BPF_ARENA_SPIN_LOCK_H
> > +#define BPF_ARENA_SPIN_LOCK_H
> > +
> > +#include <vmlinux.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include "bpf_atomic.h"
> > +
> > +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
> > +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
> > +
> > +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
> > +
> > +#define EBUSY 16
> > +#define EOPNOTSUPP 95
> > +#define ETIMEDOUT 110
> > +
> > +#ifndef __arena
> > +#define __arena __attribute__((address_space(1)))
> > +#endif
> > +
> > +extern unsigned long CONFIG_NR_CPUS __kconfig;
> > +
> > +#define arena_spinlock_t struct qspinlock
> > +/* FIXME: Using typedef causes CO-RE relocation error */
> > +/* typedef struct qspinlock arena_spinlock_t; */
> > +
> > +struct arena_mcs_spinlock {
> > +       struct arena_mcs_spinlock __arena *next;
> > +       int locked;
> > +       int count;
> > +};
> > +
> > +struct arena_qnode {
> > +       struct arena_mcs_spinlock mcs;
> > +};
> > +
> > +#define _Q_MAX_NODES           4
> > +#define _Q_PENDING_LOOPS       1
> > +
> > +/*
> > + * Bitfields in the atomic value:
> > + *
> > + *  0- 7: locked byte
> > + *     8: pending
> > + *  9-15: not used
> > + * 16-17: tail index
> > + * 18-31: tail cpu (+1)
> > + */
> > +#define _Q_MAX_CPUS            1024
> > +
> > +#define        _Q_SET_MASK(type)       (((1U << _Q_ ## type ## _BITS) - 1)\
> > +                                     << _Q_ ## type ## _OFFSET)
> > +#define _Q_LOCKED_OFFSET       0
> > +#define _Q_LOCKED_BITS         8
> > +#define _Q_LOCKED_MASK         _Q_SET_MASK(LOCKED)
> > +
> > +#define _Q_PENDING_OFFSET      (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
> > +#define _Q_PENDING_BITS                8
> > +#define _Q_PENDING_MASK                _Q_SET_MASK(PENDING)
> > +
> > +#define _Q_TAIL_IDX_OFFSET     (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
> > +#define _Q_TAIL_IDX_BITS       2
> > +#define _Q_TAIL_IDX_MASK       _Q_SET_MASK(TAIL_IDX)
> > +
> > +#define _Q_TAIL_CPU_OFFSET     (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
> > +#define _Q_TAIL_CPU_BITS       (32 - _Q_TAIL_CPU_OFFSET)
> > +#define _Q_TAIL_CPU_MASK       _Q_SET_MASK(TAIL_CPU)
> > +
> > +#define _Q_TAIL_OFFSET         _Q_TAIL_IDX_OFFSET
> > +#define _Q_TAIL_MASK           (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
> > +
> > +#define _Q_LOCKED_VAL          (1U << _Q_LOCKED_OFFSET)
> > +#define _Q_PENDING_VAL         (1U << _Q_PENDING_OFFSET)
> > +
> > +#define likely(x) __builtin_expect(!!(x), 1)
> > +#define unlikely(x) __builtin_expect(!!(x), 0)
> > +
> > +/*
> > + * We always index with + 1, in case we unfortunately place the qnodes at
> > + * pg_offset=0 and then CPU 0's qnodes is indistinguishable from NULL if lower
> > + * 32-bits of the node pointer are 0.
> > + */
> > +struct arena_qnode __arena qnodes[_Q_MAX_CPUS + 1][_Q_MAX_NODES];
> > +
> > +static inline u32 encode_tail(int cpu, int idx)
> > +{
> > +       u32 tail;
> > +
> > +       tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
> > +       tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
> > +
> > +       return tail;
> > +}
> > +
> > +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
> > +{
> > +       u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
> > +       u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
> > +
> > +       /* See comments over definition of qnodes for the + 1. */
> > +       if (likely(idx < _Q_MAX_NODES && cpu < _Q_MAX_CPUS))
> > +               return &qnodes[cpu + 1][idx].mcs;
> > +       bpf_printk("RUNTIME ERROR: %s idx=%u and cpu=%u are out-of-bounds!!!", __func__, idx, cpu);
>
> Is it really possible? 1024 check is done early and
> idx cannot be out of bounds.
> So only to detect corruption?

Yeah, shouldn't be possible unless corruption happens, but I can drop it.

>
> > +       return NULL;
> > +}
> > +
> > +static inline
> > +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
> > +{
> > +       return &((struct arena_qnode __arena *)base + idx)->mcs;
> > +}
> > +
> > +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
> > +
> > +/**
> > + * xchg_tail - Put in the new queue tail code word & retrieve previous one
> > + * @lock : Pointer to queued spinlock structure
> > + * @tail : The new queue tail code word
> > + * Return: The previous queue tail code word
> > + *
> > + * xchg(lock, tail)
> > + *
> > + * p,*,* -> n,*,* ; prev = xchg(lock, node)
> > + */
> > +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
> > +{
> > +       u32 old, new;
> > +
> > +       old = atomic_read(&lock->val);
> > +       do {
> > +               new = (old & _Q_LOCKED_PENDING_MASK) | tail;
> > +               /*
> > +                * We can use relaxed semantics since the caller ensures that
> > +                * the MCS node is properly initialized before updating the
> > +                * tail.
> > +                */
> > +               /* These loops are not expected to stall, but we still need to
> > +                * prove to the verifier they will terminate eventually.
> > +                */
> > +               cond_break_label(out);
> > +       } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
> > +
> > +       return old;
> > +out:
> > +       bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
> > +       return old;
> > +}
> > +
> > +/**
> > + * clear_pending - clear the pending bit.
> > + * @lock: Pointer to queued spinlock structure
> > + *
> > + * *,1,* -> *,0,*
> > + */
> > +static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
> > +{
> > +       WRITE_ONCE(lock->pending, 0);
> > +}
> > +
> > +/**
> > + * clear_pending_set_locked - take ownership and clear the pending bit.
> > + * @lock: Pointer to queued spinlock structure
> > + *
> > + * *,1,0 -> *,0,1
> > + *
> > + * Lock stealing is not allowed if this function is used.
> > + */
> > +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
> > +{
> > +       WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
> > +}
> > +
> > +/**
> > + * set_locked - Set the lock bit and own the lock
> > + * @lock: Pointer to queued spinlock structure
> > + *
> > + * *,*,0 -> *,0,1
> > + */
> > +static __always_inline void set_locked(arena_spinlock_t __arena *lock)
> > +{
> > +       WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
> > +}
> > +
> > +static __always_inline
> > +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
> > +{
> > +       u32 old, new;
> > +
> > +       old = atomic_read(&lock->val);
> > +       do {
> > +               new = old | _Q_PENDING_VAL;
> > +               /*
> > +                * These loops are not expected to stall, but we still need to
> > +                * prove to the verifier they will terminate eventually.
> > +                */
> > +               cond_break_label(out);
> > +       } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));
> > +
> > +       return old;
> > +out:
> > +       bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
> > +       return old;
> > +}
> > +
> > +/**
> > + * arena_spin_trylock - try to acquire the queued spinlock
> > + * @lock : Pointer to queued spinlock structure
> > + * Return: 1 if lock acquired, 0 if failed
> > + */
> > +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
> > +{
> > +       int val = atomic_read(&lock->val);
> > +
> > +       if (unlikely(val))
> > +               return 0;
> > +
> > +       return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
> > +}
> > +
> > +__noinline
> > +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
> > +{
> > +       struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
> > +       int ret = -ETIMEDOUT;
> > +       u32 old, tail;
> > +       int idx;
> > +
> > +       /*
> > +        * Wait for in-progress pending->locked hand-overs with a bounded
> > +        * number of spins so that we guarantee forward progress.
> > +        *
> > +        * 0,1,0 -> 0,0,1
> > +        */
> > +       if (val == _Q_PENDING_VAL) {
> > +               int cnt = _Q_PENDING_LOOPS;
> > +               val = atomic_cond_read_relaxed_label(&lock->val,
> > +                                                    (VAL != _Q_PENDING_VAL) || !cnt--,
> > +                                                    release_err);
> > +       }
> > +
> > +       /*
> > +        * If we observe any contention; queue.
> > +        */
> > +       if (val & ~_Q_LOCKED_MASK)
> > +               goto queue;
> > +
> > +       /*
> > +        * trylock || pending
> > +        *
> > +        * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
> > +        */
> > +       val = arena_fetch_set_pending_acquire(lock);
> > +
> > +       /*
> > +        * If we observe contention, there is a concurrent locker.
> > +        *
> > +        * Undo and queue; our setting of PENDING might have made the
> > +        * n,0,0 -> 0,0,0 transition fail and it will now be waiting
> > +        * on @next to become !NULL.
> > +        */
> > +       if (unlikely(val & ~_Q_LOCKED_MASK)) {
> > +
> > +               /* Undo PENDING if we set it. */
> > +               if (!(val & _Q_PENDING_MASK))
> > +                       clear_pending(lock);
> > +
> > +               goto queue;
> > +       }
> > +
> > +       /*
> > +        * We're pending, wait for the owner to go away.
> > +        *
> > +        * 0,1,1 -> *,1,0
> > +        *
> > +        * this wait loop must be a load-acquire such that we match the
> > +        * store-release that clears the locked bit and create lock
> > +        * sequentiality; this is because not all
> > +        * clear_pending_set_locked() implementations imply full
> > +        * barriers.
> > +        */
> > +       if (val & _Q_LOCKED_MASK)
> > +               smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
> > +
> > +       /*
> > +        * take ownership and clear the pending bit.
> > +        *
> > +        * 0,1,0 -> 0,0,1
> > +        */
> > +       clear_pending_set_locked(lock);
> > +       return 0;
> > +
> > +       /*
> > +        * End of pending bit optimistic spinning and beginning of MCS
> > +        * queuing.
> > +        */
> > +queue:
> > +       /* See comments over definition of qnodes for the + 1. */
> > +       node0 = &(qnodes[bpf_get_smp_processor_id() + 1])[0].mcs;
> > +       idx = node0->count++;
> > +       tail = encode_tail(bpf_get_smp_processor_id(), idx);
> > +
> > +       /*
> > +        * 4 nodes are allocated based on the assumption that there will not be
> > +        * nested NMIs taking spinlocks. That may not be true in some
> > +        * architectures even though the chance of needing more than 4 nodes
> > +        * will still be extremely unlikely. When that happens, we simply return
> > +        * an error. Original qspinlock has a trylock fallback in this case.
> > +        */
> > +       if (unlikely(idx >= _Q_MAX_NODES)) {
> > +               ret = -EBUSY;
> > +               goto release_node_err;
> > +       }
> > +
> > +       node = grab_mcs_node(node0, idx);
> > +
> > +       /*
> > +        * Ensure that we increment the head node->count before initialising
> > +        * the actual node. If the compiler is kind enough to reorder these
> > +        * stores, then an IRQ could overwrite our assignments.
> > +        */
> > +       barrier();
> > +
> > +       node->locked = 0;
> > +       node->next = NULL;
> > +
> > +       /*
> > +        * We touched a (possibly) cold cacheline in the per-cpu queue node;
> > +        * attempt the trylock once more in the hope someone let go while we
> > +        * weren't watching.
> > +        */
> > +       if (arena_spin_trylock(lock))
> > +               goto release;
> > +
> > +       /*
> > +        * Ensure that the initialisation of @node is complete before we
> > +        * publish the updated tail via xchg_tail() and potentially link
> > +        * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
> > +        */
> > +       smp_wmb();
> > +
> > +       /*
> > +        * Publish the updated tail.
> > +        * We have already touched the queueing cacheline; don't bother with
> > +        * pending stuff.
> > +        *
> > +        * p,*,* -> n,*,*
> > +        */
> > +       old = xchg_tail(lock, tail);
> > +       next = NULL;
> > +
> > +       /*
> > +        * if there was a previous node; link it and wait until reaching the
> > +        * head of the waitqueue.
> > +        */
> > +       if (old & _Q_TAIL_MASK) {
> > +               prev = decode_tail(old);
> > +
> > +               /* Link @node into the waitqueue. */
> > +               WRITE_ONCE(prev->next, node);
> > +
> > +               arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
> > +
> > +               /*
> > +                * While waiting for the MCS lock, the next pointer may have
> > +                * been set by another lock waiter. We cannot prefetch here
> > +                * due to lack of equivalent instruction in BPF ISA.
> > +                */
> > +               next = READ_ONCE(node->next);
> > +       }
> > +
> > +       /*
> > +        * we're at the head of the waitqueue, wait for the owner & pending to
> > +        * go away.
> > +        *
> > +        * *,x,y -> *,0,0
> > +        *
> > +        * this wait loop must use a load-acquire such that we match the
> > +        * store-release that clears the locked bit and create lock
> > +        * sequentiality; this is because the set_locked() function below
> > +        * does not imply a full barrier.
> > +        */
> > +       val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
> > +                                            release_node_err);
> > +
> > +       /*
> > +        * claim the lock:
> > +        *
> > +        * n,0,0 -> 0,0,1 : lock, uncontended
> > +        * *,*,0 -> *,*,1 : lock, contended
> > +        *
> > +        * If the queue head is the only one in the queue (lock value == tail)
> > +        * and nobody is pending, clear the tail code and grab the lock.
> > +        * Otherwise, we only need to grab the lock.
> > +        */
> > +
> > +       /*
> > +        * In the PV case we might already have _Q_LOCKED_VAL set, because
> > +        * of lock stealing; therefore we must also allow:
> > +        *
> > +        * n,0,1 -> 0,0,1
> > +        *
> > +        * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
> > +        *       above wait condition, therefore any concurrent setting of
> > +        *       PENDING will make the uncontended transition fail.
> > +        */
> > +       if ((val & _Q_TAIL_MASK) == tail) {
> > +               if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
> > +                       goto release; /* No contention */
> > +       }
> > +
> > +       /*
> > +        * Either somebody is queued behind us or _Q_PENDING_VAL got set
> > +        * which will then detect the remaining tail and queue behind us
> > +        * ensuring we'll see a @next.
> > +        */
> > +       set_locked(lock);
> > +
> > +       /*
> > +        * contended path; wait for next if not observed yet, release.
> > +        */
> > +       if (!next)
> > +               next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);
> > +
> > +       arch_mcs_spin_unlock_contended(&next->locked);
> > +
> > +release:;
> > +       /*
> > +        * release the node
> > +        *
> > +        * Doing a normal dec vs this_cpu_dec is fine. An upper context always
> > +        * decrements count it incremented before returning, thus we're fine.
> > +        * For contexts interrupting us, they either observe our dec or not.
> > +        * Just ensure the compiler doesn't reorder this statement, as a
> > +        * this_cpu_dec implicitly implied that.
> > +        */
> > +       barrier();
> > +       node0->count--;
> > +       return 0;
> > +release_node_err:
> > +       barrier();
> > +       node0->count--;
> > +       goto release_err;
> > +release_err:
> > +       return ret;
> > +}
> > +
> > +/**
> > + * arena_spin_lock - acquire a queued spinlock
> > + * @lock: Pointer to queued spinlock structure
> > + *
> > + * The return value _must_ be tested against zero for success.
> > + * On error, returned value will be negative.
> > + */
> > +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
> > +{
> > +       int val = 0;
> > +
> > +       if (CONFIG_NR_CPUS > 1024)
> > +               return -EOPNOTSUPP;
> > +
> > +       bpf_preempt_disable();
> > +       if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
> > +               return 0;
> > +
> > +       val = arena_spin_lock_slowpath(lock, val);
> > +       /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
> > +       if (val)
> > +               bpf_preempt_enable();
> > +       return val;
> > +}
> > +
> > +/**
> > + * arena_spin_unlock - release a queued spinlock
> > + * @lock : Pointer to queued spinlock structure
> > + */
> > +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
> > +{
> > +       /*
> > +        * unlock() needs release semantics:
> > +        */
> > +       smp_store_release(&lock->locked, 0);
> > +       bpf_preempt_enable();
> > +}
> > +
> > +#define arena_spin_lock_irqsave(lock, flags)             \
> > +       ({                                               \
> > +               int __ret;                               \
> > +               bpf_local_irq_save(&(flags));            \
> > +               __ret = arena_spin_lock((lock));         \
> > +               if (__ret)                               \
> > +                       bpf_local_irq_restore(&(flags)); \
> > +               (__ret);                                 \
> > +       })
> > +
> > +#define arena_spin_unlock_irqrestore(lock, flags) \
> > +       ({                                        \
> > +               arena_spin_unlock((lock));        \
> > +               bpf_local_irq_restore(&(flags));  \
> > +       })
> > +
> > +#endif
> > +
> > +#endif /* BPF_ARENA_SPIN_LOCK_H */
> > diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h
> > new file mode 100644
> > index 000000000000..06defb9e050d
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_atomic.h
> > @@ -0,0 +1,132 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
> > +#ifndef BPF_ATOMIC_H
> > +#define BPF_ATOMIC_H
> > +
> > +#include <vmlinux.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include "bpf_experimental.h"
> > +
> > +extern bool CONFIG_X86_64 __kconfig __weak;
> > +
> > +#define __scalar_type_to_expr_cases(type) \
> > +       unsigned type : (unsigned type)0, signed type : (signed type)0
> > +/*
> > + * This is lifted from __unqual_scalar_typeof in the kernel (which is used to
> > + * lose const qualifier etc.), but adapted to also cover pointers. It is
> > + * necessary because we ascertain type to create local variables in macros
> > + * below, but for pointers with __arena tag, we'll ascertain the underlying type
> > + * with the tag, causing a compilation error (as local variables that are not
> > + * pointers may not have __arena tag). This trick allows losing the qualifier.
> > + */
>
> I bet that's what's causing llvm to miss the cases where it needs
> to emit cast_user(), or more likely it forces llvm to emit cast_kern()
> where it shouldn't be doing it.
> Not sure which pointer you saw as indistinguishable from NULL.
> I suspect it's smp_cond_load_relaxed_label(&node->next, ..

Yes.

> which is doing:
> __unqual_typeof(*(p)) VAL;
> (__unqual_typeof(*(p)))READ_ONCE(*__ptr);
>
> and llvm will insert cast_kern() there,

Yes, I do see a r1 = addr_space_cast(r2, 0x0, 0x1).
r2 is node->next loaded from arena pointer 'node'.

But I can't understand why that's a problem.

If I do
for (;;) {
  next = READ_ONCE(node->next);
  if (next)
     break;
  cond_break_label(...);
}

instead of the macro, everything works ok.

But that's because LLVM didn't insert a cast, and the verifier sees
next as a scalar.
So if next is 0x100000000000, it will see 0x100000000000.
With cast_kern it only sees 0.
It will probably be cast once we try to write to next->locked later on.

I would gather there's a lot of other cases where someone dereferences
before doing some pointer equality comparison.
In that case we might end up in the same situation.
ptr = load_from_arena;
x = ptr->xyz;
if (ptr == ptr2) { ... }

The extra cast_kern is certainly causing this to surface, but I am not
sure whether it's something to fix in the macro.

> so if (VAL) always sees upper 32-bit as zero.
>
> So I suspect it's not a zero page issue.
>

When I bpf_printk the node address of the qnode of CPU 0, it is
0x100000000000 i.e. user_vm_start. This is the pointer that's misdetected.
So it appears to be on the first page.
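
For reference, hand-expanding smp_cond_load_relaxed_label() for p = &node->next
gives roughly the following (a simplified sketch; the exact address-space
qualifiers are elided):

/* VAL is declared with __unqual_typeof(*(p)); since node->next is a pointer,
 * the _Generic falls through to the default case and VAL becomes a plain
 * 'void *'. The cast below is what makes LLVM emit
 *   r1 = addr_space_cast(r2, 0x0, 0x1)
 * so the (VAL) check only observes the low 32 bits, which are zero for the
 * qnode sitting at user_vm_start (0x100000000000).
 */
typeof(&node->next) __ptr = &node->next;
void *VAL;

for (;;) {
  VAL = (void *)READ_ONCE(*__ptr);
  if (VAL)
    break;
  cond_break_label(release_node_err);
  cpu_relax();
}
next = (typeof(node->next))VAL;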


> > +#define __unqual_typeof(x)                              \
> > +       typeof(_Generic((x),                            \
> > +               char: (char)0,                          \
> > +               __scalar_type_to_expr_cases(char),      \
> > +               __scalar_type_to_expr_cases(short),     \
> > +               __scalar_type_to_expr_cases(int),       \
> > +               __scalar_type_to_expr_cases(long),      \
> > +               __scalar_type_to_expr_cases(long long), \
> > +               default: (void *)0))
> > +
> > +/* No-op for BPF */
> > +#define cpu_relax() ({})
> > +
> > +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
> > +
> > +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val))
> > +
> > +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new)
> > +
> > +#define try_cmpxchg(p, pold, new)                                 \
> > +       ({                                                        \
> > +               __unqual_typeof(*(pold)) __o = *(pold);           \
> > +               __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \
> > +               if (__r != __o)                                   \
> > +                       *(pold) = __r;                            \
> > +               __r == __o;                                       \
> > +       })
> > +
> > +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new)
> > +
> > +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new)
> > +
> > +#define smp_mb()                                 \
> > +       ({                                       \
> > +               unsigned long __val;             \
> > +               __sync_fetch_and_add(&__val, 0); \
> > +       })
> > +
> > +#define smp_rmb()                   \
> > +       ({                          \
> > +               if (!CONFIG_X86_64) \
> > +                       smp_mb();   \
> > +               else                \
> > +                       barrier();  \
> > +       })
> > +
> > +#define smp_wmb()                   \
> > +       ({                          \
> > +               if (!CONFIG_X86_64) \
> > +                       smp_mb();   \
> > +               else                \
> > +                       barrier();  \
> > +       })
> > +
> > +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */
> > +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); })
> > +
> > +#define smp_load_acquire(p)                                  \
> > +       ({                                                   \
> > +               __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \
> > +               if (!CONFIG_X86_64)                          \
> > +                       smp_mb();                            \
> > +               barrier();                                   \
> > +               __v;                                         \
> > +       })
> > +
> > +#define smp_store_release(p, val)      \
> > +       ({                             \
> > +               if (!CONFIG_X86_64)    \
> > +                       smp_mb();      \
> > +               barrier();             \
> > +               WRITE_ONCE(*(p), val); \
> > +       })
> > +
> > +#define smp_cond_load_relaxed_label(p, cond_expr, label)                \
> > +       ({                                                              \
> > +               typeof(p) __ptr = (p);                                  \
> > +               __unqual_typeof(*(p)) VAL;                              \
> > +               for (;;) {                                              \
> > +                       VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \
> > +                       if (cond_expr)                                  \
> > +                               break;                                  \
> > +                       cond_break_label(label);                        \
> > +                       cpu_relax();                                    \
> > +               }                                                       \
> > +               (typeof(*(p)))VAL;                                      \
> > +       })
> > +
> > +#define smp_cond_load_acquire_label(p, cond_expr, label)                  \
> > +       ({                                                                \
> > +               __unqual_typeof(*p) __val =                               \
> > +                       smp_cond_load_relaxed_label(p, cond_expr, label); \
> > +               smp_acquire__after_ctrl_dep();                            \
> > +               (typeof(*(p)))__val;                                      \
> > +       })
> > +
> > +#define atomic_read(p) READ_ONCE((p)->counter)
> > +
> > +#define atomic_cond_read_relaxed_label(p, cond_expr, label) \
> > +       smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label)
> > +
> > +#define atomic_cond_read_acquire_label(p, cond_expr, label) \
> > +       smp_cond_load_acquire_label(&(p)->counter, cond_expr, label)
> > +
> > +#define atomic_try_cmpxchg_relaxed(p, pold, new) \
> > +       try_cmpxchg_relaxed(&(p)->counter, pold, new)
> > +
> > +#define atomic_try_cmpxchg_acquire(p, pold, new) \
> > +       try_cmpxchg_acquire(&(p)->counter, pold, new)
> > +
> > +#endif /* BPF_ATOMIC_H */
> > --
> > 2.47.1
> >
Alexei Starovoitov March 5, 2025, 3:40 a.m. UTC | #3
On Tue, Mar 4, 2025 at 7:15 PM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> > which is doing:
> > __unqual_typeof(*(p)) VAL;
> > (__unqual_typeof(*(p)))READ_ONCE(*__ptr);
> >
> > and llvm will insert cast_kern() there,
>
> Yes, I do see a r1 = addr_space_cast(r2, 0x0, 0x1).
> r2 is node->next loaded from arena pointer 'node'.
>
> But I can't understand why that's a problem.
>
> If I do
> for (;;) {
>   next = READ_ONCE(node->next);
>   if (next)
>      break;
>   cond_break_label(...);
> }
>
> instead of the macro, everything works ok.

because the above doesn't have addr space casts.

> But that's because LLVM didn't insert a cast, and the verifier sees
> next as a scalar.
> So if next is 0x100000000000, it will see 0x100000000000.
> With cast_kern it only sees 0.

right.

> > It will probably be cast once we try to write to next->locked later on.

not quite.
In a typical program llvm will emit bare minimum cast_user,
because all pointers are full 64-bit valid user space addresses all the time.
The cast_kern() is needed for read/write through the pointer
if it's not a kernel pointer yet.
See list_add_head() in bpf_arena_list.h that has
a bunch of explicit cast_kern/user (with llvm there will be a fraction
of them), but they illustrate the idea:
        cast_user(first);
        cast_kern(n); // before writing into 'n' it has to be 'kern'
        WRITE_ONCE(n->next, first); // first has to be full 64-bit
        cast_kern(first); // ignore this one :) it's my mistake, should be after 'if'
        if (first) {
                tmp = &n->next;
                cast_user(tmp);
                WRITE_ONCE(first->pprev, tmp);
        }
        cast_user(n);
        WRITE_ONCE(h->first, n);

> I would gather there's a lot of other cases where someone dereferences
> before doing some pointer equality comparison.
> In that case we might end up in the same situation.
> ptr = load_from_arena;
> x = ptr->xyz;
> if (ptr == ptr2) { ... }

There shouldn't be any issues here.
The 'load from arena' will return full 64-bit and they should
be stored as full 64-bit in memory.
ptr->xyz (assuming xyz is another pointer) will read full 64-bit too.

> The extra cast_kern is certainly causing this to surface, but I am not
> sure whether it's something to fix in the macro.

I think it's a macro issue due to casting addr space off.

> > so if (VAL) always sees upper 32-bit as zero.
> >
> > So I suspect it's not a zero page issue.
> >
>
> When I bpf_printk the node address of the qnode of CPU 0, it is
> 0x100000000000 i.e. user_vm_start. This is the pointer that's misdetected.
> So it appears to be on the first page.

yes, and it looks like the addr passed into printk is the correct full 64-bit value
as it should be.
So this part:
  return &qnodes[cpu + 1][idx].mcs;
is fine.
It's full 64-bit.
  &((struct arena_qnode __arena *)base + idx)->mcs;
is also ok.

There are no addr space casts there.
But the macro is problematic.
Kumar Kartikeya Dwivedi March 5, 2025, 4:28 a.m. UTC | #4
On Wed, 5 Mar 2025 at 04:41, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Mar 4, 2025 at 7:15 PM Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
> >
> > > which is doing:
> > > __unqual_typeof(*(p)) VAL;
> > > (__unqual_typeof(*(p)))READ_ONCE(*__ptr);
> > >
> > > and llvm will insert cast_kern() there,
> >
> > Yes, I do see a r1 = addr_space_cast(r2, 0x0, 0x1).
> > r2 is node->next loaded from arena pointer 'node'.
> >
> > But I can't understand why that's a problem.
> >
> > If I do
> > for (;;) {
> >   next = READ_ONCE(node->next);
> >   if (next)
> >      break;
> >   cond_break_label(...);
> > }
> >
> > instead of the macro, everything works ok.
>
> because the above doesn't have addr space casts.
>
> > But that's because LLVM didn't insert a cast, and the verifier sees
> > next as a scalar.
> > So if next is 0x100000000000, it will see 0x100000000000.
> > With cast_kern it only sees 0.
>
> right.
>
> > It will probably be cast once we try to write to next->locked later on.
>
> not quite.
> In a typical program llvm will emit bare minimum cast_user,
> because all pointers are full 64-bit valid user space addresses all the time.
> The cast_kern() is needed for read/write through the pointer
> if it's not a kernel pointer yet.
> See list_add_head() in bpf_arena_list.h that has
> a bunch of explicit cast_kern/user (with llvm there will be a fraction
> of them), but they illustrate the idea:
>         cast_user(first);
>         cast_kern(n); // before writing into 'n' it has to be 'kern'
>         WRITE_ONCE(n->next, first); // first has to be full 64-bit
>         cast_kern(first); // ignore this one :) it's my mistake, should be after 'if'
>         if (first) {
>                 tmp = &n->next;
>                 cast_user(tmp);
>                 WRITE_ONCE(first->pprev, tmp);
>         }
>         cast_user(n);
>         WRITE_ONCE(h->first, n);
>
> > I would gather there's a lot of other cases where someone dereferences
> > before doing some pointer equality comparison.
> > In that case we might end up in the same situation.
> > ptr = load_from_arena;
> > x = ptr->xyz;
> > if (ptr == ptr2) { ... }
>
> There shouldn't be any issues here.
> The 'load from arena' will return full 64-bit and they should
> be stored as full 64-bit in memory.
> ptr->xyz (assuming xyz is another pointer) will read full 64-bit too.
>
> > The extra cast_kern is certainly causing this to surface, but I am not
> > sure whether it's something to fix in the macro.
>
> I think it's a macro issue due to casting addr space off.
>
> > > so if (VAL) always sees upper 32-bit as zero.
> > >
> > > So I suspect it's not a zero page issue.
> > >
> >
> > When I bpf_printk the node address of the qnode of CPU 0, it is
> > 0x100000000000 i.e. user_vm_start. This is the pointer that's misdetected.
> > So it appears to be on the first page.
>
> yes, and it looks like the addr passed into printk is the correct full 64-bit value
> as it should be.
> So this part:
>   return &qnodes[cpu + 1][idx].mcs;
> is fine.
> It's full 64-bit.
>   &((struct arena_qnode __arena *)base + idx)->mcs;
> is also ok.
>
> There are no addr space casts there.
> But the macro is problematic.

Ok, makes sense. Pointer values should always be the full 64-bit equivalent.
I fixed up __unqual_typeof to not cause the extra cast, as it isn't necessary
in the case of pointers anyway.
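
A sketch of what such a fixed-up __unqual_typeof might look like (not
necessarily the exact change in the next revision): tag the _Generic default
case with __arena, so that pointer-typed values keep their address space and
LLVM has no reason to emit an addr_space_cast before the NULL check:

#define __unqual_typeof(x)                              \
	typeof(_Generic((x),                            \
		char: (char)0,                          \
		__scalar_type_to_expr_cases(char),      \
		__scalar_type_to_expr_cases(short),     \
		__scalar_type_to_expr_cases(int),       \
		__scalar_type_to_expr_cases(long),      \
		__scalar_type_to_expr_cases(long long), \
		default: (void __arena *)0))

Scalar uses still lose their qualifiers exactly as before; only the pointer
default changes.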
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
new file mode 100644
index 000000000000..cc7de78e0373
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h
@@ -0,0 +1,505 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#ifndef BPF_ARENA_SPIN_LOCK_H
+#define BPF_ARENA_SPIN_LOCK_H
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_atomic.h"
+
+#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label)
+#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1)
+
+#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+
+#define EBUSY 16
+#define EOPNOTSUPP 95
+#define ETIMEDOUT 110
+
+#ifndef __arena
+#define __arena __attribute__((address_space(1)))
+#endif
+
+extern unsigned long CONFIG_NR_CPUS __kconfig;
+
+#define arena_spinlock_t struct qspinlock
+/* FIXME: Using typedef causes CO-RE relocation error */
+/* typedef struct qspinlock arena_spinlock_t; */
+
+struct arena_mcs_spinlock {
+	struct arena_mcs_spinlock __arena *next;
+	int locked;
+	int count;
+};
+
+struct arena_qnode {
+	struct arena_mcs_spinlock mcs;
+};
+
+#define _Q_MAX_NODES		4
+#define _Q_PENDING_LOOPS	1
+
+/*
+ * Bitfields in the atomic value:
+ *
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ */
+#define _Q_MAX_CPUS		1024
+
+#define	_Q_SET_MASK(type)	(((1U << _Q_ ## type ## _BITS) - 1)\
+				      << _Q_ ## type ## _OFFSET)
+#define _Q_LOCKED_OFFSET	0
+#define _Q_LOCKED_BITS		8
+#define _Q_LOCKED_MASK		_Q_SET_MASK(LOCKED)
+
+#define _Q_PENDING_OFFSET	(_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#define _Q_PENDING_BITS		8
+#define _Q_PENDING_MASK		_Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET	(_Q_PENDING_OFFSET + _Q_PENDING_BITS)
+#define _Q_TAIL_IDX_BITS	2
+#define _Q_TAIL_IDX_MASK	_Q_SET_MASK(TAIL_IDX)
+
+#define _Q_TAIL_CPU_OFFSET	(_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS)
+#define _Q_TAIL_CPU_BITS	(32 - _Q_TAIL_CPU_OFFSET)
+#define _Q_TAIL_CPU_MASK	_Q_SET_MASK(TAIL_CPU)
+
+#define _Q_TAIL_OFFSET		_Q_TAIL_IDX_OFFSET
+#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+#define _Q_LOCKED_VAL		(1U << _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL		(1U << _Q_PENDING_OFFSET)
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/*
+ * We always index with + 1, in case we unfortunately place the qnodes at
+ * pg_offset=0 and then CPU 0's qnodes is indistinguishable from NULL if lower
+ * 32-bits of the node pointer are 0.
+ */
+struct arena_qnode __arena qnodes[_Q_MAX_CPUS + 1][_Q_MAX_NODES];
+
+static inline u32 encode_tail(int cpu, int idx)
+{
+	u32 tail;
+
+	tail  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
+	tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */
+
+	return tail;
+}
+
+static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail)
+{
+	u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
+	u32 idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+
+	/* See comments over definition of qnodes for the + 1. */
+	if (likely(idx < _Q_MAX_NODES && cpu < _Q_MAX_CPUS))
+		return &qnodes[cpu + 1][idx].mcs;
+	bpf_printk("RUNTIME ERROR: %s idx=%u and cpu=%u are out-of-bounds!!!", __func__, idx, cpu);
+	return NULL;
+}
+
+static inline
+struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx)
+{
+	return &((struct arena_qnode __arena *)base + idx)->mcs;
+}
+
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail)
+{
+	u32 old, new;
+
+	old = atomic_read(&lock->val);
+	do {
+		new = (old & _Q_LOCKED_PENDING_MASK) | tail;
+		/*
+		 * We can use relaxed semantics since the caller ensures that
+		 * the MCS node is properly initialized before updating the
+		 * tail.
+		 */
+		/* These loops are not expected to stall, but we still need to
+		 * prove to the verifier they will terminate eventually.
+		 */
+		cond_break_label(out);
+	} while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new));
+
+	return old;
+out:
+	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
+	return old;
+}
+
+/**
+ * clear_pending - clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,* -> *,0,*
+ */
+static __always_inline void clear_pending(arena_spinlock_t __arena *lock)
+{
+	WRITE_ONCE(lock->pending, 0);
+}
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock)
+{
+	WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+}
+
+/**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,*,0 -> *,0,1
+ */
+static __always_inline void set_locked(arena_spinlock_t __arena *lock)
+{
+	WRITE_ONCE(lock->locked, _Q_LOCKED_VAL);
+}
+
+static __always_inline
+u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock)
+{
+	u32 old, new;
+
+	old = atomic_read(&lock->val);
+	do {
+		new = old | _Q_PENDING_VAL;
+		/*
+		 * These loops are not expected to stall, but we still need to
+		 * prove to the verifier they will terminate eventually.
+		 */
+		cond_break_label(out);
+	} while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new));
+
+	return old;
+out:
+	bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__);
+	return old;
+}
+
+/**
+ * arena_spin_trylock - try to acquire the queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock)
+{
+	int val = atomic_read(&lock->val);
+
+	if (unlikely(val))
+		return 0;
+
+	return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL));
+}
+
+__noinline
+int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val)
+{
+	struct arena_mcs_spinlock __arena *prev, *next, *node0, *node;
+	int ret = -ETIMEDOUT;
+	u32 old, tail;
+	int idx;
+
+	/*
+	 * Wait for in-progress pending->locked hand-overs with a bounded
+	 * number of spins so that we guarantee forward progress.
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	if (val == _Q_PENDING_VAL) {
+		int cnt = _Q_PENDING_LOOPS;
+		val = atomic_cond_read_relaxed_label(&lock->val,
+						     (VAL != _Q_PENDING_VAL) || !cnt--,
+						     release_err);
+	}
+
+	/*
+	 * If we observe any contention; queue.
+	 */
+	if (val & ~_Q_LOCKED_MASK)
+		goto queue;
+
+	/*
+	 * trylock || pending
+	 *
+	 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
+	 */
+	val = arena_fetch_set_pending_acquire(lock);
+
+	/*
+	 * If we observe contention, there is a concurrent locker.
+	 *
+	 * Undo and queue; our setting of PENDING might have made the
+	 * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+	 * on @next to become !NULL.
+	 */
+	if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+		/* Undo PENDING if we set it. */
+		if (!(val & _Q_PENDING_MASK))
+			clear_pending(lock);
+
+		goto queue;
+	}
+
+	/*
+	 * We're pending, wait for the owner to go away.
+	 *
+	 * 0,1,1 -> *,1,0
+	 *
+	 * this wait loop must be a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because not all
+	 * clear_pending_set_locked() implementations imply full
+	 * barriers.
+	 */
+	if (val & _Q_LOCKED_MASK)
+		smp_cond_load_acquire_label(&lock->locked, !VAL, release_err);
+
+	/*
+	 * take ownership and clear the pending bit.
+	 *
+	 * 0,1,0 -> 0,0,1
+	 */
+	clear_pending_set_locked(lock);
+	return 0;
+
+	/*
+	 * End of pending bit optimistic spinning and beginning of MCS
+	 * queuing.
+	 */
+queue:
+	/* See comments over definition of qnodes for the + 1. */
+	node0 = &(qnodes[bpf_get_smp_processor_id() + 1])[0].mcs;
+	idx = node0->count++;
+	tail = encode_tail(bpf_get_smp_processor_id(), idx);
+
+	/*
+	 * 4 nodes are allocated based on the assumption that there will not be
+	 * nested NMIs taking spinlocks. That may not be true in some
+	 * architectures even though the chance of needing more than 4 nodes
+	 * will still be extremely unlikely. When that happens, we simply return
+	 * an error. Original qspinlock has a trylock fallback in this case.
+	 */
+	if (unlikely(idx >= _Q_MAX_NODES)) {
+		ret = -EBUSY;
+		goto release_node_err;
+	}
+
+	node = grab_mcs_node(node0, idx);
+
+	/*
+	 * Ensure that we increment the head node->count before initialising
+	 * the actual node. If the compiler is kind enough to reorder these
+	 * stores, then an IRQ could overwrite our assignments.
+	 */
+	barrier();
+
+	node->locked = 0;
+	node->next = NULL;
+
+	/*
+	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
+	 * attempt the trylock once more in the hope someone let go while we
+	 * weren't watching.
+	 */
+	if (arena_spin_trylock(lock))
+		goto release;
+
+	/*
+	 * Ensure that the initialisation of @node is complete before we
+	 * publish the updated tail via xchg_tail() and potentially link
+	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
+	 */
+	smp_wmb();
+
+	/*
+	 * Publish the updated tail.
+	 * We have already touched the queueing cacheline; don't bother with
+	 * pending stuff.
+	 *
+	 * p,*,* -> n,*,*
+	 */
+	old = xchg_tail(lock, tail);
+	next = NULL;
+
+	/*
+	 * if there was a previous node; link it and wait until reaching the
+	 * head of the waitqueue.
+	 */
+	if (old & _Q_TAIL_MASK) {
+		prev = decode_tail(old);
+
+		/* Link @node into the waitqueue. */
+		WRITE_ONCE(prev->next, node);
+
+		arch_mcs_spin_lock_contended_label(&node->locked, release_node_err);
+
+		/*
+		 * While waiting for the MCS lock, the next pointer may have
+		 * been set by another lock waiter. We cannot prefetch here
+		 * due to lack of equivalent instruction in BPF ISA.
+		 */
+		next = READ_ONCE(node->next);
+	}
+
+	/*
+	 * we're at the head of the waitqueue, wait for the owner & pending to
+	 * go away.
+	 *
+	 * *,x,y -> *,0,0
+	 *
+	 * this wait loop must use a load-acquire such that we match the
+	 * store-release that clears the locked bit and create lock
+	 * sequentiality; this is because the set_locked() function below
+	 * does not imply a full barrier.
+	 */
+	val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK),
+					     release_node_err);
+
+	/*
+	 * claim the lock:
+	 *
+	 * n,0,0 -> 0,0,1 : lock, uncontended
+	 * *,*,0 -> *,*,1 : lock, contended
+	 *
+	 * If the queue head is the only one in the queue (lock value == tail)
+	 * and nobody is pending, clear the tail code and grab the lock.
+	 * Otherwise, we only need to grab the lock.
+	 */
+
+	/*
+	 * In the PV case we might already have _Q_LOCKED_VAL set, because
+	 * of lock stealing; therefore we must also allow:
+	 *
+	 * n,0,1 -> 0,0,1
+	 *
+	 * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+	 *       above wait condition, therefore any concurrent setting of
+	 *       PENDING will make the uncontended transition fail.
+	 */
+	if ((val & _Q_TAIL_MASK) == tail) {
+		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+			goto release; /* No contention */
+	}
+
+	/*
+	 * Either somebody is queued behind us or _Q_PENDING_VAL got set
+	 * which will then detect the remaining tail and queue behind us
+	 * ensuring we'll see a @next.
+	 */
+	set_locked(lock);
+
+	/*
+	 * contended path; wait for next if not observed yet, release.
+	 */
+	if (!next)
+		next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err);
+
+	arch_mcs_spin_unlock_contended(&next->locked);
+
+release:;
+	/*
+	 * release the node
+	 *
+	 * Doing a normal dec vs this_cpu_dec is fine. An upper context always
+	 * decrements the count it incremented before returning, thus we're fine.
+	 * For contexts interrupting us, they either observe our dec or not.
+	 * Just ensure the compiler doesn't reorder this statement, as
+	 * this_cpu_dec would have implied that ordering.
+	 */
+	barrier();
+	node0->count--;
+	return 0;
+release_node_err:
+	barrier();
+	node0->count--;
+	goto release_err;
+release_err:
+	return ret;
+}
+
+/**
+ * arena_spin_lock - acquire a queued spinlock
+ * @lock: Pointer to queued spinlock structure
+ *
+ * The return value _must_ be tested against zero for success.
+ * On error, returned value will be negative.
+ */
+static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock)
+{
+	int val = 0;
+
+	if (CONFIG_NR_CPUS > 1024)
+		return -EOPNOTSUPP;
+
+	bpf_preempt_disable();
+	if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
+		return 0;
+
+	val = arena_spin_lock_slowpath(lock, val);
+	/* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */
+	if (val)
+		bpf_preempt_enable();
+	return val;
+}
+
+/**
+ * arena_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ */
+static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock)
+{
+	/*
+	 * unlock() needs release semantics:
+	 */
+	smp_store_release(&lock->locked, 0);
+	bpf_preempt_enable();
+}
+
+#define arena_spin_lock_irqsave(lock, flags)             \
+	({                                               \
+		int __ret;                               \
+		bpf_local_irq_save(&(flags));            \
+		__ret = arena_spin_lock((lock));         \
+		if (__ret)                               \
+			bpf_local_irq_restore(&(flags)); \
+		(__ret);                                 \
+	})
+
+#define arena_spin_unlock_irqrestore(lock, flags) \
+	({                                        \
+		arena_spin_unlock((lock));        \
+		bpf_local_irq_restore(&(flags));  \
+	})
+
+#endif
+
+#endif /* BPF_ARENA_SPIN_LOCK_H */
diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h
new file mode 100644
index 000000000000..06defb9e050d
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_atomic.h
@@ -0,0 +1,132 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */
+#ifndef BPF_ATOMIC_H
+#define BPF_ATOMIC_H
+
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_experimental.h"
+
+extern bool CONFIG_X86_64 __kconfig __weak;
+
+#define __scalar_type_to_expr_cases(type) \
+	unsigned type : (unsigned type)0, signed type : (signed type)0
+/*
+ * This is lifted from __unqual_scalar_typeof in the kernel (which is used to
+ * lose const qualifier etc.), but adapted to also cover pointers. It is
+ * necessary because we ascertain type to create local variables in macros
+ * below, but for pointers with __arena tag, we'll ascertain the underlying type
+ * with the tag, causing a compilation error (as local variables that are not
+ * pointers may not have __arena tag). This trick allows losing the qualifier.
+ */
+#define __unqual_typeof(x)                              \
+	typeof(_Generic((x),                            \
+		char: (char)0,                          \
+		__scalar_type_to_expr_cases(char),      \
+		__scalar_type_to_expr_cases(short),     \
+		__scalar_type_to_expr_cases(int),       \
+		__scalar_type_to_expr_cases(long),      \
+		__scalar_type_to_expr_cases(long long), \
+		default: (void *)0))
+
+/* No-op for BPF */
+#define cpu_relax() ({})
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val))
+
+#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new)
+
+#define try_cmpxchg(p, pold, new)                                 \
+	({                                                        \
+		__unqual_typeof(*(pold)) __o = *(pold);           \
+		__unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \
+		if (__r != __o)                                   \
+			*(pold) = __r;                            \
+		__r == __o;                                       \
+	})
+
+#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new)
+
+#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new)
+
+#define smp_mb()                                 \
+	({                                       \
+		unsigned long __val;             \
+		__sync_fetch_and_add(&__val, 0); \
+	})
+
+#define smp_rmb()                   \
+	({                          \
+		if (!CONFIG_X86_64) \
+			smp_mb();   \
+		else                \
+			barrier();  \
+	})
+
+#define smp_wmb()                   \
+	({                          \
+		if (!CONFIG_X86_64) \
+			smp_mb();   \
+		else                \
+			barrier();  \
+	})
+
+/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */
+#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); })
+
+#define smp_load_acquire(p)                                  \
+	({                                                   \
+		__unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \
+		if (!CONFIG_X86_64)                          \
+			smp_mb();                            \
+		barrier();                                   \
+		__v;                                         \
+	})
+
+#define smp_store_release(p, val)      \
+	({                             \
+		if (!CONFIG_X86_64)    \
+			smp_mb();      \
+		barrier();             \
+		WRITE_ONCE(*(p), val); \
+	})
+
+#define smp_cond_load_relaxed_label(p, cond_expr, label)                \
+	({                                                              \
+		typeof(p) __ptr = (p);                                  \
+		__unqual_typeof(*(p)) VAL;                              \
+		for (;;) {                                              \
+			VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \
+			if (cond_expr)                                  \
+				break;                                  \
+			cond_break_label(label);			\
+			cpu_relax();                                    \
+		}                                                       \
+		(typeof(*(p)))VAL;                                      \
+	})
+
+#define smp_cond_load_acquire_label(p, cond_expr, label)                  \
+	({                                                                \
+		__unqual_typeof(*p) __val =                               \
+			smp_cond_load_relaxed_label(p, cond_expr, label); \
+		smp_acquire__after_ctrl_dep();                            \
+		(typeof(*(p)))__val;                                      \
+	})
+
+#define atomic_read(p) READ_ONCE((p)->counter)
+
+#define atomic_cond_read_relaxed_label(p, cond_expr, label) \
+	smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label)
+
+#define atomic_cond_read_acquire_label(p, cond_expr, label) \
+	smp_cond_load_acquire_label(&(p)->counter, cond_expr, label)
+
+#define atomic_try_cmpxchg_relaxed(p, pold, new) \
+	try_cmpxchg_relaxed(&(p)->counter, pold, new)
+
+#define atomic_try_cmpxchg_acquire(p, pold, new) \
+	try_cmpxchg_acquire(&(p)->counter, pold, new)
+
+#endif /* BPF_ATOMIC_H */
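
For context, here is a minimal usage sketch of the API added above. It is
illustrative only: the map definition, program section and symbol names are
made up for this example and are not taken from the selftests in this series.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_arena_spin_lock.h"

/* A single arena map backs the __arena globals below. */
struct {
	__uint(type, BPF_MAP_TYPE_ARENA);
	__uint(map_flags, BPF_F_MMAPABLE);
	__uint(max_entries, 10); /* number of arena pages */
} arena SEC(".maps");

#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)

arena_spinlock_t __arena lock;
int __arena counter;

SEC("tc")
int increment(void *ctx)
{
	/* The return value must be checked: the slowpath can bail out with
	 * -EBUSY or -ETIMEDOUT instead of spinning forever, and -EOPNOTSUPP
	 * is returned when CONFIG_NR_CPUS exceeds the supported 1024 CPUs.
	 */
	if (arena_spin_lock(&lock))
		return 0;
	counter++;
	arena_spin_unlock(&lock);
	return 0;
}

#endif

char _license[] SEC("license") = "GPL";

The irqsave/irqrestore variants follow the same pattern, with an unsigned long
flags variable passed by name to arena_spin_lock_irqsave() and
arena_spin_unlock_irqrestore().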