
[bpf-next,v2] selftests/bpf: Fix arena_atomics selftest failure due to llvm change

Message ID 20240803025928.4184433-1-yonghong.song@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next,v2] selftests/bpf: Fix arena_atomics selftest failure due to llvm change

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/build_tools success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers warning 17 maintainers not CCed: kpsingh@kernel.org shuah@kernel.org haoluo@google.com nathan@kernel.org john.fastabend@gmail.com mykolal@fb.com song@kernel.org morbo@google.com iii@linux.ibm.com sdf@fomichev.me llvm@lists.linux.dev justinstitt@google.com jolsa@kernel.org linux-kselftest@vger.kernel.org martin.lau@linux.dev eddyz87@gmail.com ndesaulniers@google.com
netdev/build_clang success Errors and warnings before: 7 this patch: 7
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7 this patch: 7
netdev/checkpatch warning CHECK: Lines should not end with a '(' WARNING: 'compatability' may be misspelled - perhaps 'compatibility'?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yonghong Song Aug. 3, 2024, 2:59 a.m. UTC
Peilin Ye reported an issue ([1]) where, for a __sync_fetch_and_add(...) whose
return value is not used, e.g.
  __sync_fetch_and_add(&foo, 1);
the llvm BPF backend generates a locked insn, e.g.
  lock *(u32 *)(r1 + 0) += r2

If the return value of __sync_fetch_and_add(...) is used, e.g.
  res = __sync_fetch_and_add(&foo, 1);
the llvm BPF backend instead generates
  r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)

But 'lock *(u32 *)(r1 + 0) += r2' causes a problem in the jit,
since no proper barrier is inserted as required by __sync_fetch_and_add() semantics.

The above discrepancy is due to commit [2], which tries to maintain backward
compatibility: before commit [2], __sync_fetch_and_add(...) generated the
lock insn in the BPF backend.

Based on the discussion in [1], it is now time to fix the above discrepancy so we can
have proper barrier support in the jit. llvm patch [3] makes sure that __sync_fetch_and_add(...)
always generates atomic_fetch_add(...) insns; now 'lock *(u32 *)(r1 + 0) += r2' can only
be generated by inline asm. The same holds for __sync_fetch_and_and(), __sync_fetch_and_or()
and __sync_fetch_and_xor().
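
For reference, a minimal standalone sketch (not part of this patch; names are
illustrative only) that reproduces the two forms when built with clang --target=bpf -O2:

  int foo;

  void add_no_ret(void)
  {
          /* result unused: older llvm emits the locked form shown above */
          __sync_fetch_and_add(&foo, 1);
  }

  int add_ret(void)
  {
          /* result used: llvm emits the atomic_fetch_add() form shown above */
          return __sync_fetch_and_add(&foo, 1);
  }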

But the change in [3] caused an arena_atomics selftest failure.

  test_arena_atomics:PASS:arena atomics skeleton open 0 nsec
  libbpf: prog 'and': BPF program load failed: Permission denied
  libbpf: prog 'and': -- BEGIN PROG LOAD LOG --
  arg#0 reference type('UNKNOWN ') size cannot be determined: -22
  0: R1=ctx() R10=fp0
  ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87
  0: (18) r1 = 0xffffc90000064000       ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4)
  2: (61) r6 = *(u32 *)(r1 +0)          ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
  3: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
  4: (77) r0 >>= 32                     ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
  5: (5d) if r0 != r6 goto pc+11        ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x)
  ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91
  6: (18) r1 = 0x100000000060           ; R1_w=scalar()
  8: (bf) r1 = addr_space_cast(r1, 0, 1)        ; R1_w=arena
  9: (18) r2 = 0x1100000000             ; R2_w=0x1100000000
  11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2)
  BPF_ATOMIC stores into R1 arena is not allowed
  processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
  -- END PROG LOAD LOG --
  libbpf: prog 'and': failed to load: -13
  libbpf: failed to load object 'arena_atomics'
  libbpf: failed to load BPF skeleton 'arena_atomics': -13
  test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13)
  #3       arena_atomics:FAIL

The failure is due to [4], where atomic{64,}_fetch_{and,or,xor}() are not
allowed for arena addresses. Without llvm patch [3], the compiler generates a 'lock ...'
insn and everything works fine.

This patch fixes the problem by using inline asm. Instead of the __sync_fetch_and_{and,or,xor}() functions,
inline asm with the 'lock' insn is used, which works with or without [3].
Note that the three bpf programs ('and', 'or' and 'xor') are additionally guarded with __BPF_FEATURE_ADDR_SPACE_CAST
to avoid compilation failure with llvm <= 18. For llvm <= 18, where
addr_space_cast is not supported, all arena_atomics subtests are skipped with the below message:
  test_arena_atomics:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang
  #3 arena_atomics:SKIP

  [1] https://lore.kernel.org/bpf/ZqqiQQWRnz7H93Hc@google.com/T/#mb68d67bc8f39e35a0c3db52468b9de59b79f021f
  [2] https://github.com/llvm/llvm-project/commit/286daafd65129228e08a1d07aa4ca74488615744
  [3] https://github.com/llvm/llvm-project/pull/101428
  [4] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT")

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 .../selftests/bpf/progs/arena_atomics.c       | 63 ++++++++++++++++---
 1 file changed, 54 insertions(+), 9 deletions(-)

Changelog:
  v1 -> v2:
    - Add __BPF_FEATURE_ADDR_SPACE_CAST to guard newly added asm codes for llvm >= 19

Comments

Alexei Starovoitov Aug. 5, 2024, 5:53 p.m. UTC | #1
On Fri, Aug 2, 2024 at 7:59 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> Peilen Ye reported an issue ([1]) where for __sync_fetch_and_add(...) without
> return value like
>   __sync_fetch_and_add(&foo, 1);
> llvm BPF backend generates locked insn e.g.
>   lock *(u32 *)(r1 + 0) += r2
>
> If __sync_fetch_and_add(...) returns a value like
>   res = __sync_fetch_and_add(&foo, 1);
> llvm BPF backend generates like
>   r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
>
> But 'lock *(u32 *)(r1 + 0) += r2' caused a problem in jit
> since proper barrier is not inserted based on __sync_fetch_and_add() semantics.
>
> The above discrepancy is due to commit [2] where it tries to maintain backward
> compatability since before commit [2], __sync_fetch_and_add(...) generates
> lock insn in BPF backend.
>
> Based on discussion in [1], now it is time to fix the above discrepancy so we can
> have proper barrier support in jit. llvm patch [3] made sure that __sync_fetch_and_add(...)
> always generates atomic_fetch_add(...) insns. Now 'lock *(u32 *)(r1 + 0) += r2' can only
> be generated by inline asm. The same for __sync_fetch_and_and(), __sync_fetch_and_or()
> and __sync_fetch_and_xor().
>
> But the change in [3] caused arena_atomics selftest failure.
>
>   test_arena_atomics:PASS:arena atomics skeleton open 0 nsec
>   libbpf: prog 'and': BPF program load failed: Permission denied
>   libbpf: prog 'and': -- BEGIN PROG LOAD LOG --
>   arg#0 reference type('UNKNOWN ') size cannot be determined: -22
>   0: R1=ctx() R10=fp0
>   ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87
>   0: (18) r1 = 0xffffc90000064000       ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4)
>   2: (61) r6 = *(u32 *)(r1 +0)          ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>   3: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
>   4: (77) r0 >>= 32                     ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>   5: (5d) if r0 != r6 goto pc+11        ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x)
>   ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91
>   6: (18) r1 = 0x100000000060           ; R1_w=scalar()
>   8: (bf) r1 = addr_space_cast(r1, 0, 1)        ; R1_w=arena
>   9: (18) r2 = 0x1100000000             ; R2_w=0x1100000000
>   11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2)
>   BPF_ATOMIC stores into R1 arena is not allowed
>   processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>   -- END PROG LOAD LOG --
>   libbpf: prog 'and': failed to load: -13
>   libbpf: failed to load object 'arena_atomics'
>   libbpf: failed to load BPF skeleton 'arena_atomics': -13
>   test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13)
>   #3       arena_atomics:FAIL
>
> The reason of the failure is due to [4] where atomic{64,}_fetch_{and,or,xor}() are not
> allowed by arena addresses. Without llvm patch [3], the compiler will generate 'lock ...'
> insn and everything will work fine.
>
> This patch fixed the problem by using inline asms. Instead of __sync_fetch_and_{and,or,xor}() functions,
> the inline asm with 'lock' insn is used and it will work with or without [3].
> Note that three bpf programs ('and', 'or' and 'xor') are guarded with __BPF_FEATURE_ADDR_SPACE_CAST
> as well to ensure compilation failure for llvm <= 18 version. Note that for llvm <= 18 where
> addr_space_cast is not supported, all arena_atomics subtests are skipped with below message:
>   test_arena_atomics:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang
>   #3 arena_atomics:SKIP
>
>   [1] https://lore.kernel.org/bpf/ZqqiQQWRnz7H93Hc@google.com/T/#mb68d67bc8f39e35a0c3db52468b9de59b79f021f
>   [2] https://github.com/llvm/llvm-project/commit/286daafd65129228e08a1d07aa4ca74488615744
>   [3] https://github.com/llvm/llvm-project/pull/101428
>   [4] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT")
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  .../selftests/bpf/progs/arena_atomics.c       | 63 ++++++++++++++++---
>  1 file changed, 54 insertions(+), 9 deletions(-)
>
> Changelog:
>   v1 -> v2:
>     - Add __BPF_FEATURE_ADDR_SPACE_CAST to guard newly added asm codes for llvm >= 19
>
> diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
> index bb0acd79d28a..dea54557fc00 100644
> --- a/tools/testing/selftests/bpf/progs/arena_atomics.c
> +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
> @@ -5,6 +5,7 @@
>  #include <bpf/bpf_tracing.h>
>  #include <stdbool.h>
>  #include "bpf_arena_common.h"
> +#include "bpf_misc.h"
>
>  struct {
>         __uint(type, BPF_MAP_TYPE_ARENA);
> @@ -85,10 +86,24 @@ int and(const void *ctx)
>  {
>         if (pid != (bpf_get_current_pid_tgid() >> 32))
>                 return 0;
> -#ifdef ENABLE_ATOMICS_TESTS
> +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
>
> -       __sync_fetch_and_and(&and64_value, 0x011ull << 32);
> -       __sync_fetch_and_and(&and32_value, 0x011);
> +       asm volatile(
> +               "r1 = addr_space_cast(%[and64_value], 0, 1);"
> +               "lock *(u64 *)(r1 + 0) &= %[val]"
> +               :
> +               : __imm_ptr(and64_value),
> +                 [val]"r"(0x011ull << 32)
> +               : "r1"
> +       );
> +       asm volatile(
> +               "r1 = addr_space_cast(%[and32_value], 0, 1);"
> +               "lock *(u32 *)(r1 + 0) &= %[val]"
> +               :
> +               : __imm_ptr(and32_value),
> +                 [val]"w"(0x011)
> +               : "r1"
> +       );

Instead of inline asm there is a better way to do the same in C.
https://godbolt.org/z/71PYx1oqE

void foo(int a, _Atomic int *b)
{
 *b += a;
}

generates:
lock *(u32 *)(r2 + 0) += r1

but
*b &= a;

crashes llvm :( with

<source>:3:5: error: unsupported atomic operation, please use 64 bit version
    3 |  *b &= a;

but works with -mcpu=v3

So let's make this test mcpu=v3 only and use normal C?
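
For reference, a minimal sketch of the suggested plain-C form (hypothetical, not
the actual test; it ignores the arena address space cast and assumes -mcpu=v3 so
the 32-bit form is supported):

_Atomic unsigned long long and64;
_Atomic unsigned int and32;

void and_test(void)
{
        /* compound assignment on an _Atomic object lowers to an atomicrmw;
         * whether the backend emits 'lock ... &=' or atomic_fetch_and()
         * depends on the llvm version, as discussed in this thread */
        and64 &= 0x011ull << 32;
        and32 &= 0x011;
}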

pw-bot: cr
Yonghong Song Aug. 5, 2024, 10:36 p.m. UTC | #2
On 8/5/24 10:53 AM, Alexei Starovoitov wrote:
> On Fri, Aug 2, 2024 at 7:59 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> Peilen Ye reported an issue ([1]) where for __sync_fetch_and_add(...) without
>> return value like
>>    __sync_fetch_and_add(&foo, 1);
>> llvm BPF backend generates locked insn e.g.
>>    lock *(u32 *)(r1 + 0) += r2
>>
>> If __sync_fetch_and_add(...) returns a value like
>>    res = __sync_fetch_and_add(&foo, 1);
>> llvm BPF backend generates like
>>    r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
>>
>> But 'lock *(u32 *)(r1 + 0) += r2' caused a problem in jit
>> since proper barrier is not inserted based on __sync_fetch_and_add() semantics.
>>
>> The above discrepancy is due to commit [2] where it tries to maintain backward
>> compatability since before commit [2], __sync_fetch_and_add(...) generates
>> lock insn in BPF backend.
>>
>> Based on discussion in [1], now it is time to fix the above discrepancy so we can
>> have proper barrier support in jit. llvm patch [3] made sure that __sync_fetch_and_add(...)
>> always generates atomic_fetch_add(...) insns. Now 'lock *(u32 *)(r1 + 0) += r2' can only
>> be generated by inline asm. The same for __sync_fetch_and_and(), __sync_fetch_and_or()
>> and __sync_fetch_and_xor().
>>
>> But the change in [3] caused arena_atomics selftest failure.
>>
>>    test_arena_atomics:PASS:arena atomics skeleton open 0 nsec
>>    libbpf: prog 'and': BPF program load failed: Permission denied
>>    libbpf: prog 'and': -- BEGIN PROG LOAD LOG --
>>    arg#0 reference type('UNKNOWN ') size cannot be determined: -22
>>    0: R1=ctx() R10=fp0
>>    ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87
>>    0: (18) r1 = 0xffffc90000064000       ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4)
>>    2: (61) r6 = *(u32 *)(r1 +0)          ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>    3: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
>>    4: (77) r0 >>= 32                     ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>    5: (5d) if r0 != r6 goto pc+11        ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x)
>>    ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91
>>    6: (18) r1 = 0x100000000060           ; R1_w=scalar()
>>    8: (bf) r1 = addr_space_cast(r1, 0, 1)        ; R1_w=arena
>>    9: (18) r2 = 0x1100000000             ; R2_w=0x1100000000
>>    11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2)
>>    BPF_ATOMIC stores into R1 arena is not allowed
>>    processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>>    -- END PROG LOAD LOG --
>>    libbpf: prog 'and': failed to load: -13
>>    libbpf: failed to load object 'arena_atomics'
>>    libbpf: failed to load BPF skeleton 'arena_atomics': -13
>>    test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13)
>>    #3       arena_atomics:FAIL
>>
>> The reason of the failure is due to [4] where atomic{64,}_fetch_{and,or,xor}() are not
>> allowed by arena addresses. Without llvm patch [3], the compiler will generate 'lock ...'
>> insn and everything will work fine.
>>
>> This patch fixed the problem by using inline asms. Instead of __sync_fetch_and_{and,or,xor}() functions,
>> the inline asm with 'lock' insn is used and it will work with or without [3].
>> Note that three bpf programs ('and', 'or' and 'xor') are guarded with __BPF_FEATURE_ADDR_SPACE_CAST
>> as well to ensure compilation failure for llvm <= 18 version. Note that for llvm <= 18 where
>> addr_space_cast is not supported, all arena_atomics subtests are skipped with below message:
>>    test_arena_atomics:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang
>>    #3 arena_atomics:SKIP
>>
>>    [1] https://lore.kernel.org/bpf/ZqqiQQWRnz7H93Hc@google.com/T/#mb68d67bc8f39e35a0c3db52468b9de59b79f021f
>>    [2] https://github.com/llvm/llvm-project/commit/286daafd65129228e08a1d07aa4ca74488615744
>>    [3] https://github.com/llvm/llvm-project/pull/101428
>>    [4] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT")
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   .../selftests/bpf/progs/arena_atomics.c       | 63 ++++++++++++++++---
>>   1 file changed, 54 insertions(+), 9 deletions(-)
>>
>> Changelog:
>>    v1 -> v2:
>>      - Add __BPF_FEATURE_ADDR_SPACE_CAST to guard newly added asm codes for llvm >= 19
>>
>> diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
>> index bb0acd79d28a..dea54557fc00 100644
>> --- a/tools/testing/selftests/bpf/progs/arena_atomics.c
>> +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
>> @@ -5,6 +5,7 @@
>>   #include <bpf/bpf_tracing.h>
>>   #include <stdbool.h>
>>   #include "bpf_arena_common.h"
>> +#include "bpf_misc.h"
>>
>>   struct {
>>          __uint(type, BPF_MAP_TYPE_ARENA);
>> @@ -85,10 +86,24 @@ int and(const void *ctx)
>>   {
>>          if (pid != (bpf_get_current_pid_tgid() >> 32))
>>                  return 0;
>> -#ifdef ENABLE_ATOMICS_TESTS
>> +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
>>
>> -       __sync_fetch_and_and(&and64_value, 0x011ull << 32);
>> -       __sync_fetch_and_and(&and32_value, 0x011);
>> +       asm volatile(
>> +               "r1 = addr_space_cast(%[and64_value], 0, 1);"
>> +               "lock *(u64 *)(r1 + 0) &= %[val]"
>> +               :
>> +               : __imm_ptr(and64_value),
>> +                 [val]"r"(0x011ull << 32)
>> +               : "r1"
>> +       );
>> +       asm volatile(
>> +               "r1 = addr_space_cast(%[and32_value], 0, 1);"
>> +               "lock *(u32 *)(r1 + 0) &= %[val]"
>> +               :
>> +               : __imm_ptr(and32_value),
>> +                 [val]"w"(0x011)
>> +               : "r1"
>> +       );
> Instead of inline asm there is a better way to do the same in C.
> https://godbolt.org/z/71PYx1oqE
>
> void foo(int a, _Atomic int *b)
> {
>   *b += a;
> }
>
> generates:
> lock *(u32 *)(r2 + 0) += r1

If you use the latest llvm-project with
https://github.com/llvm/llvm-project/pull/101428
included, the above code generates the following:

$ clang --target=bpf -O2 -c t.c && llvm-objdump -d t.o
t.o:    file format elf64-bpf
Disassembly of section .text:
0000000000000000 <foo>:
        0:       c3 12 00 00 01 00 00 00 r1 = atomic_fetch_add((u32 *)(r2 + 0x0), r1)
        1:       95 00 00 00 00 00 00 00 exit


With -mcpu=v3 the same code can be generated.

>
> but
> *b &= a;
>
> crashes llvm :( with
>
> <source>:3:5: error: unsupported atomic operation, please use 64 bit version
>      3 |  *b &= a;

It failed with the following llvm error message:
t.c:1:6: error: unsupported atomic operation, please use 64 bit version
     1 | void foo(int a, _Atomic int *b)
       |      ^
fatal error: error in backend: Cannot select: t8: i64,ch = AtomicLoadAnd<(load store seq_cst (s32) on %ir.b)> t0, t4, t2
   t4: i64,ch = CopyFromReg t0, Register:i64 %1
     t3: i64 = Register %1
   t2: i64,ch = CopyFromReg t0, Register:i64 %0
     t1: i64 = Register %0
In function: foo

>
> but works with -mcpu=v3

Yes, it does work with -mcpu=v3:

$ clang --target=bpf -O2 -mcpu=v3 -c t.c && llvm-objdump -d --mcpu=v3 t.o

t.o:    file format elf64-bpf
Disassembly of section .text:
0000000000000000 <foo>:
        0:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
        1:       95 00 00 00 00 00 00 00 exit

NOTE: I need -mcpu=v3 for llvm-objdump to print the 'atomic_fetch_and' asm properly.
I will double check this.

For code:
void foo(int a, _Atomic int *b)
{
  *b &= a;
}

The initial IR generated by clang frontend is:

define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
entry:
   %a.addr = alloca i32, align 4
   %b.addr = alloca ptr, align 8
   store i32 %a, ptr %a.addr, align 4, !tbaa !3
   store ptr %b, ptr %b.addr, align 8, !tbaa !7
   %0 = load i32, ptr %a.addr, align 4, !tbaa !3
   %1 = load ptr, ptr %b.addr, align 8, !tbaa !7
   %2 = atomicrmw and ptr %1, i32 %0 seq_cst, align 4
   %3 = and i32 %2, %0
   ret void
}

Note the atomicrmw in the above. Eventually it is optimized to

define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
entry:
   %0 = atomicrmw and ptr %b, i32 %a seq_cst, align 4
   ret void
}

The 'atomicrmw' is the same IR as generated by
__sync_fetch_and_*() and will eventually generate an atomic_fetch_*() bpf
insn.
I discussed this with Andrii, and
another option is to specify relaxed consistency, so the llvm
internals could translate it into a locked insn. For example,

$ cat t1.c
#include <stdatomic.h>

void f(_Atomic int *i) {
   __c11_atomic_fetch_and(i, 1, memory_order_relaxed);
}

# gnu/stubs-32.h is needed in the current directory to make this compile
[yhs@devvm1513.prn0 ~/tmp6]$ ls gnu
stubs-32.h
[yhs@devvm1513.prn0 ~/tmp6]$ clang --target=bpf -O2 -I. -c -mcpu=v3 t1.c

The initial IR:
define dso_local void @f(ptr noundef %i) #0 {
entry:
   %i.addr = alloca ptr, align 8
   %.atomictmp = alloca i32, align 4
   %atomic-temp = alloca i32, align 4
   store ptr %i, ptr %i.addr, align 8, !tbaa !3
   %0 = load ptr, ptr %i.addr, align 8, !tbaa !3
   store i32 1, ptr %.atomictmp, align 4, !tbaa !7
   %1 = load i32, ptr %.atomictmp, align 4
   %2 = atomicrmw and ptr %0, i32 %1 monotonic, align 4
   store i32 %2, ptr %atomic-temp, align 4
   %3 = load i32, ptr %atomic-temp, align 4, !tbaa !7
   ret void
}

The IR right before machine code generation:

define dso_local void @f(ptr nocapture noundef %i) local_unnamed_addr #0 {
entry:
   %0 = atomicrmw and ptr %i, i32 1 monotonic, align 4
   ret void
}

Maybe we could special-case the above to generate
a locked insn if
   - the operator is atomicrmw,
   - the consistency is monotonic (or similar), and
   - the return value is not used.

So this will not violate the original program semantics.
Does this sound like a reasonable approach?

>
> So let's make this test mcpu=v3 only and use normal C ?
>
> pw-bot: cr
Jose E. Marchesi Aug. 6, 2024, 4:48 a.m. UTC | #3
> On 8/5/24 10:53 AM, Alexei Starovoitov wrote:
>> On Fri, Aug 2, 2024 at 7:59 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>> Peilen Ye reported an issue ([1]) where for __sync_fetch_and_add(...) without
>>> return value like
>>>    __sync_fetch_and_add(&foo, 1);
>>> llvm BPF backend generates locked insn e.g.
>>>    lock *(u32 *)(r1 + 0) += r2
>>>
>>> If __sync_fetch_and_add(...) returns a value like
>>>    res = __sync_fetch_and_add(&foo, 1);
>>> llvm BPF backend generates like
>>>    r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
>>>
>>> But 'lock *(u32 *)(r1 + 0) += r2' caused a problem in jit
>>> since proper barrier is not inserted based on __sync_fetch_and_add() semantics.
>>>
>>> The above discrepancy is due to commit [2] where it tries to maintain backward
>>> compatability since before commit [2], __sync_fetch_and_add(...) generates
>>> lock insn in BPF backend.
>>>
>>> Based on discussion in [1], now it is time to fix the above discrepancy so we can
>>> have proper barrier support in jit. llvm patch [3] made sure that __sync_fetch_and_add(...)
>>> always generates atomic_fetch_add(...) insns. Now 'lock *(u32 *)(r1 + 0) += r2' can only
>>> be generated by inline asm. The same for __sync_fetch_and_and(), __sync_fetch_and_or()
>>> and __sync_fetch_and_xor().
>>>
>>> But the change in [3] caused arena_atomics selftest failure.
>>>
>>>    test_arena_atomics:PASS:arena atomics skeleton open 0 nsec
>>>    libbpf: prog 'and': BPF program load failed: Permission denied
>>>    libbpf: prog 'and': -- BEGIN PROG LOAD LOG --
>>>    arg#0 reference type('UNKNOWN ') size cannot be determined: -22
>>>    0: R1=ctx() R10=fp0
>>>    ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87
>>>    0: (18) r1 = 0xffffc90000064000       ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4)
>>>    2: (61) r6 = *(u32 *)(r1 +0)          ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>>    3: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
>>>    4: (77) r0 >>= 32                     ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>>    5: (5d) if r0 != r6 goto pc+11        ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x)
>>>    ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91
>>>    6: (18) r1 = 0x100000000060           ; R1_w=scalar()
>>>    8: (bf) r1 = addr_space_cast(r1, 0, 1)        ; R1_w=arena
>>>    9: (18) r2 = 0x1100000000             ; R2_w=0x1100000000
>>>    11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2)
>>>    BPF_ATOMIC stores into R1 arena is not allowed
>>>    processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>>>    -- END PROG LOAD LOG --
>>>    libbpf: prog 'and': failed to load: -13
>>>    libbpf: failed to load object 'arena_atomics'
>>>    libbpf: failed to load BPF skeleton 'arena_atomics': -13
>>>    test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13)
>>>    #3       arena_atomics:FAIL
>>>
>>> The reason of the failure is due to [4] where atomic{64,}_fetch_{and,or,xor}() are not
>>> allowed by arena addresses. Without llvm patch [3], the compiler will generate 'lock ...'
>>> insn and everything will work fine.
>>>
>>> This patch fixed the problem by using inline asms. Instead of __sync_fetch_and_{and,or,xor}() functions,
>>> the inline asm with 'lock' insn is used and it will work with or without [3].
>>> Note that three bpf programs ('and', 'or' and 'xor') are guarded with __BPF_FEATURE_ADDR_SPACE_CAST
>>> as well to ensure compilation failure for llvm <= 18 version. Note that for llvm <= 18 where
>>> addr_space_cast is not supported, all arena_atomics subtests are skipped with below message:
>>>    test_arena_atomics:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang
>>>    #3 arena_atomics:SKIP
>>>
>>>    [1] https://lore.kernel.org/bpf/ZqqiQQWRnz7H93Hc@google.com/T/#mb68d67bc8f39e35a0c3db52468b9de59b79f021f
>>>    [2] https://github.com/llvm/llvm-project/commit/286daafd65129228e08a1d07aa4ca74488615744
>>>    [3] https://github.com/llvm/llvm-project/pull/101428
>>>    [4] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT")
>>>
>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>> ---
>>>   .../selftests/bpf/progs/arena_atomics.c       | 63 ++++++++++++++++---
>>>   1 file changed, 54 insertions(+), 9 deletions(-)
>>>
>>> Changelog:
>>>    v1 -> v2:
>>>      - Add __BPF_FEATURE_ADDR_SPACE_CAST to guard newly added asm codes for llvm >= 19
>>>
>>> diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
>>> index bb0acd79d28a..dea54557fc00 100644
>>> --- a/tools/testing/selftests/bpf/progs/arena_atomics.c
>>> +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
>>> @@ -5,6 +5,7 @@
>>>   #include <bpf/bpf_tracing.h>
>>>   #include <stdbool.h>
>>>   #include "bpf_arena_common.h"
>>> +#include "bpf_misc.h"
>>>
>>>   struct {
>>>          __uint(type, BPF_MAP_TYPE_ARENA);
>>> @@ -85,10 +86,24 @@ int and(const void *ctx)
>>>   {
>>>          if (pid != (bpf_get_current_pid_tgid() >> 32))
>>>                  return 0;
>>> -#ifdef ENABLE_ATOMICS_TESTS
>>> +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
>>>
>>> -       __sync_fetch_and_and(&and64_value, 0x011ull << 32);
>>> -       __sync_fetch_and_and(&and32_value, 0x011);
>>> +       asm volatile(
>>> +               "r1 = addr_space_cast(%[and64_value], 0, 1);"
>>> +               "lock *(u64 *)(r1 + 0) &= %[val]"
>>> +               :
>>> +               : __imm_ptr(and64_value),
>>> +                 [val]"r"(0x011ull << 32)
>>> +               : "r1"
>>> +       );
>>> +       asm volatile(
>>> +               "r1 = addr_space_cast(%[and32_value], 0, 1);"
>>> +               "lock *(u32 *)(r1 + 0) &= %[val]"
>>> +               :
>>> +               : __imm_ptr(and32_value),
>>> +                 [val]"w"(0x011)
>>> +               : "r1"
>>> +       );
>> Instead of inline asm there is a better way to do the same in C.
>> https://godbolt.org/z/71PYx1oqE
>>
>> void foo(int a, _Atomic int *b)
>> {
>>   *b += a;
>> }
>>
>> generates:
>> lock *(u32 *)(r2 + 0) += r1
>
> If you use latest llvm-project with
> https://github.com/llvm/llvm-project/pull/101428
> included, the above code will generate like
>
> $ clang --target=bpf -O2 -c t.c && llvm-objdump -d t.o
> t.o:    file format elf64-bpf
> Disassembly of section .text:
> 0000000000000000 <foo>:
>        0:       c3 12 00 00 01 00 00 00 r1 = atomic_fetch_add((u32 *)(r2 + 0x0), r1)
>        1:       95 00 00 00 00 00 00 00 exit
>
>
> With -mcpu=v3 the same code can be generated.

Same for current GCC.

>>
>> but
>> *b &= a;
>>
>> crashes llvm :( with
>>
>> <source>:3:5: error: unsupported atomic operation, please use 64 bit version
>>      3 |  *b &= a;
>
> It failed with the following llvm error message:
> t.c:1:6: error: unsupported atomic operation, please use 64 bit version
>     1 | void foo(int a, _Atomic int *b)
>       |      ^
> fatal error: error in backend: Cannot select: t8: i64,ch = AtomicLoadAnd<(load store seq_cst (s32) on %ir.b)> t0, t4, t2
>   t4: i64,ch = CopyFromReg t0, Register:i64 %1
>     t3: i64 = Register %1
>   t2: i64,ch = CopyFromReg t0, Register:i64 %0
>     t1: i64 = Register %0
> In function: foo
>
>>
>> but works with -mcpu=v3
>
> Yes. it does work with -mcpu=v3:

In GCC, if you specify -mcpu=v2 and use atomic built-ins, you get
workaround code in the form of libcalls (like calls to
__atomic_fetch_and_4), which are of course of no use in BPF atm.

> $ clang --target=bpf -O2 -mcpu=v3 -c t.c && llvm-objdump -d --mcpu=v3 t.o
>
> t.o:    file format elf64-bpf
> Disassembly of section .text:
> 0000000000000000 <foo>:
>        0:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
>        1:       95 00 00 00 00 00 00 00 exit
>
> NOTE: I need -mcpu=v3 for llvm-objdump to print asm code 'atomic_fetch_and' properly.
> Will double check this.

The GNU binutils objdump will try to recognize instructions in the
latest supported cpu version, unless an explicit option is passed to
specify a particular ISA version:

$ bpf-unknown-none-objdump -M pseudoc -d foo.o

foo.o:     file format elf64-bpfle


Disassembly of section .text:

0000000000000000 <foo>:
   0:	bf 11 20 00 00 00 00 00 	r1 = (s32) r1
   8:	c3 12 00 00 51 00 00 00 	w1=atomic_fetch_and((u32*)(r2+0),w1)
  10:	95 00 00 00 00 00 00 00 	exit

$ bpf-unknown-none-objdump -M v2,pseudoc -d foo.o

foo.o:     file format elf64-bpfle


Disassembly of section .text:

0000000000000000 <foo>:
   0:	bf 11 20 00 00 00 00 00 	r1=r1
   8:	c3 12 00 00 51 00 00 00 	<unknown>
  10:	95 00 00 00 00 00 00 00 	exit

> For code:
> void foo(int a, _Atomic int *b)
> {
>  *b &= a;
> }
>
> The initial IR generated by clang frontend is:
>
> define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
> entry:
>   %a.addr = alloca i32, align 4
>   %b.addr = alloca ptr, align 8
>   store i32 %a, ptr %a.addr, align 4, !tbaa !3
>   store ptr %b, ptr %b.addr, align 8, !tbaa !7
>   %0 = load i32, ptr %a.addr, align 4, !tbaa !3
>   %1 = load ptr, ptr %b.addr, align 8, !tbaa !7
>   %2 = atomicrmw and ptr %1, i32 %0 seq_cst, align 4
>   %3 = and i32 %2, %0
>   ret void
> }
>
> Note that atomicrmw in the above. Eventually it optimized to
>
> define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
> entry:
>   %0 = atomicrmw and ptr %b, i32 %a seq_cst, align 4
>   ret void
> }
>
> The 'atomicrmw' is the same IR as generated by
> __sync_fetch_and_*() and eventually will generate atomic_fetch_*() bpf
> insn.
> Discussed with Andrii, and
> another option is to specify relaxed consistency, so llvm
> internal could translate it into locked insn. For example,
>
> $ cat t1.c
> #include <stdatomic.h>
>
> void f(_Atomic int *i) {
>   __c11_atomic_fetch_and(i, 1, memory_order_relaxed);
> }

I think this makes sense.  Currently we removed the GCC insns
atomic_{add,or,xor,and} altogether, so the compiler is forced to
implement them in terms of atomic_fetch_and_{add,or,xor,and} regardless
of the memory ordering policy specified to the __sync_fetch_and_OP.  So
even if you specify relaxed memory ordering, the resulting ordering is
whatever is implied by the fetching operation.
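
For illustration, a hypothetical snippet (not from the thread) showing the point
above: even with an explicitly relaxed ordering, the fetch form is what the
compilers described above currently emit for BPF:

#include <stdatomic.h>

void or_relaxed(_Atomic unsigned long *p)
{
        /* ordering is relaxed and the result is unused, yet this is still
         * lowered via atomicrmw and ends up as an atomic_fetch_or() insn */
        atomic_fetch_or_explicit(p, 0x11, memory_order_relaxed);
}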

> # to have gnu/stubs-32.h in the current directory to make it compile
> [yhs@devvm1513.prn0 ~/tmp6]$ ls gnu
> stubs-32.h
> [yhs@devvm1513.prn0 ~/tmp6]$ clang --target=bpf -O2 -I. -c -mcpu=v3 t1.c
>
> The initial IR:
> define dso_local void @f(ptr noundef %i) #0 {
> entry:
>   %i.addr = alloca ptr, align 8
>   %.atomictmp = alloca i32, align 4
>   %atomic-temp = alloca i32, align 4
>   store ptr %i, ptr %i.addr, align 8, !tbaa !3
>   %0 = load ptr, ptr %i.addr, align 8, !tbaa !3
>   store i32 1, ptr %.atomictmp, align 4, !tbaa !7
>   %1 = load i32, ptr %.atomictmp, align 4
>   %2 = atomicrmw and ptr %0, i32 %1 monotonic, align 4
>   store i32 %2, ptr %atomic-temp, align 4
>   %3 = load i32, ptr %atomic-temp, align 4, !tbaa !7
>   ret void
> }
>
> The IR right before machine code generation:
>
> define dso_local void @f(ptr nocapture noundef %i) local_unnamed_addr #0 {
> entry:
>   %0 = atomicrmw and ptr %i, i32 1 monotonic, align 4
>   ret void
> }
>
> Maybe we could special process the above to generate
> a locked insn if
>   - atomicrmw operator
>   - monotonic (related) consistency
>   - return value is not used
>
> So this will not violate original program semantics.
> Does this sound a reasonable apporach?

Whether monotonic consistency is desired (ordered writes) can
probably be deduced from the memory_order_* flag of the built-ins, but I
don't know what atomicrmw is...  what is it in non-llvm terms?

>>
>> So let's make this test mcpu=v3 only and use normal C ?
>>
>> pw-bot: cr
Yonghong Song Aug. 6, 2024, 5:26 a.m. UTC | #4
On 8/5/24 9:48 PM, Jose E. Marchesi wrote:
>> On 8/5/24 10:53 AM, Alexei Starovoitov wrote:
>>> On Fri, Aug 2, 2024 at 7:59 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> Peilen Ye reported an issue ([1]) where for __sync_fetch_and_add(...) without
>>>> return value like
>>>>     __sync_fetch_and_add(&foo, 1);
>>>> llvm BPF backend generates locked insn e.g.
>>>>     lock *(u32 *)(r1 + 0) += r2
>>>>
>>>> If __sync_fetch_and_add(...) returns a value like
>>>>     res = __sync_fetch_and_add(&foo, 1);
>>>> llvm BPF backend generates like
>>>>     r2 = atomic_fetch_add((u32 *)(r1 + 0), r2)
>>>>
>>>> But 'lock *(u32 *)(r1 + 0) += r2' caused a problem in jit
>>>> since proper barrier is not inserted based on __sync_fetch_and_add() semantics.
>>>>
>>>> The above discrepancy is due to commit [2] where it tries to maintain backward
>>>> compatability since before commit [2], __sync_fetch_and_add(...) generates
>>>> lock insn in BPF backend.
>>>>
>>>> Based on discussion in [1], now it is time to fix the above discrepancy so we can
>>>> have proper barrier support in jit. llvm patch [3] made sure that __sync_fetch_and_add(...)
>>>> always generates atomic_fetch_add(...) insns. Now 'lock *(u32 *)(r1 + 0) += r2' can only
>>>> be generated by inline asm. The same for __sync_fetch_and_and(), __sync_fetch_and_or()
>>>> and __sync_fetch_and_xor().
>>>>
>>>> But the change in [3] caused arena_atomics selftest failure.
>>>>
>>>>     test_arena_atomics:PASS:arena atomics skeleton open 0 nsec
>>>>     libbpf: prog 'and': BPF program load failed: Permission denied
>>>>     libbpf: prog 'and': -- BEGIN PROG LOAD LOG --
>>>>     arg#0 reference type('UNKNOWN ') size cannot be determined: -22
>>>>     0: R1=ctx() R10=fp0
>>>>     ; if (pid != (bpf_get_current_pid_tgid() >> 32)) @ arena_atomics.c:87
>>>>     0: (18) r1 = 0xffffc90000064000       ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4)
>>>>     2: (61) r6 = *(u32 *)(r1 +0)          ; R1_w=map_value(map=arena_at.bss,ks=4,vs=4) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>>>     3: (85) call bpf_get_current_pid_tgid#14      ; R0_w=scalar()
>>>>     4: (77) r0 >>= 32                     ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff))
>>>>     5: (5d) if r0 != r6 goto pc+11        ; R0_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0xffffffff)) R6_w=scalar(smin=0,smax=umax=0xffffffff,var_off=(0x0; 0x)
>>>>     ; __sync_fetch_and_and(&and64_value, 0x011ull << 32); @ arena_atomics.c:91
>>>>     6: (18) r1 = 0x100000000060           ; R1_w=scalar()
>>>>     8: (bf) r1 = addr_space_cast(r1, 0, 1)        ; R1_w=arena
>>>>     9: (18) r2 = 0x1100000000             ; R2_w=0x1100000000
>>>>     11: (db) r2 = atomic64_fetch_and((u64 *)(r1 +0), r2)
>>>>     BPF_ATOMIC stores into R1 arena is not allowed
>>>>     processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>>>>     -- END PROG LOAD LOG --
>>>>     libbpf: prog 'and': failed to load: -13
>>>>     libbpf: failed to load object 'arena_atomics'
>>>>     libbpf: failed to load BPF skeleton 'arena_atomics': -13
>>>>     test_arena_atomics:FAIL:arena atomics skeleton load unexpected error: -13 (errno 13)
>>>>     #3       arena_atomics:FAIL
>>>>
>>>> The reason of the failure is due to [4] where atomic{64,}_fetch_{and,or,xor}() are not
>>>> allowed by arena addresses. Without llvm patch [3], the compiler will generate 'lock ...'
>>>> insn and everything will work fine.
>>>>
>>>> This patch fixed the problem by using inline asms. Instead of __sync_fetch_and_{and,or,xor}() functions,
>>>> the inline asm with 'lock' insn is used and it will work with or without [3].
>>>> Note that three bpf programs ('and', 'or' and 'xor') are guarded with __BPF_FEATURE_ADDR_SPACE_CAST
>>>> as well to ensure compilation failure for llvm <= 18 version. Note that for llvm <= 18 where
>>>> addr_space_cast is not supported, all arena_atomics subtests are skipped with below message:
>>>>     test_arena_atomics:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang
>>>>     #3 arena_atomics:SKIP
>>>>
>>>>     [1] https://lore.kernel.org/bpf/ZqqiQQWRnz7H93Hc@google.com/T/#mb68d67bc8f39e35a0c3db52468b9de59b79f021f
>>>>     [2] https://github.com/llvm/llvm-project/commit/286daafd65129228e08a1d07aa4ca74488615744
>>>>     [3] https://github.com/llvm/llvm-project/pull/101428
>>>>     [4] d503a04f8bc0 ("bpf: Add support for certain atomics in bpf_arena to x86 JIT")
>>>>
>>>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>>>> ---
>>>>    .../selftests/bpf/progs/arena_atomics.c       | 63 ++++++++++++++++---
>>>>    1 file changed, 54 insertions(+), 9 deletions(-)
>>>>
>>>> Changelog:
>>>>     v1 -> v2:
>>>>       - Add __BPF_FEATURE_ADDR_SPACE_CAST to guard newly added asm codes for llvm >= 19
>>>>
>>>> diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
>>>> index bb0acd79d28a..dea54557fc00 100644
>>>> --- a/tools/testing/selftests/bpf/progs/arena_atomics.c
>>>> +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
>>>> @@ -5,6 +5,7 @@
>>>>    #include <bpf/bpf_tracing.h>
>>>>    #include <stdbool.h>
>>>>    #include "bpf_arena_common.h"
>>>> +#include "bpf_misc.h"
>>>>
>>>>    struct {
>>>>           __uint(type, BPF_MAP_TYPE_ARENA);
>>>> @@ -85,10 +86,24 @@ int and(const void *ctx)
>>>>    {
>>>>           if (pid != (bpf_get_current_pid_tgid() >> 32))
>>>>                   return 0;
>>>> -#ifdef ENABLE_ATOMICS_TESTS
>>>> +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
>>>>
>>>> -       __sync_fetch_and_and(&and64_value, 0x011ull << 32);
>>>> -       __sync_fetch_and_and(&and32_value, 0x011);
>>>> +       asm volatile(
>>>> +               "r1 = addr_space_cast(%[and64_value], 0, 1);"
>>>> +               "lock *(u64 *)(r1 + 0) &= %[val]"
>>>> +               :
>>>> +               : __imm_ptr(and64_value),
>>>> +                 [val]"r"(0x011ull << 32)
>>>> +               : "r1"
>>>> +       );
>>>> +       asm volatile(
>>>> +               "r1 = addr_space_cast(%[and32_value], 0, 1);"
>>>> +               "lock *(u32 *)(r1 + 0) &= %[val]"
>>>> +               :
>>>> +               : __imm_ptr(and32_value),
>>>> +                 [val]"w"(0x011)
>>>> +               : "r1"
>>>> +       );
>>> Instead of inline asm there is a better way to do the same in C.
>>> https://godbolt.org/z/71PYx1oqE
>>>
>>> void foo(int a, _Atomic int *b)
>>> {
>>>    *b += a;
>>> }
>>>
>>> generates:
>>> lock *(u32 *)(r2 + 0) += r1
>> If you use latest llvm-project with
>> https://github.com/llvm/llvm-project/pull/101428
>> included, the above code will generate like
>>
>> $ clang --target=bpf -O2 -c t.c && llvm-objdump -d t.o
>> t.o:    file format elf64-bpf
>> Disassembly of section .text:
>> 0000000000000000 <foo>:
>>         0:       c3 12 00 00 01 00 00 00 r1 = atomic_fetch_add((u32 *)(r2 + 0x0), r1)
>>         1:       95 00 00 00 00 00 00 00 exit
>>
>>
>> With -mcpu=v3 the same code can be generated.
> Same for current GCC.
>
>>> but
>>> *b &= a;
>>>
>>> crashes llvm :( with
>>>
>>> <source>:3:5: error: unsupported atomic operation, please use 64 bit version
>>>       3 |  *b &= a;
>> It failed with the following llvm error message:
>> t.c:1:6: error: unsupported atomic operation, please use 64 bit version
>>      1 | void foo(int a, _Atomic int *b)
>>        |      ^
>> fatal error: error in backend: Cannot select: t8: i64,ch = AtomicLoadAnd<(load store seq_cst (s32) on %ir.b)> t0, t4, t2
>>    t4: i64,ch = CopyFromReg t0, Register:i64 %1
>>      t3: i64 = Register %1
>>    t2: i64,ch = CopyFromReg t0, Register:i64 %0
>>      t1: i64 = Register %0
>> In function: foo
>>
>>> but works with -mcpu=v3
>> Yes. it does work with -mcpu=v3:
> In GCC if you specify -mcpu=v2 and use atomic built-ins you get
> workaround code in the form of libcalls (like calls to
> __atomic_fetch_and_4) which are of course of no use in BPF atm.
>> $ clang --target=bpf -O2 -mcpu=v3 -c t.c && llvm-objdump -d --mcpu=v3 t.o
>>
>> t.o:    file format elf64-bpf
>> Disassembly of section .text:
>> 0000000000000000 <foo>:
>>         0:       c3 12 00 00 51 00 00 00 w1 = atomic_fetch_and((u32 *)(r2 + 0x0), w1)
>>         1:       95 00 00 00 00 00 00 00 exit
>>
>> NOTE: I need -mcpu=v3 for llvm-objdump to print asm code 'atomic_fetch_and' properly.
>> Will double check this.
> The GNU binutils objdump will try to recognize instructions in the
> latest supported cpu version, unless an explicit option is passed to
> specify a particular ISA version:

Agreed. As I mentioned above, I will take a look at this soon.

>
> $ bpf-unknown-none-objdump -M pseudoc -d foo.o
>
> foo.o:     file format elf64-bpfle
>
>
> Disassembly of section .text:
>
> 0000000000000000 <foo>:
>     0:	bf 11 20 00 00 00 00 00 	r1 = (s32) r1
>     8:	c3 12 00 00 51 00 00 00 	w1=atomic_fetch_and((u32*)(r2+0),w1)
>    10:	95 00 00 00 00 00 00 00 	exit
>
> $ bpf-unknown-none-objdump -M v2,pseudoc -d foo.o
>
> foo.o:     file format elf64-bpfle
>
>
> Disassembly of section .text:
>
> 0000000000000000 <foo>:
>     0:	bf 11 20 00 00 00 00 00 	r1=r1
>     8:	c3 12 00 00 51 00 00 00 	<unknown>
>    10:	95 00 00 00 00 00 00 00 	exit
>
>> For code:
>> void foo(int a, _Atomic int *b)
>> {
>>   *b &= a;
>> }
>>
>> The initial IR generated by clang frontend is:
>>
>> define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
>> entry:
>>    %a.addr = alloca i32, align 4
>>    %b.addr = alloca ptr, align 8
>>    store i32 %a, ptr %a.addr, align 4, !tbaa !3
>>    store ptr %b, ptr %b.addr, align 8, !tbaa !7
>>    %0 = load i32, ptr %a.addr, align 4, !tbaa !3
>>    %1 = load ptr, ptr %b.addr, align 8, !tbaa !7
>>    %2 = atomicrmw and ptr %1, i32 %0 seq_cst, align 4
>>    %3 = and i32 %2, %0
>>    ret void
>> }
>>
>> Note that atomicrmw in the above. Eventually it optimized to
>>
>> define dso_local void @foo(i32 noundef %a, ptr noundef %b) #0 {
>> entry:
>>    %0 = atomicrmw and ptr %b, i32 %a seq_cst, align 4
>>    ret void
>> }
>>
>> The 'atomicrmw' is the same IR as generated by
>> __sync_fetch_and_*() and eventually will generate atomic_fetch_*() bpf
>> insn.
>> Discussed with Andrii, and
>> another option is to specify relaxed consistency, so llvm
>> internal could translate it into locked insn. For example,
>>
>> $ cat t1.c
>> #include <stdatomic.h>
>>
>> void f(_Atomic int *i) {
>>    __c11_atomic_fetch_and(i, 1, memory_order_relaxed);
>> }
> I think this makes sense.  Currently we removed the GCC insns
> atomic_{add,or,xor,and} all together so the compiler is forced to
> implement them in terms of atomic_fetch_and_{add,or,xor,and} regardless
> of the memory ordering policy specified to the __sync_fetch_and_OP.  So
> even if you specify relaxed memory ordered, the resulting ordering is
> whatever implied by the fetching operation.

Encoding memory order constraints in BPF insns might be too complicated,
I am not sure. But at least we could map different memory order constraints
to different insns so the jit can add the proper barrier for those insns.

>
>> # to have gnu/stubs-32.h in the current directory to make it compile
>> [yhs@devvm1513.prn0 ~/tmp6]$ ls gnu
>> stubs-32.h
>> [yhs@devvm1513.prn0 ~/tmp6]$ clang --target=bpf -O2 -I. -c -mcpu=v3 t1.c
>>
>> The initial IR:
>> define dso_local void @f(ptr noundef %i) #0 {
>> entry:
>>    %i.addr = alloca ptr, align 8
>>    %.atomictmp = alloca i32, align 4
>>    %atomic-temp = alloca i32, align 4
>>    store ptr %i, ptr %i.addr, align 8, !tbaa !3
>>    %0 = load ptr, ptr %i.addr, align 8, !tbaa !3
>>    store i32 1, ptr %.atomictmp, align 4, !tbaa !7
>>    %1 = load i32, ptr %.atomictmp, align 4
>>    %2 = atomicrmw and ptr %0, i32 %1 monotonic, align 4
>>    store i32 %2, ptr %atomic-temp, align 4
>>    %3 = load i32, ptr %atomic-temp, align 4, !tbaa !7
>>    ret void
>> }
>>
>> The IR right before machine code generation:
>>
>> define dso_local void @f(ptr nocapture noundef %i) local_unnamed_addr #0 {
>> entry:
>>    %0 = atomicrmw and ptr %i, i32 1 monotonic, align 4
>>    ret void
>> }
>>
>> Maybe we could special process the above to generate
>> a locked insn if
>>    - atomicrmw operator
>>    - monotonic (related) consistency
>>    - return value is not used
>>
>> So this will not violate original program semantics.
>> Does this sound a reasonable apporach?
> Whether monotonic consistency is desired (ordered writes) can be
> probably deduced from the memory_order_* flag of the built-ins, but I
> don't know what atomiccrmw is...  what is it in non-llvm terms?

The llvm language reference for atomicrmw:

   https://llvm.org/docs/LangRef.html#atomicrmw-instruction

>
>>> So let's make this test mcpu=v3 only and use normal C ?
>>>
>>> pw-bot: cr
Alexei Starovoitov Aug. 8, 2024, 4:26 p.m. UTC | #5
On Mon, Aug 5, 2024 at 10:26 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> >>
> >> Maybe we could special process the above to generate
> >> a locked insn if
> >>    - atomicrmw operator
> >>    - monotonic (related) consistency
> >>    - return value is not used

This sounds like a good idea, but...

> >>
> >> So this will not violate original program semantics.
> >> Does this sound a reasonable apporach?
> > Whether monotonic consistency is desired (ordered writes) can be
> > probably deduced from the memory_order_* flag of the built-ins, but I
> > don't know what atomiccrmw is...  what is it in non-llvm terms?
>
> The llvm language reference for atomicrmw:
>
>    https://llvm.org/docs/LangRef.html#atomicrmw-instruction

I read it back and forth, but couldn't find whether it's ok
for the backend to use a stronger-ordering insn when a weaker ordering
is specified in atomicrmw.
It's probably ok.
Otherwise an atomicrmw with monotonic (memory_order_relaxed) ordering whose
return value is used could not be mapped to any bpf insn.
x86 doesn't have monotonic either, but I suspect the backend
still generates the code without any warnings.
It would be good to clarify this before we proceed with the above plan.
Yonghong Song Aug. 12, 2024, 8:42 p.m. UTC | #6
On 8/8/24 9:26 AM, Alexei Starovoitov wrote:
> On Mon, Aug 5, 2024 at 10:26 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>>> Maybe we could special process the above to generate
>>>> a locked insn if
>>>>     - atomicrmw operator
>>>>     - monotonic (related) consistency
>>>>     - return value is not used
> This sounds like a good idea, but...
>
>>>> So this will not violate original program semantics.
>>>> Does this sound a reasonable apporach?
>>> Whether monotonic consistency is desired (ordered writes) can be
>>> probably deduced from the memory_order_* flag of the built-ins, but I
>>> don't know what atomiccrmw is...  what is it in non-llvm terms?
>> The llvm language reference for atomicrmw:
>>
>>     https://llvm.org/docs/LangRef.html#atomicrmw-instruction
> I read it back and forth, but couldn't find whether it's ok
> for the backend to use stronger ordering insn when weaker ordering
> is specified in atomicrmw.
> It's probably ok.
> Otherwise atomicrmw with monotnic (memory_order_relaxed) and
> return value is used cannot be mapped to any bpf insn.
> x86 doesn't have monotonic either, but I suspect the backend
> still generates the code without any warnings.

I did a little experiment for x86:

$ cat t.c
int foo;
void bar1(void) {
         __sync_fetch_and_add(&foo, 10);
}
int bar2(void) {
         return __sync_fetch_and_add(&foo, 10);
}
$ clang -O2 -I. -c t.c && llvm-objdump -d t.o

t.o:    file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <bar1>:
        0: f0                            lock
        1: 83 05 00 00 00 00 0a          addl    $0xa, (%rip)            # 0x8 <bar1+0x8>
        8: c3                            retq
        9: 0f 1f 80 00 00 00 00          nopl    (%rax)
         
0000000000000010 <bar2>:
       10: b8 0a 00 00 00                movl    $0xa, %eax
       15: f0                            lock
       16: 0f c1 05 00 00 00 00          xaddl   %eax, (%rip)            # 0x1d <bar2+0xd>
       1d: c3                            retq

So without a return value, __sync_fetch_and_add generates a locked add insn.
With a return value, 'lock xaddl' is used, which should be similar to the bpf atomic_fetch_add insn.

Another example,

$ cat t1.c
#include <stdatomic.h>

void f1(_Atomic int *i) {
   __c11_atomic_fetch_and(i, 10, memory_order_relaxed);
}

void f2(_Atomic int *i) {
   __c11_atomic_fetch_and(i, 10, memory_order_seq_cst);
}

void f3(_Atomic int *i) {
   __c11_atomic_fetch_and(i, 10, memory_order_release);
}

_Atomic int f4(_Atomic int *i) {
   return __c11_atomic_fetch_and(i, 10, memory_order_release);
}
$ clang -O2 -I. -c t1.c && llvm-objdump -d t1.o

t1.o:   file format elf64-x86-64

Disassembly of section .text:

0000000000000000 <f1>:
        0: f0                            lock
        1: 83 27 0a                      andl    $0xa, (%rdi)
        4: c3                            retq
        5: 66 66 2e 0f 1f 84 00 00 00 00 00      nopw    %cs:(%rax,%rax)

0000000000000010 <f2>:
       10: f0                            lock
       11: 83 27 0a                      andl    $0xa, (%rdi)
       14: c3                            retq
       15: 66 66 2e 0f 1f 84 00 00 00 00 00      nopw    %cs:(%rax,%rax)

0000000000000020 <f3>:
       20: f0                            lock
       21: 83 27 0a                      andl    $0xa, (%rdi)
       24: c3                            retq
       25: 66 66 2e 0f 1f 84 00 00 00 00 00      nopw    %cs:(%rax,%rax)

0000000000000030 <f4>:
       30: 8b 07                         movl    (%rdi), %eax
       32: 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00     nopw    %cs:(%rax,%rax)
       40: 89 c1                         movl    %eax, %ecx
       42: 83 e1 0a                      andl    $0xa, %ecx
       45: f0                            lock
       46: 0f b1 0f                      cmpxchgl        %ecx, (%rdi)
       49: 75 f5                         jne     0x40 <f4+0x10>
       4b: c3                            retq
$

In all cases without a return value, for x86, a 'lock andl' insn is generated
regardless of the memory order constraint. This is similar to
what we had before.

The previous 'lock' encoding matches x86 nicely,
but on ARM64 the insn generated for the 'lock ...' bpf insn
won't have barrier (acquire or release or both)
semantics. So I guess that ARM64 wants to preserve the current behavior:
   - for the 'lock ...' insn, no barrier,
   - for the 'atomic_fetch_add ...' insn, a proper barrier.

For x86, both 'lock ...' and 'atomic_fetch_add ...' will have a proper barrier.

To keep 'lock ...' barrier-free where that is desired, the user needs to specify
the memory_order_relaxed constraint. This will permit the bpf backend
to generate the 'lock ...' insn.
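
A compact sketch of the user-visible contrast under that proposal (hypothetical,
not current compiler behavior; assumes -mcpu=v3 for the 32-bit form):

#include <stdatomic.h>

void and_relaxed(_Atomic int *p)
{
        /* relaxed ordering, result unused: a candidate for the no-barrier
         * 'lock *(u32 *)(r1 + 0) &= wN' form under the proposal */
        __c11_atomic_fetch_and(p, 0x11, memory_order_relaxed);
}

int and_seq_cst(_Atomic int *p)
{
        /* seq_cst (or result used): stays atomic_fetch_and() with a
         * proper barrier */
        return __c11_atomic_fetch_and(p, 0x11, memory_order_seq_cst);
}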

Looks like x86 does check memory_order_relaxed to do some
special processing:

X86CompressEVEX.cpp:  if (!TableChecked.load(std::memory_order_relaxed)) {
X86CompressEVEX.cpp:    TableChecked.store(true, std::memory_order_relaxed);
X86FloatingPoint.cpp:    if (!TABLE##Checked.load(std::memory_order_relaxed)) {                     \
X86FloatingPoint.cpp:      TABLE##Checked.store(true, std::memory_order_relaxed);                   \
X86InstrFMA3Info.cpp:  if (!TableChecked.load(std::memory_order_relaxed)) {
X86InstrFMA3Info.cpp:    TableChecked.store(true, std::memory_order_relaxed);
X86InstrFoldTables.cpp:  if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
X86InstrFoldTables.cpp:    FoldTablesChecked.store(true, std::memory_order_relaxed);

So I guess that bpf can do a similar thing: check memory_order_relaxed in the IR
and generate different (more efficient) insns as needed.

> Would be good to clarify before we proceed with the above plan.

Patch

diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c
index bb0acd79d28a..dea54557fc00 100644
--- a/tools/testing/selftests/bpf/progs/arena_atomics.c
+++ b/tools/testing/selftests/bpf/progs/arena_atomics.c
@@ -5,6 +5,7 @@ 
 #include <bpf/bpf_tracing.h>
 #include <stdbool.h>
 #include "bpf_arena_common.h"
+#include "bpf_misc.h"
 
 struct {
 	__uint(type, BPF_MAP_TYPE_ARENA);
@@ -85,10 +86,24 @@  int and(const void *ctx)
 {
 	if (pid != (bpf_get_current_pid_tgid() >> 32))
 		return 0;
-#ifdef ENABLE_ATOMICS_TESTS
+#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
 
-	__sync_fetch_and_and(&and64_value, 0x011ull << 32);
-	__sync_fetch_and_and(&and32_value, 0x011);
+	asm volatile(
+		"r1 = addr_space_cast(%[and64_value], 0, 1);"
+		"lock *(u64 *)(r1 + 0) &= %[val]"
+		:
+		: __imm_ptr(and64_value),
+		  [val]"r"(0x011ull << 32)
+		: "r1"
+	);
+	asm volatile(
+		"r1 = addr_space_cast(%[and32_value], 0, 1);"
+		"lock *(u32 *)(r1 + 0) &= %[val]"
+		:
+		: __imm_ptr(and32_value),
+		  [val]"w"(0x011)
+		: "r1"
+	);
 #endif
 
 	return 0;
@@ -102,9 +117,24 @@  int or(const void *ctx)
 {
 	if (pid != (bpf_get_current_pid_tgid() >> 32))
 		return 0;
-#ifdef ENABLE_ATOMICS_TESTS
-	__sync_fetch_and_or(&or64_value, 0x011ull << 32);
-	__sync_fetch_and_or(&or32_value, 0x011);
+#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+
+	asm volatile(
+		"r1 = addr_space_cast(%[or64_value], 0, 1);"
+		"lock *(u64 *)(r1 + 0) |= %[val]"
+		:
+		: __imm_ptr(or64_value),
+		  [val]"r"(0x011ull << 32)
+		: "r1"
+	);
+	asm volatile(
+		"r1 = addr_space_cast(%[or32_value], 0, 1);"
+		"lock *(u32 *)(r1 + 0) |= %[val]"
+		:
+		: __imm_ptr(or32_value),
+		  [val]"w"(0x011)
+		: "r1"
+	);
 #endif
 
 	return 0;
@@ -118,9 +148,24 @@  int xor(const void *ctx)
 {
 	if (pid != (bpf_get_current_pid_tgid() >> 32))
 		return 0;
-#ifdef ENABLE_ATOMICS_TESTS
-	__sync_fetch_and_xor(&xor64_value, 0x011ull << 32);
-	__sync_fetch_and_xor(&xor32_value, 0x011);
+#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST)
+
+	asm volatile(
+		"r1 = addr_space_cast(%[xor64_value], 0, 1);"
+		"lock *(u64 *)(r1 + 0) ^= %[val]"
+		:
+		: __imm_ptr(xor64_value),
+		  [val]"r"(0x011ull << 32)
+		: "r1"
+	);
+	asm volatile(
+		"r1 = addr_space_cast(%[xor32_value], 0, 1);"
+		"lock *(u32 *)(r1 + 0) ^= %[val]"
+		:
+		: __imm_ptr(xor32_value),
+		  [val]"w"(0x011)
+		: "r1"
+	);
 #endif
 
 	return 0;