bpf: smp_wmb before bpf_ringbuf really commit

Message ID 20241031084246.20737-1-zhongjinji@honor.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series: bpf: smp_wmb before bpf_ringbuf really commit

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 5
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 10 maintainers not CCed: daniel@iogearbox.net sdf@fomichev.me kpsingh@kernel.org john.fastabend@gmail.com eddyz87@gmail.com martin.lau@linux.dev song@kernel.org haoluo@google.com yonghong.song@linux.dev jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 10 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17

Commit Message

zhongjinji Oct. 31, 2024, 8:42 a.m. UTC
From: zhongjinji <zhongjinji@honor.com>

To guarantee visibility of writes to the ringbuffer, it is necessary
to call smp_wmb() before the ringbuffer entry is actually committed.
For instance, when CPU1 updates the data of a buffer entry, the update
may not yet be visible to CPU2, which may be accessing the buffer
concurrently. This can lead to the consumer reading incorrect data.
Using smp_wmb() before the commit guarantees that the consumer can
access the correct data.

CPU1:
    struct mem_event_t* data = bpf_ringbuf_reserve();
    data->type = MEM_EVENT_KSWAPD_WAKE;
    data->event_data.kswapd_wake.node_id = args->nid;
    bpf_ringbuf_commit(data);

CPU2:
    cons_pos = smp_load_acquire(r->consumer_pos);
    len_ptr = r->data + (cons_pos & r->mask);
    sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
    access to sample

Signed-off-by: zhongjinji <zhongjinji@honor.com>
---
 kernel/bpf/ringbuf.c | 4 ++++
 1 file changed, 4 insertions(+)

Comments

Hou Tao Oct. 31, 2024, 11:03 a.m. UTC | #1
Hi,

On 10/31/2024 4:42 PM, zhongjinji@honor.com wrote:
> From: zhongjinji <zhongjinji@honor.com>
>
> To guarantee visibility of writing ringbuffer,it is necessary
> to call smp_wmb before ringbuffer really commit.
> for instance, when updating the data of buffer in cpu1,
> it is not visible for the cpu2 which may be accessing buffer. This may
> lead to the consumer accessing a incorrect data. using the smp_wmb
> before commmit will guarantee that the consume can access the correct data.
>
> CPU1:
>     struct mem_event_t* data = bpf_ringbuf_reserve();
>     data->type = MEM_EVENT_KSWAPD_WAKE;
>     data->event_data.kswapd_wake.node_id = args->nid;
>     bpf_ringbuf_commit(data);
>
> CPU2:
>     cons_pos = smp_load_acquire(r->consumer_pos);
>     len_ptr = r->data + (cons_pos & r->mask);
>     sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
>     access to sample

It seems you didn't use the ringbuf related APIs (e.g.,
ring_buffer__consume()) in libbpf to access the ring buffer. In my
understanding, the "xchg(&hdr->len, new_len)" in bpf_ringbuf_commit()
works as the barrier to ensure the order of committed data and hdr->len,
and accordingly the "smp_load_acquire(len_ptr)" in
ringbuf_process_ring() in libbpf works as the paired barrier to ensure
the order of hdr->len and the committed data. So I think the extra
smp_wmb() in the kernel is not necessary here. Instead, you should fix
your code in the userspace.
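
For illustration, the pairing can be modelled by a small self-contained
userspace program (a sketch using C11 atomics; BUSY_BIT, the variable
names and the payload value are made up for the example, with
atomic_exchange() standing in for the kernel's xchg() and the acquire
load standing in for smp_load_acquire() on the record length):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define BUSY_BIT 0x80000000u

static unsigned int payload;			/* plain, non-atomic data */
static _Atomic unsigned int hdr_len = BUSY_BIT;	/* "reserved, not committed" */

static void *producer(void *arg)
{
	payload = 42;					/* fill the sample */
	atomic_exchange(&hdr_len, sizeof(payload));	/* commit; seq_cst RMW */
	return NULL;
}

static void *consumer(void *arg)
{
	unsigned int len;

	do {	/* poll until the record is committed */
		len = atomic_load_explicit(&hdr_len, memory_order_acquire);
	} while (len & BUSY_BIT);

	/* the acquire load orders this read after the producer's stores */
	printf("len=%u payload=%u\n", len, payload);
	return NULL;
}

int main(void)
{
	pthread_t p, c;

	pthread_create(&p, NULL, producer, NULL);
	pthread_create(&c, NULL, consumer, NULL);
	pthread_join(p, NULL);
	pthread_join(c, NULL);
	return 0;
}

Built with -pthread, the consumer always prints payload=42 once it sees
the BUSY bit cleared, mirroring how libbpf's consumer only touches the
sample after smp_load_acquire(len_ptr) shows it as committed.
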
>
> Signed-off-by: zhongjinji <zhongjinji@honor.com>
> ---
>  kernel/bpf/ringbuf.c | 4 ++++
>  1 file changed, 4 insertions(+)
>
> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
> index e1cfe890e0be..a66059e2b0d6 100644
> --- a/kernel/bpf/ringbuf.c
> +++ b/kernel/bpf/ringbuf.c
> @@ -508,6 +508,10 @@ static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
>  	rec_pos = (void *)hdr - (void *)rb->data;
>  	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
>  
> +	/* Make sure the modification of data is visible on other CPU's
> +	 * before consume the event
> +	 */
> +	smp_wmb();
>  	if (flags & BPF_RB_FORCE_WAKEUP)
>  		irq_work_queue(&rb->work);
>  	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
zhongjinji Oct. 31, 2024, 4:19 p.m. UTC | #2
>It seems you didn't use the ringbuf related APIs (e.g.,
>ring_buffer__consume()) in libbpf to access the ring buffer. In my
>understanding, the "xchg(&hdr->len, new_len)" in bpf_ringbuf_commit()
>works as the barrier to ensure the order of committed data and hdr->len,
>and accordingly the "smp_load_acquire(len_ptr)" in
>ringbuf_process_ring() in libbpf works as the paired barrier to ensure
>the order of hdr->len and the committed data. So I think the extra
>smp_wmb() in the kernel is not necessary here. Instead, you should fix
>your code in the userspace.
>>
>> Signed-off-by: zhongjinji <zhongjinji@honor.com>
>> ---
>>  kernel/bpf/ringbuf.c | 4 ++++
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
>> index e1cfe890e0be..a66059e2b0d6 100644
>> --- a/kernel/bpf/ringbuf.c
>> +++ b/kernel/bpf/ringbuf.c
>> @@ -508,6 +508,10 @@ static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
>>  	rec_pos = (void *)hdr - (void *)rb->data;
>>  	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
>>  
>> +	/* Make sure the modification of data is visible on other CPU's
>> +	 * before consume the event
>> +	 */
>> +	smp_wmb();
>>  	if (flags & BPF_RB_FORCE_WAKEUP)
>>  		irq_work_queue(&rb->work);
>>  	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))

Well, I missed the implementation of __smp_store_release on the x86
architecture; the extra barrier is not necessary there because
__smp_store_release implies a full memory barrier on x86. However, it
cannot guarantee the ordering of the committed data on arm64, because
there __smp_store_release does not include a full memory barrier; it
only ensures that the variable itself becomes visible. The arm64
implementation is as follows, in common/arch/arm64/include/asm/barrier.h:

#define __smp_load_acquire(p)						\
({									\
	union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u;	\
	typeof(p) __p = (p);						\
	compiletime_assert_atomic_type(*p);				\
	kasan_check_read(__p, sizeof(*p));				\
	switch (sizeof(*p)) {						\
	case 1:								\
		asm volatile ("ldarb %w0, %1"				\
			: "=r" (*(__u8 *)__u.__c)			\
			: "Q" (*__p) : "memory");			\
		break;							\
	case 2:								\
		asm volatile ("ldarh %w0, %1"				\
			: "=r" (*(__u16 *)__u.__c)			\
			: "Q" (*__p) : "memory");			\
		break;							\
	case 4:								\
		asm volatile ("ldar %w0, %1"				\
			: "=r" (*(__u32 *)__u.__c)			\
			: "Q" (*__p) : "memory");			\
		break;							\
	case 8:								\
		asm volatile ("ldar %0, %1"				\
			: "=r" (*(__u64 *)__u.__c)			\
			: "Q" (*__p) : "memory");			\
		break;							\
	}								\
	(typeof(*p))__u.__val;						\
})
Andrii Nakryiko Nov. 1, 2024, 7:12 p.m. UTC | #3
On Thu, Oct 31, 2024 at 9:19 AM zhongjinji <zhongjinji@honor.com> wrote:
>
> >It seems you didn't use the ringbuf related APIs (e.g.,
> >ring_buffer__consume()) in libbpf to access the ring buffer. In my
> >understanding, the "xchg(&hdr->len, new_len)" in bpf_ringbuf_commit()
> >works as the barrier to ensure the order of committed data and hdr->len,
> >and accordingly the "smp_load_acquire(len_ptr)" in
> >ringbuf_process_ring() in libbpf works as the paired barrier to ensure
> >the order of hdr->len and the committed data. So I think the extra
> >smp_wmb() in the kernel is not necessary here. Instead, you should fix
> >your code in the userspace.
> >>
> >> Signed-off-by: zhongjinji <zhongjinji@honor.com>
> >> ---
> >>  kernel/bpf/ringbuf.c | 4 ++++
> >>  1 file changed, 4 insertions(+)
> >>
> >> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
> >> index e1cfe890e0be..a66059e2b0d6 100644
> >> --- a/kernel/bpf/ringbuf.c
> >> +++ b/kernel/bpf/ringbuf.c
> >> @@ -508,6 +508,10 @@ static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
> >>      rec_pos = (void *)hdr - (void *)rb->data;
> >>      cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
> >>
> >> +    /* Make sure the modification of data is visible on other CPU's
> >> +     * before consume the event
> >> +     */
> >> +    smp_wmb();
> >>      if (flags & BPF_RB_FORCE_WAKEUP)
> >>              irq_work_queue(&rb->work);
> >>      else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
>
> well, I missed the implementation of __smp_store_release in the x86 architecture,
> it is not necessary because __smp_store_release has a memory barrier in x86,

yep, we do rely on xchg() and smp_load_acquire() pairing. I'm not
familiar with arm64 specifics, I didn't know that this assumption is
x86-64-specific. Cc'ed Paul who I believe suggested xchg() as a more
performant way of achieving this. That happened years ago so all of
us, I'm sure, lost a lot of context by now, but still, let's see.

> but it can't guarantee the order of committed data in arm64 because
> __smp_store_release does not include a memory barrier, and it just ensure
> the variable is visible. The implementation of ARM64 is as follows,
> in common/arch/arm64/include/asm/barrier.h
>
> #define __smp_load_acquire(p)                                           \
> ({                                                                      \
>         union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u;   \
>         typeof(p) __p = (p);                                            \
>         compiletime_assert_atomic_type(*p);                             \
>         kasan_check_read(__p, sizeof(*p));                              \
>         switch (sizeof(*p)) {                                           \
>         case 1:                                                         \
>                 asm volatile ("ldarb %w0, %1"                           \
>                         : "=r" (*(__u8 *)__u.__c)                       \
>                         : "Q" (*__p) : "memory");                       \
>                 break;                                                  \
>         case 2:                                                         \
>                 asm volatile ("ldarh %w0, %1"                           \
>                         : "=r" (*(__u16 *)__u.__c)                      \
>                         : "Q" (*__p) : "memory");                       \
>                 break;                                                  \
>         case 4:                                                         \
>                 asm volatile ("ldar %w0, %1"                            \
>                         : "=r" (*(__u32 *)__u.__c)                      \
>                         : "Q" (*__p) : "memory");                       \
>                 break;                                                  \
>         case 8:                                                         \
>                 asm volatile ("ldar %0, %1"                             \
>                         : "=r" (*(__u64 *)__u.__c)                      \
>                         : "Q" (*__p) : "memory");                       \
>                 break;                                                  \
>         }                                                               \
>         (typeof(*p))__u.__val;                                          \
> })
>
Paul E. McKenney Nov. 1, 2024, 8:30 p.m. UTC | #4
On Fri, Nov 01, 2024 at 12:12:58PM -0700, Andrii Nakryiko wrote:
> On Thu, Oct 31, 2024 at 9:19 AM zhongjinji <zhongjinji@honor.com> wrote:
> >
> > >It seems you didn't use the ringbuf related APIs (e.g.,
> > >ring_buffer__consume()) in libbpf to access the ring buffer. In my
> > >understanding, the "xchg(&hdr->len, new_len)" in bpf_ringbuf_commit()
> > >works as the barrier to ensure the order of committed data and hdr->len,
> > >and accordingly the "smp_load_acquire(len_ptr)" in
> > >ringbuf_process_ring() in libbpf works as the paired barrier to ensure
> > >the order of hdr->len and the committed data. So I think the extra
> > >smp_wmb() in the kernel is not necessary here. Instead, you should fix
> > >your code in the userspace.
> > >>
> > >> Signed-off-by: zhongjinji <zhongjinji@honor.com>
> > >> ---
> > >>  kernel/bpf/ringbuf.c | 4 ++++
> > >>  1 file changed, 4 insertions(+)
> > >>
> > >> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
> > >> index e1cfe890e0be..a66059e2b0d6 100644
> > >> --- a/kernel/bpf/ringbuf.c
> > >> +++ b/kernel/bpf/ringbuf.c
> > >> @@ -508,6 +508,10 @@ static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
> > >>      rec_pos = (void *)hdr - (void *)rb->data;
> > >>      cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
> > >>
> > >> +    /* Make sure the modification of data is visible on other CPU's
> > >> +     * before consume the event
> > >> +     */
> > >> +    smp_wmb();

This won't order the prior smp_load_acquire(), but you knew that already.

> > >>      if (flags & BPF_RB_FORCE_WAKEUP)
> > >>              irq_work_queue(&rb->work);
> > >>      else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
> >
> > well, I missed the implementation of __smp_store_release in the x86 architecture,
> > it is not necessary because __smp_store_release has a memory barrier in x86,
> 
> yep, we do rely on xchg() and smp_load_acquire() pairing. I'm not
> familiar with arm64 specifics, I didn't know that this assumption is
> x86-64-specific. Cc'ed Paul who I believe suggested xchg() as a more
> performant way of achieving this. That happened years ago so all of
> us, I'm sure, lost a lot of context by now, but still, let's see.

The xchg() is supposed to be fully ordered on all architectures,
including arm64.  And smp_load_acquire() provides acquire ordering,
again on all architectures.  So if you have something like this
with x, y, and z all initially zero:

	CPU 0				CPU 1

	WRITE_ONCE(x, 1);		r3 = smp_load_acquire(&z);
	r1 = READ_ONCE(y);		r4 = READ_ONCE(x);
	r2 = xchg(&z, 1);		WRITE_ONCE(y, 1);

Then if r3==1, it is guaranteed that r1==0 and r4==1.
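
In litmus-test form (a sketch assuming herd7 and the Linux-kernel memory
model under tools/memory-model, not a test taken from the tree), the
forbidden outcome can be written as:

C ringbuf-xchg-acquire

{}

P0(int *x, int *y, int *z)
{
	int r1;
	int r2;

	WRITE_ONCE(*x, 1);
	r1 = READ_ONCE(*y);
	r2 = xchg(z, 1);
}

P1(int *x, int *y, int *z)
{
	int r3;
	int r4;

	r3 = smp_load_acquire(z);
	r4 = READ_ONCE(*x);
	WRITE_ONCE(*y, 1);
}

exists (1:r3=1 /\ (0:r1=1 \/ 1:r4=0))

The exists clause asks for exactly the outcome ruled out above, so herd7
should report it as never satisfied.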

Or are you looking for some other ordering guarantee, and if so, what
is it?

							Thanx, Paul

> > but it can't guarantee the order of committed data in arm64 because
> > __smp_store_release does not include a memory barrier, and it just ensure
> > the variable is visible. The implementation of ARM64 is as follows,
> > in common/arch/arm64/include/asm/barrier.h
> >
> > #define __smp_load_acquire(p)                                           \
> > ({                                                                      \
> >         union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u;   \
> >         typeof(p) __p = (p);                                            \
> >         compiletime_assert_atomic_type(*p);                             \
> >         kasan_check_read(__p, sizeof(*p));                              \
> >         switch (sizeof(*p)) {                                           \
> >         case 1:                                                         \
> >                 asm volatile ("ldarb %w0, %1"                           \
> >                         : "=r" (*(__u8 *)__u.__c)                       \
> >                         : "Q" (*__p) : "memory");                       \
> >                 break;                                                  \
> >         case 2:                                                         \
> >                 asm volatile ("ldarh %w0, %1"                           \
> >                         : "=r" (*(__u16 *)__u.__c)                      \
> >                         : "Q" (*__p) : "memory");                       \
> >                 break;                                                  \
> >         case 4:                                                         \
> >                 asm volatile ("ldar %w0, %1"                            \
> >                         : "=r" (*(__u32 *)__u.__c)                      \
> >                         : "Q" (*__p) : "memory");                       \
> >                 break;                                                  \
> >         case 8:                                                         \
> >                 asm volatile ("ldar %0, %1"                             \
> >                         : "=r" (*(__u64 *)__u.__c)                      \
> >                         : "Q" (*__p) : "memory");                       \
> >                 break;                                                  \
> >         }                                                               \
> >         (typeof(*p))__u.__val;                                          \
> > })
> >
Andrii Nakryiko Nov. 6, 2024, 8:02 p.m. UTC | #5
On Fri, Nov 1, 2024 at 1:30 PM Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Fri, Nov 01, 2024 at 12:12:58PM -0700, Andrii Nakryiko wrote:
> > On Thu, Oct 31, 2024 at 9:19 AM zhongjinji <zhongjinji@honor.com> wrote:
> > >
> > > >It seems you didn't use the ringbuf related APIs (e.g.,
> > > >ring_buffer__consume()) in libbpf to access the ring buffer. In my
> > > >understanding, the "xchg(&hdr->len, new_len)" in bpf_ringbuf_commit()
> > > >works as the barrier to ensure the order of committed data and hdr->len,
> > > >and accordingly the "smp_load_acquire(len_ptr)" in
> > > >ringbuf_process_ring() in libbpf works as the paired barrier to ensure
> > > >the order of hdr->len and the committed data. So I think the extra
> > > >smp_wmb() in the kernel is not necessary here. Instead, you should fix
> > > >your code in the userspace.
> > > >>
> > > >> Signed-off-by: zhongjinji <zhongjinji@honor.com>
> > > >> ---
> > > >>  kernel/bpf/ringbuf.c | 4 ++++
> > > >>  1 file changed, 4 insertions(+)
> > > >>
> > > >> diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
> > > >> index e1cfe890e0be..a66059e2b0d6 100644
> > > >> --- a/kernel/bpf/ringbuf.c
> > > >> +++ b/kernel/bpf/ringbuf.c
> > > >> @@ -508,6 +508,10 @@ static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
> > > >>      rec_pos = (void *)hdr - (void *)rb->data;
> > > >>      cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
> > > >>
> > > >> +    /* Make sure the modification of data is visible on other CPU's
> > > >> +     * before consume the event
> > > >> +     */
> > > >> +    smp_wmb();
>
> This won't order the prior smp_load_acquire(), but you knew that already.
>
> > > >>      if (flags & BPF_RB_FORCE_WAKEUP)
> > > >>              irq_work_queue(&rb->work);
> > > >>      else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
> > >
> > > well, I missed the implementation of __smp_store_release in the x86 architecture,
> > > it is not necessary because __smp_store_release has a memory barrier in x86,
> >
> > yep, we do rely on xchg() and smp_load_acquire() pairing. I'm not
> > familiar with arm64 specifics, I didn't know that this assumption is
> > x86-64-specific. Cc'ed Paul who I believe suggested xchg() as a more
> > performant way of achieving this. That happened years ago so all of
> > us, I'm sure, lost a lot of context by now, but still, let's see.
>
> The xchg() is supposed to be fully ordered on all architectures,
> including arm64.  And smp_load_acquire() provides acquire ordering,
> again on all architectures.  So if you have something like this

I think this is what we need and should be sufficient. Thank you,
Paul, for confirming! Seems like we don't really need to add anything
extra here.

> with x, y, and z all initially zero:
>
>         CPU 0                           CPU 1
>
>         WRITE_ONCE(x, 1);               r3 = smp_load_acquire(&z);
>         r1 = READ_ONCE(y);              r4 = READ_ONCE(x);
>         r2 = xchg(&z, 1);               WRITE_ONCE(y, 1);
>
> Then if r3==1, it is guaranteed that r1==0 and r4==1.
>
> Or are you looking for some other ordering guarantee, and if so, what
> is it?
>
>                                                         Thanx, Paul
>
> > > but it can't guarantee the order of committed data in arm64 because
> > > __smp_store_release does not include a memory barrier, and it just ensure
> > > the variable is visible. The implementation of ARM64 is as follows,
> > > in common/arch/arm64/include/asm/barrier.h
> > >
> > > #define __smp_load_acquire(p)                                           \
> > > ({                                                                      \
> > >         union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u;   \
> > >         typeof(p) __p = (p);                                            \
> > >         compiletime_assert_atomic_type(*p);                             \
> > >         kasan_check_read(__p, sizeof(*p));                              \
> > >         switch (sizeof(*p)) {                                           \
> > >         case 1:                                                         \
> > >                 asm volatile ("ldarb %w0, %1"                           \
> > >                         : "=r" (*(__u8 *)__u.__c)                       \
> > >                         : "Q" (*__p) : "memory");                       \
> > >                 break;                                                  \
> > >         case 2:                                                         \
> > >                 asm volatile ("ldarh %w0, %1"                           \
> > >                         : "=r" (*(__u16 *)__u.__c)                      \
> > >                         : "Q" (*__p) : "memory");                       \
> > >                 break;                                                  \
> > >         case 4:                                                         \
> > >                 asm volatile ("ldar %w0, %1"                            \
> > >                         : "=r" (*(__u32 *)__u.__c)                      \
> > >                         : "Q" (*__p) : "memory");                       \
> > >                 break;                                                  \
> > >         case 8:                                                         \
> > >                 asm volatile ("ldar %0, %1"                             \
> > >                         : "=r" (*(__u64 *)__u.__c)                      \
> > >                         : "Q" (*__p) : "memory");                       \
> > >                 break;                                                  \
> > >         }                                                               \
> > >         (typeof(*p))__u.__val;                                          \
> > > })
> > >

Patch

diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index e1cfe890e0be..a66059e2b0d6 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -508,6 +508,10 @@  static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
 	rec_pos = (void *)hdr - (void *)rb->data;
 	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
 
+	/* Make sure the modification of data is visible on other CPU's
+	 * before consume the event
+	 */
+	smp_wmb();
 	if (flags & BPF_RB_FORCE_WAKEUP)
 		irq_work_queue(&rb->work);
 	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))