diff mbox series

[bpf-next,v12,01/12] bpf: add networking timestamping support to bpf_get/setsockopt()

Message ID 20250218050125.73676-2-kerneljasonxing@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series net-timestamp: bpf extension to equip applications transparently | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 193 this patch: 193
netdev/build_tools fail Errors and warnings before: 26 (+1) this patch: 27 (+1)
netdev/cc_maintainers warning 1 maintainers not CCed: horms@kernel.org
netdev/build_clang success Errors and warnings before: 9362 this patch: 9362
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7019 this patch: 7019
netdev/checkpatch warning CHECK: spaces preferred around that '<<' (ctx:VxV)
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 10 this patch: 10
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Jason Xing Feb. 18, 2025, 5:01 a.m. UTC
The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
added to bpf_get/setsockopt. The later patches will implement the
BPF networking timestamping. The BPF program will use
bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
enable the BPF networking timestamping on a socket.

Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
---
 include/net/sock.h             |  3 +++
 include/uapi/linux/bpf.h       |  8 ++++++++
 net/core/filter.c              | 23 +++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 4 files changed, 35 insertions(+)

Comments

Willem de Bruijn Feb. 18, 2025, 2:22 p.m. UTC | #1
Jason Xing wrote:
> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> added to bpf_get/setsockopt. The later patches will implement the
> BPF networking timestamping. The BPF program will use
> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> enable the BPF networking timestamping on a socket.
> 
> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> ---
>  include/net/sock.h             |  3 +++
>  include/uapi/linux/bpf.h       |  8 ++++++++
>  net/core/filter.c              | 23 +++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  4 files changed, 35 insertions(+)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8036b3b79cd8..7916982343c6 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -303,6 +303,7 @@ struct sk_filter;
>    *	@sk_stamp: time stamp of last packet received
>    *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
>    *	@sk_tsflags: SO_TIMESTAMPING flags
> +  *	@sk_bpf_cb_flags: used in bpf_setsockopt()
>    *	@sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
>    *			   Sockets that can be used under memory reclaim should
>    *			   set this to false.
> @@ -445,6 +446,8 @@ struct sock {
>  	u32			sk_reserved_mem;
>  	int			sk_forward_alloc;
>  	u32			sk_tsflags;
> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> +	u32			sk_bpf_cb_flags;
>  	__cacheline_group_end(sock_write_rxtx);

So far only one bit is defined. Does this have to be a 32-bit field in
every socket?

Sorry for the late notice. I had not followed the BPF part closely, as
that is not my expertise.
Martin KaFai Lau Feb. 18, 2025, 9:55 p.m. UTC | #2
On 2/18/25 6:22 AM, Willem de Bruijn wrote:
> Jason Xing wrote:
>> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
>> added to bpf_get/setsockopt. The later patches will implement the
>> BPF networking timestamping. The BPF program will use
>> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
>> enable the BPF networking timestamping on a socket.
>>
>> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
>> ---
>>   include/net/sock.h             |  3 +++
>>   include/uapi/linux/bpf.h       |  8 ++++++++
>>   net/core/filter.c              | 23 +++++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   4 files changed, 35 insertions(+)
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 8036b3b79cd8..7916982343c6 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -303,6 +303,7 @@ struct sk_filter;
>>     *	@sk_stamp: time stamp of last packet received
>>     *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
>>     *	@sk_tsflags: SO_TIMESTAMPING flags
>> +  *	@sk_bpf_cb_flags: used in bpf_setsockopt()
>>     *	@sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
>>     *			   Sockets that can be used under memory reclaim should
>>     *			   set this to false.
>> @@ -445,6 +446,8 @@ struct sock {
>>   	u32			sk_reserved_mem;
>>   	int			sk_forward_alloc;
>>   	u32			sk_tsflags;
>> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
>> +	u32			sk_bpf_cb_flags;
>>   	__cacheline_group_end(sock_write_rxtx);
> 
> So far only one bit is defined. Does this have to be a 32-bit field in
> every socket?

iirc, I think there were multiple callback (cb) flags/bits in the earlier 
revisions, but it had been simplified to one bit in the later revisions.

It's an internal implementation detail. We can reuse some free bits from another 
variable for now. Probably get a bit from sk_tsflags? SOCKCM_FLAG_TS_OPT_ID uses 
BIT(31). Maybe a new SK_TS_FLAG_BPF_TX that uses BIT(30)? I don't have a strong 
preference on the name.

When the BPF program calls bpf_setsockopt(SK_BPF_CB_FLAGS, 
SK_BPF_CB_TX_TIMESTAMPING), the kernel will set/test the BIT(30) of sk_tsflags.

We can wait until there are more socket-level cb flags in the future (e.g., more 
SK_BPF_CB_XXX will be needed) before adding a dedicated int field in the sock.
Jason Xing Feb. 18, 2025, 11:43 p.m. UTC | #3
On Wed, Feb 19, 2025 at 5:55 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 2/18/25 6:22 AM, Willem de Bruijn wrote:
> > Jason Xing wrote:
> >> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> >> added to bpf_get/setsockopt. The later patches will implement the
> >> BPF networking timestamping. The BPF program will use
> >> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> >> enable the BPF networking timestamping on a socket.
> >>
> >> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> >> ---
> >>   include/net/sock.h             |  3 +++
> >>   include/uapi/linux/bpf.h       |  8 ++++++++
> >>   net/core/filter.c              | 23 +++++++++++++++++++++++
> >>   tools/include/uapi/linux/bpf.h |  1 +
> >>   4 files changed, 35 insertions(+)
> >>
> >> diff --git a/include/net/sock.h b/include/net/sock.h
> >> index 8036b3b79cd8..7916982343c6 100644
> >> --- a/include/net/sock.h
> >> +++ b/include/net/sock.h
> >> @@ -303,6 +303,7 @@ struct sk_filter;
> >>     *        @sk_stamp: time stamp of last packet received
> >>     *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
> >>     *        @sk_tsflags: SO_TIMESTAMPING flags
> >> +  * @sk_bpf_cb_flags: used in bpf_setsockopt()
> >>     *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
> >>     *                           Sockets that can be used under memory reclaim should
> >>     *                           set this to false.
> >> @@ -445,6 +446,8 @@ struct sock {
> >>      u32                     sk_reserved_mem;
> >>      int                     sk_forward_alloc;
> >>      u32                     sk_tsflags;
> >> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> >> +    u32                     sk_bpf_cb_flags;
> >>      __cacheline_group_end(sock_write_rxtx);
> >
> > So far only one bit is defined. Does this have to be a 32-bit field in
> > every socket?
>
> iirc, I think there were multiple callback (cb) flags/bits in the earlier
> revisions, but it had been simplified to one bit in the later revisions.
>
> It's an internal implementation detail. We can reuse some free bits from another
> variable for now. Probably get a bit from sk_tsflags? SOCKCM_FLAG_TS_OPT_ID uses
> BIT(31). Maybe a new SK_TS_FLAG_BPF_TX that uses BIT(30)? I don't have a strong
> preference on the name.
>
> When the BPF program calls bpf_setsockopt(SK_BPF_CB_FLAGS,
> SK_BPF_CB_TX_TIMESTAMPING), the kernel will set/test the BIT(30) of sk_tsflags.
>
> We can wait until there are more socket-level cb flags in the future (e.g., more
> SK_BPF_CB_XXX will be needed) before adding a dedicated int field in the sock.

Sorry, I still preferred the way we've discussed already:
1) Introducing a new field sk_bpf_cb_flags extends the use for bpf
timestamping, more than SK_BPF_CB_TX_TIMESTAMPING one flag. I think
SK_BPF_CB_RX_TIMESTAMPING is also needed in the next move. And more
subfeatures (like bpf extension for OPT_ID) will use it. It gives us a
separate way to do more things based on this bpf timestamping.
2) sk_bpf_cb_flags provides a way to let the socket-level use new
features for bpf while now we only have a tcp_sock-level, namely,
bpf_sock_ops_cb_flags. It's obviously good for others.

It's the first move to open the gate for socket-level usage for BPF,
just like how TCP_BPF_SOCK_OPS_CB_FLAGS works in sol_tcp_sockopt(). So
I hope we will not abandon this good approach :(

Now I wonder if I should use the u8 sk_bpf_cb_flags in V13 or just
keep it as-is? Either way is fine with me :) bpf_sock_ops_cb_flags
uses u8 as an example, thus I think we prefer the former?

Thanks,
Jason
Willem de Bruijn Feb. 19, 2025, 2:32 a.m. UTC | #4
Jason Xing wrote:
> On Wed, Feb 19, 2025 at 5:55 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 2/18/25 6:22 AM, Willem de Bruijn wrote:
> > > Jason Xing wrote:
> > >> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> > >> added to bpf_get/setsockopt. The later patches will implement the
> > >> BPF networking timestamping. The BPF program will use
> > >> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> > >> enable the BPF networking timestamping on a socket.
> > >>
> > >> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> > >> ---
> > >>   include/net/sock.h             |  3 +++
> > >>   include/uapi/linux/bpf.h       |  8 ++++++++
> > >>   net/core/filter.c              | 23 +++++++++++++++++++++++
> > >>   tools/include/uapi/linux/bpf.h |  1 +
> > >>   4 files changed, 35 insertions(+)
> > >>
> > >> diff --git a/include/net/sock.h b/include/net/sock.h
> > >> index 8036b3b79cd8..7916982343c6 100644
> > >> --- a/include/net/sock.h
> > >> +++ b/include/net/sock.h
> > >> @@ -303,6 +303,7 @@ struct sk_filter;
> > >>     *        @sk_stamp: time stamp of last packet received
> > >>     *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
> > >>     *        @sk_tsflags: SO_TIMESTAMPING flags
> > >> +  * @sk_bpf_cb_flags: used in bpf_setsockopt()
> > >>     *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
> > >>     *                           Sockets that can be used under memory reclaim should
> > >>     *                           set this to false.
> > >> @@ -445,6 +446,8 @@ struct sock {
> > >>      u32                     sk_reserved_mem;
> > >>      int                     sk_forward_alloc;
> > >>      u32                     sk_tsflags;
> > >> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> > >> +    u32                     sk_bpf_cb_flags;
> > >>      __cacheline_group_end(sock_write_rxtx);
> > >
> > > So far only one bit is defined. Does this have to be a 32-bit field in
> > > every socket?
> >
> > iirc, I think there were multiple callback (cb) flags/bits in the earlier
> > revisions, but it had been simplified to one bit in the later revisions.
> >
> > It's an internal implementation detail. We can reuse some free bits from another
> > variable for now. Probably get a bit from sk_tsflags? SOCKCM_FLAG_TS_OPT_ID uses
> > BIT(31). Maybe a new SK_TS_FLAG_BPF_TX that uses BIT(30)? I don't have a strong
> > preference on the name.
> >
> > When the BPF program calls bpf_setsockopt(SK_BPF_CB_FLAGS,
> > SK_BPF_CB_TX_TIMESTAMPING), the kernel will set/test the BIT(30) of sk_tsflags.
> >
> > We can wait until there are more socket-level cb flags in the future (e.g., more
> > SK_BPF_CB_XXX will be needed) before adding a dedicated int field in the sock.
> 
> Sorry, I still preferred the way we've discussed already:

Adding fields to structs in the hot path is a tragedy of the commons.

Every developer focuses on their specific workload and pet feature,
while imposing a cost on everyone else.

We have a duty to be frugal and mitigate this cost where possible.
Especially for a feature that is likely to be used sparingly.

> 1) Introducing a new field sk_bpf_cb_flags extends the use for bpf
> timestamping, more than SK_BPF_CB_TX_TIMESTAMPING one flag. I think
> SK_BPF_CB_RX_TIMESTAMPING is also needed in the next move. And more
> subfeatures (like bpf extension for OPT_ID) will use it. It gives us a
> separate way to do more things based on this bpf timestamping.
> 2) sk_bpf_cb_flags provides a way to let the socket-level use new
> features for bpf while now we only have a tcp_sock-level, namely,
> bpf_sock_ops_cb_flags. It's obviously good for others.
> 
> It's the first move to open the gate for socket-level usage for BPF,

Can you give a short list of bits that you could see being used, to
get an idea of the count. In my mind this is a very short list, not
worth reserving 32 bits for. But you might have more developed plans.

> just like how TCP_BPF_SOCK_OPS_CB_FLAGS works in sol_tcp_sockopt(). So
> I hope we will not abandon this good approach :(
> 
> Now I wonder if I should use the u8 sk_bpf_cb_flags in V13 or just
> keep it as-is? Either way is fine with me :) bpf_sock_ops_cb_flags
> uses u8 as an example, thus I think we prefer the former?

If it fits in a u8 and that in practice also results in less memory
and cache pressure (i.e., does not just add a 24b hole), then it is a
net improvement.
Jason Xing Feb. 19, 2025, 6:29 a.m. UTC | #5
On Wed, Feb 19, 2025 at 10:32 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Wed, Feb 19, 2025 at 5:55 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >
> > > On 2/18/25 6:22 AM, Willem de Bruijn wrote:
> > > > Jason Xing wrote:
> > > >> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> > > >> added to bpf_get/setsockopt. The later patches will implement the
> > > >> BPF networking timestamping. The BPF program will use
> > > >> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> > > >> enable the BPF networking timestamping on a socket.
> > > >>
> > > >> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> > > >> ---
> > > >>   include/net/sock.h             |  3 +++
> > > >>   include/uapi/linux/bpf.h       |  8 ++++++++
> > > >>   net/core/filter.c              | 23 +++++++++++++++++++++++
> > > >>   tools/include/uapi/linux/bpf.h |  1 +
> > > >>   4 files changed, 35 insertions(+)
> > > >>
> > > >> diff --git a/include/net/sock.h b/include/net/sock.h
> > > >> index 8036b3b79cd8..7916982343c6 100644
> > > >> --- a/include/net/sock.h
> > > >> +++ b/include/net/sock.h
> > > >> @@ -303,6 +303,7 @@ struct sk_filter;
> > > >>     *        @sk_stamp: time stamp of last packet received
> > > >>     *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
> > > >>     *        @sk_tsflags: SO_TIMESTAMPING flags
> > > >> +  * @sk_bpf_cb_flags: used in bpf_setsockopt()
> > > >>     *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
> > > >>     *                           Sockets that can be used under memory reclaim should
> > > >>     *                           set this to false.
> > > >> @@ -445,6 +446,8 @@ struct sock {
> > > >>      u32                     sk_reserved_mem;
> > > >>      int                     sk_forward_alloc;
> > > >>      u32                     sk_tsflags;
> > > >> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> > > >> +    u32                     sk_bpf_cb_flags;
> > > >>      __cacheline_group_end(sock_write_rxtx);
> > > >
> > > > So far only one bit is defined. Does this have to be a 32-bit field in
> > > > every socket?
> > >
> > > iirc, I think there were multiple callback (cb) flags/bits in the earlier
> > > revisions, but it had been simplified to one bit in the later revisions.
> > >
> > > It's an internal implementation detail. We can reuse some free bits from another
> > > variable for now. Probably get a bit from sk_tsflags? SOCKCM_FLAG_TS_OPT_ID uses
> > > BIT(31). Maybe a new SK_TS_FLAG_BPF_TX that uses BIT(30)? I don't have a strong
> > > preference on the name.
> > >
> > > When the BPF program calls bpf_setsockopt(SK_BPF_CB_FLAGS,
> > > SK_BPF_CB_TX_TIMESTAMPING), the kernel will set/test the BIT(30) of sk_tsflags.
> > >
> > > We can wait until there are more socket-level cb flags in the future (e.g., more
> > > SK_BPF_CB_XXX will be needed) before adding a dedicated int field in the sock.
> >
> > Sorry, I still preferred the way we've discussed already:
>
> Adding fields to structs in the hot path is a tragedy of the commons.
>
> Every developer focuses on their specific workload and pet feature,
> while imposing a cost on everyone else.
>
> We have a duty to be frugal and mitigate this cost where possible.
> Especially for a feature that is likely to be used sparingly.

Point taken and I agree on this point all along. Don't get me wrong. I
insisted on using sk_bpf_cb_flags only, not the size of this field.
Please see details below.

>
> > 1) Introducing a new field sk_bpf_cb_flags extends the use for bpf
> > timestamping, more than SK_BPF_CB_TX_TIMESTAMPING one flag. I think
> > SK_BPF_CB_RX_TIMESTAMPING is also needed in the next move. And more
> > subfeatures (like bpf extension for OPT_ID) will use it. It gives us a
> > separate way to do more things based on this bpf timestamping.
> > 2) sk_bpf_cb_flags provides a way to let the socket-level use new
> > features for bpf while now we only have a tcp_sock-level, namely,
> > bpf_sock_ops_cb_flags. It's obviously good for others.
> >
> > It's the first move to open the gate for socket-level usage for BPF,
>
> Can you give a short list of bits that you could see being used, to
> get an idea of the count. In my mind this is a very short list, not
> worth reserving 32 bits for. But you might have more developed plans.

RX is one flag that we're going to add. Another new bit might be used
for bpf extension of OPT_ID to handle the udp lockless case, or
tracing every skb (which needs more official discussion) 'm not that
sure.

>
> > just like how TCP_BPF_SOCK_OPS_CB_FLAGS works in sol_tcp_sockopt(). So
> > I hope we will not abandon this good approach :(
> >
> > Now I wonder if I should use the u8 sk_bpf_cb_flags in V13 or just
> > keep it as-is? Either way is fine with me :) bpf_sock_ops_cb_flags
> > uses u8 as an example, thus I think we prefer the former?
>
> If it fits in a u8 and that in practice also results in less memory
> and cache pressure (i.e., does not just add a 24b hole), then it is a
> net improvement.

Probably I didn't state it clearly. I agree with you on saving memory:)

In the previous response, I was trying to keep the sk_bpf_cb_flags
flag and use a u8 instead. I admit u32 is too large after you noticed
this.

Would the following diff on top of this series be acceptable for you?
And would it be a proper place to put the u8 sk_bpf_cb_flags in struct
sock?
diff --git a/include/net/sock.h b/include/net/sock.h
index 6f4d54faba92..e85d6fb3a2ba 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -447,7 +447,7 @@ struct sock {
        int                     sk_forward_alloc;
        u32                     sk_tsflags;
 #define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
-       u32                     sk_bpf_cb_flags;
+       u8                      sk_bpf_cb_flags;
        __cacheline_group_end(sock_write_rxtx);

        __cacheline_group_begin(sock_write_tx);

The following output is the result of running 'pahole --hex -C sock vmlinux'.
Before this series:
        u32                        sk_tsflags;           /* 0x168   0x4 */
        __u8
__cacheline_group_end__sock_write_rxtx[0]; /* 0x16c     0 */
        __u8
__cacheline_group_begin__sock_write_tx[0]; /* 0x16c     0 */
        int                        sk_write_pending;     /* 0x16c   0x4 */
        atomic_t                   sk_omem_alloc;        /* 0x170   0x4 */
        int                        sk_sndbuf;            /* 0x174   0x4 */
        int                        sk_wmem_queued;       /* 0x178   0x4 */
        refcount_t                 sk_wmem_alloc;        /* 0x17c   0x4 */
        /* --- cacheline 6 boundary (384 bytes) --- */
        long unsigned int          sk_tsq_flags;         /* 0x180   0x8 */
...
/* sum members: 773, holes: 1, sum holes: 1 */

After this diff patch:
        u32                        sk_tsflags;           /* 0x168   0x4 */
        u8                         sk_bpf_cb_flags;      /* 0x16c   0x1 */
        __u8
__cacheline_group_end__sock_write_rxtx[0]; /* 0x16d     0 */
        __u8
__cacheline_group_begin__sock_write_tx[0]; /* 0x16d     0 */

        /* XXX 3 bytes hole, try to pack */

        int                        sk_write_pending;     /* 0x170   0x4 */
        atomic_t                   sk_omem_alloc;        /* 0x174   0x4 */
        int                        sk_sndbuf;            /* 0x178   0x4 */
        int                        sk_wmem_queued;       /* 0x17c   0x4 */
        /* --- cacheline 6 boundary (384 bytes) --- */
        refcount_t                 sk_wmem_alloc;        /* 0x180   0x4 */

        /* XXX 4 bytes hole, try to pack */

        long unsigned int          sk_tsq_flags;         /* 0x188   0x8 */
...
/* sum members: 774, holes: 3, sum holes: 8 */

It will introduce 7 extra sum holes if this series with this u8 change
gets applied. I think it's a proper position because this new
sk_bpf_cb_flags will be used in the tx and rx path just like
sk_tsflags, aligned with rules introduced by the commit[1].

I'd like to know your opinion here. Hopefully we can let this series
pass in the next re-spin:)

Thanks in advance.

[1]
commit 5d4cc87414c5d11345c4b11d61377d351b5c28a2
Author: Eric Dumazet <edumazet@google.com>
Date:   Fri Feb 16 16:20:06 2024 +0000

    net: reorganize "struct sock" fields

    Last major reorg happened in commit 9115e8cd2a0c ("net: reorganize
    struct sock for better data locality")

    Since then, many changes have been done.

    Before SO_PEEK_OFF support is added to TCP, we need
    to move sk_peek_off to a better location.

    It is time to make another pass, and add six groups,
    without explicit alignment.

    - sock_write_rx (following sk_refcnt) read-write fields in rx path.
    - sock_read_rx read-mostly fields in rx path.
    - sock_read_rxtx read-mostly fields in both rx and tx paths.
    - sock_write_rxtx read-write fields in both rx and tx paths.
    - sock_write_tx read-write fields in tx paths.
    - sock_read_tx read-mostly fields in tx paths.

Thanks,
Jason
Jason Xing Feb. 19, 2025, 7:03 a.m. UTC | #6
On Tue, Feb 18, 2025 at 1:02 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> added to bpf_get/setsockopt. The later patches will implement the
> BPF networking timestamping. The BPF program will use
> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> enable the BPF networking timestamping on a socket.
>
> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> ---
>  include/net/sock.h             |  3 +++
>  include/uapi/linux/bpf.h       |  8 ++++++++
>  net/core/filter.c              | 23 +++++++++++++++++++++++
>  tools/include/uapi/linux/bpf.h |  1 +
>  4 files changed, 35 insertions(+)
>
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 8036b3b79cd8..7916982343c6 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -303,6 +303,7 @@ struct sk_filter;
>    *    @sk_stamp: time stamp of last packet received
>    *    @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
>    *    @sk_tsflags: SO_TIMESTAMPING flags
> +  *    @sk_bpf_cb_flags: used in bpf_setsockopt()
>    *    @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
>    *                       Sockets that can be used under memory reclaim should
>    *                       set this to false.
> @@ -445,6 +446,8 @@ struct sock {
>         u32                     sk_reserved_mem;
>         int                     sk_forward_alloc;
>         u32                     sk_tsflags;
> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> +       u32                     sk_bpf_cb_flags;
>         __cacheline_group_end(sock_write_rxtx);
>
>         __cacheline_group_begin(sock_write_tx);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index fff6cdb8d11a..fa666d51dffe 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6916,6 +6916,13 @@ enum {
>         BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
>  };
>
> +/* Definitions for bpf_sk_cb_flags */

nit: s/bpf_sk_cb_flags/sk_bpf_cb_flags

I will correct it.

> +enum {
> +       SK_BPF_CB_TX_TIMESTAMPING       = 1<<0,
> +       SK_BPF_CB_MASK                  = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
> +                                          SK_BPF_CB_TX_TIMESTAMPING
> +};

Martin, I would like to know if it's necessary to update the above new
enum in tools/include/uapi/linux/bpf.h as well?

Thanks,
Jason
Willem de Bruijn Feb. 19, 2025, 3:12 p.m. UTC | #7
> > > Now I wonder if I should use the u8 sk_bpf_cb_flags in V13 or just
> > > keep it as-is? Either way is fine with me :) bpf_sock_ops_cb_flags
> > > uses u8 as an example, thus I think we prefer the former?
> >
> > If it fits in a u8 and that in practice also results in less memory
> > and cache pressure (i.e., does not just add a 24b hole), then it is a
> > net improvement.
> 
> Probably I didn't state it clearly. I agree with you on saving memory:)
> 
> In the previous response, I was trying to keep the sk_bpf_cb_flags
> flag and use a u8 instead. I admit u32 is too large after you noticed
> this.
> 
> Would the following diff on top of this series be acceptable for you?
> And would it be a proper place to put the u8 sk_bpf_cb_flags in struct
> sock?
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 6f4d54faba92..e85d6fb3a2ba 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -447,7 +447,7 @@ struct sock {
>         int                     sk_forward_alloc;
>         u32                     sk_tsflags;
>  #define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> -       u32                     sk_bpf_cb_flags;
> +       u8                      sk_bpf_cb_flags;
>         __cacheline_group_end(sock_write_rxtx);
> 
>         __cacheline_group_begin(sock_write_tx);
> 
> The following output is the result of running 'pahole --hex -C sock vmlinux'.
> Before this series:
>         u32                        sk_tsflags;           /* 0x168   0x4 */
>         __u8
> __cacheline_group_end__sock_write_rxtx[0]; /* 0x16c     0 */
>         __u8
> __cacheline_group_begin__sock_write_tx[0]; /* 0x16c     0 */
>         int                        sk_write_pending;     /* 0x16c   0x4 */
>         atomic_t                   sk_omem_alloc;        /* 0x170   0x4 */
>         int                        sk_sndbuf;            /* 0x174   0x4 */
>         int                        sk_wmem_queued;       /* 0x178   0x4 */
>         refcount_t                 sk_wmem_alloc;        /* 0x17c   0x4 */
>         /* --- cacheline 6 boundary (384 bytes) --- */
>         long unsigned int          sk_tsq_flags;         /* 0x180   0x8 */
> ...
> /* sum members: 773, holes: 1, sum holes: 1 */
> 
> After this diff patch:
>         u32                        sk_tsflags;           /* 0x168   0x4 */
>         u8                         sk_bpf_cb_flags;      /* 0x16c   0x1 */
>         __u8
> __cacheline_group_end__sock_write_rxtx[0]; /* 0x16d     0 */
>         __u8
> __cacheline_group_begin__sock_write_tx[0]; /* 0x16d     0 */
> 
>         /* XXX 3 bytes hole, try to pack */
> 
>         int                        sk_write_pending;     /* 0x170   0x4 */
>         atomic_t                   sk_omem_alloc;        /* 0x174   0x4 */
>         int                        sk_sndbuf;            /* 0x178   0x4 */
>         int                        sk_wmem_queued;       /* 0x17c   0x4 */
>         /* --- cacheline 6 boundary (384 bytes) --- */
>         refcount_t                 sk_wmem_alloc;        /* 0x180   0x4 */
> 
>         /* XXX 4 bytes hole, try to pack */
> 
>         long unsigned int          sk_tsq_flags;         /* 0x188   0x8 */
> ...
> /* sum members: 774, holes: 3, sum holes: 8 */
> 
> It will introduce 7 extra sum holes if this series with this u8 change
> gets applied. I think it's a proper position because this new
> sk_bpf_cb_flags will be used in the tx and rx path just like
> sk_tsflags, aligned with rules introduced by the commit[1].

Reducing a u64 to u8 can leave 7b of holes, but that is not great,
of course.

Since this bitmap is only touched if a BPF program is loaded, arguably
it need not be in the hot path cacheline groups.

Can you find a hole further down to place this in, or at least a spot
that does not result in 7b of wasted space (in the hotpath cacheline
groups of all places).
Martin KaFai Lau Feb. 19, 2025, 7:48 p.m. UTC | #8
On 2/18/25 11:03 PM, Jason Xing wrote:
> On Tue, Feb 18, 2025 at 1:02 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>>
>> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
>> added to bpf_get/setsockopt. The later patches will implement the
>> BPF networking timestamping. The BPF program will use
>> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
>> enable the BPF networking timestamping on a socket.
>>
>> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
>> ---
>>   include/net/sock.h             |  3 +++
>>   include/uapi/linux/bpf.h       |  8 ++++++++
>>   net/core/filter.c              | 23 +++++++++++++++++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   4 files changed, 35 insertions(+)
>>
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 8036b3b79cd8..7916982343c6 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -303,6 +303,7 @@ struct sk_filter;
>>     *    @sk_stamp: time stamp of last packet received
>>     *    @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
>>     *    @sk_tsflags: SO_TIMESTAMPING flags
>> +  *    @sk_bpf_cb_flags: used in bpf_setsockopt()
>>     *    @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
>>     *                       Sockets that can be used under memory reclaim should
>>     *                       set this to false.
>> @@ -445,6 +446,8 @@ struct sock {
>>          u32                     sk_reserved_mem;
>>          int                     sk_forward_alloc;
>>          u32                     sk_tsflags;
>> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
>> +       u32                     sk_bpf_cb_flags;
>>          __cacheline_group_end(sock_write_rxtx);
>>
>>          __cacheline_group_begin(sock_write_tx);
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index fff6cdb8d11a..fa666d51dffe 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -6916,6 +6916,13 @@ enum {
>>          BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
>>   };
>>
>> +/* Definitions for bpf_sk_cb_flags */
> 
> nit: s/bpf_sk_cb_flags/sk_bpf_cb_flags
> 
> I will correct it.
> 
>> +enum {
>> +       SK_BPF_CB_TX_TIMESTAMPING       = 1<<0,
>> +       SK_BPF_CB_MASK                  = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
>> +                                          SK_BPF_CB_TX_TIMESTAMPING
>> +};
> 
> Martin, I would like to know if it's necessary to update the above new
> enum in tools/include/uapi/linux/bpf.h as well?

Yes, the tools/include/uapi/linux/bpf.h should be updated. If you diff them, two 
of them should be exactly the same. This patch should do the same to keep the 
tools bpf.h up-to-date.

For other headers in tools/include/uapi, I guess it depends. e.g. the tcp.h in 
your another RTO patch, the two tcp.h files are very different already and the 
selftest does not need the new macro either.
Jason Xing Feb. 20, 2025, 12:04 a.m. UTC | #9
On Wed, Feb 19, 2025 at 11:12 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> > > > Now I wonder if I should use the u8 sk_bpf_cb_flags in V13 or just
> > > > keep it as-is? Either way is fine with me :) bpf_sock_ops_cb_flags
> > > > uses u8 as an example, thus I think we prefer the former?
> > >
> > > If it fits in a u8 and that in practice also results in less memory
> > > and cache pressure (i.e., does not just add a 24b hole), then it is a
> > > net improvement.
> >
> > Probably I didn't state it clearly. I agree with you on saving memory:)
> >
> > In the previous response, I was trying to keep the sk_bpf_cb_flags
> > flag and use a u8 instead. I admit u32 is too large after you noticed
> > this.
> >
> > Would the following diff on top of this series be acceptable for you?
> > And would it be a proper place to put the u8 sk_bpf_cb_flags in struct
> > sock?
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 6f4d54faba92..e85d6fb3a2ba 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -447,7 +447,7 @@ struct sock {
> >         int                     sk_forward_alloc;
> >         u32                     sk_tsflags;
> >  #define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> > -       u32                     sk_bpf_cb_flags;
> > +       u8                      sk_bpf_cb_flags;
> >         __cacheline_group_end(sock_write_rxtx);
> >
> >         __cacheline_group_begin(sock_write_tx);
> >
> > The following output is the result of running 'pahole --hex -C sock vmlinux'.
> > Before this series:
> >         u32                        sk_tsflags;           /* 0x168   0x4 */
> >         __u8
> > __cacheline_group_end__sock_write_rxtx[0]; /* 0x16c     0 */
> >         __u8
> > __cacheline_group_begin__sock_write_tx[0]; /* 0x16c     0 */
> >         int                        sk_write_pending;     /* 0x16c   0x4 */
> >         atomic_t                   sk_omem_alloc;        /* 0x170   0x4 */
> >         int                        sk_sndbuf;            /* 0x174   0x4 */
> >         int                        sk_wmem_queued;       /* 0x178   0x4 */
> >         refcount_t                 sk_wmem_alloc;        /* 0x17c   0x4 */
> >         /* --- cacheline 6 boundary (384 bytes) --- */
> >         long unsigned int          sk_tsq_flags;         /* 0x180   0x8 */
> > ...
> > /* sum members: 773, holes: 1, sum holes: 1 */
> >
> > After this diff patch:
> >         u32                        sk_tsflags;           /* 0x168   0x4 */
> >         u8                         sk_bpf_cb_flags;      /* 0x16c   0x1 */
> >         __u8
> > __cacheline_group_end__sock_write_rxtx[0]; /* 0x16d     0 */
> >         __u8
> > __cacheline_group_begin__sock_write_tx[0]; /* 0x16d     0 */
> >
> >         /* XXX 3 bytes hole, try to pack */
> >
> >         int                        sk_write_pending;     /* 0x170   0x4 */
> >         atomic_t                   sk_omem_alloc;        /* 0x174   0x4 */
> >         int                        sk_sndbuf;            /* 0x178   0x4 */
> >         int                        sk_wmem_queued;       /* 0x17c   0x4 */
> >         /* --- cacheline 6 boundary (384 bytes) --- */
> >         refcount_t                 sk_wmem_alloc;        /* 0x180   0x4 */
> >
> >         /* XXX 4 bytes hole, try to pack */
> >
> >         long unsigned int          sk_tsq_flags;         /* 0x188   0x8 */
> > ...
> > /* sum members: 774, holes: 3, sum holes: 8 */
> >
> > It will introduce 7 extra sum holes if this series with this u8 change
> > gets applied. I think it's a proper position because this new
> > sk_bpf_cb_flags will be used in the tx and rx path just like
> > sk_tsflags, aligned with rules introduced by the commit[1].
>
> Reducing a u64 to u8 can leave 7b of holes, but that is not great,
> of course.
>
> Since this bitmap is only touched if a BPF program is loaded, arguably
> it need not be in the hot path cacheline groups.

Point taken.

>
> Can you find a hole further down to place this in, or at least a spot
> that does not result in 7b of wasted space (in the hotpath cacheline
> groups of all places).

There is one place where I can simply insert the flag.

The diff patch on top of this series is:
diff --git a/include/net/sock.h b/include/net/sock.h
index e85d6fb3a2ba..9fa27693fb02 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -446,8 +446,6 @@ struct sock {
        u32                     sk_reserved_mem;
        int                     sk_forward_alloc;
        u32                     sk_tsflags;
-#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
-       u8                      sk_bpf_cb_flags;
        __cacheline_group_end(sock_write_rxtx);

        __cacheline_group_begin(sock_write_tx);
@@ -528,6 +526,8 @@ struct sock {
        u8                      sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+       u8                      sk_bpf_cb_flags;

        void                    *sk_user_data;
 #ifdef CONFIG_SECURITY


1) before applying the whole series:
...
        /* --- cacheline 10 boundary (640 bytes) --- */
        ktime_t                    sk_stamp;             /* 0x280   0x8 */
        int                        sk_disconnects;       /* 0x288   0x4 */
        u8                         sk_txrehash;          /* 0x28c   0x1 */
        u8                         sk_clockid;           /* 0x28d   0x1 */
        u8                         sk_txtime_deadline_mode:1; /* 0x28e: 0 0x1 */
        u8                         sk_txtime_report_errors:1; /*
0x28e:0x1 0x1 */
        u8                         sk_txtime_unused:6;   /* 0x28e:0x2 0x1 */

        /* XXX 1 byte hole, try to pack */

        void *                     sk_user_data;         /* 0x290   0x8 */
        void *                     sk_security;          /* 0x298   0x8 */
        struct sock_cgroup_data    sk_cgrp_data;         /* 0x2a0  0x10 */
...
/* sum members: 773, holes: 1, sum holes: 1 */


2) after applying the series with the above diff patch:
...
        /* --- cacheline 10 boundary (640 bytes) --- */
        ktime_t                    sk_stamp;             /* 0x280   0x8 */
        int                        sk_disconnects;       /* 0x288   0x4 */
        u8                         sk_txrehash;          /* 0x28c   0x1 */
        u8                         sk_clockid;           /* 0x28d   0x1 */
        u8                         sk_txtime_deadline_mode:1; /* 0x28e: 0 0x1 */
        u8                         sk_txtime_report_errors:1; /*
0x28e:0x1 0x1 */
        u8                         sk_txtime_unused:6;   /* 0x28e:0x2 0x1 */
        u8                         sk_bpf_cb_flags;      /* 0x28f   0x1 */
        void *                     sk_user_data;         /* 0x290
0x8 */
        void *                     sk_security;          /* 0x298   0x8 */
        struct sock_cgroup_data    sk_cgrp_data;         /* 0x2a0  0x10 */
...
/* sum members: 774 */

It turns out that the new sk_bpf_cb_flags fills the hole exactly. The
new field and some of its nearby fields are quite similar because they
are only/nearly written during the creation or setsockopt phase.

I think now it's a good place to insert the new flag?

Thanks,
Jason
Jason Xing Feb. 20, 2025, 12:05 a.m. UTC | #10
On Thu, Feb 20, 2025 at 3:48 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 2/18/25 11:03 PM, Jason Xing wrote:
> > On Tue, Feb 18, 2025 at 1:02 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >>
> >> The new SK_BPF_CB_FLAGS and new SK_BPF_CB_TX_TIMESTAMPING are
> >> added to bpf_get/setsockopt. The later patches will implement the
> >> BPF networking timestamping. The BPF program will use
> >> bpf_setsockopt(SK_BPF_CB_FLAGS, SK_BPF_CB_TX_TIMESTAMPING) to
> >> enable the BPF networking timestamping on a socket.
> >>
> >> Signed-off-by: Jason Xing <kerneljasonxing@gmail.com>
> >> ---
> >>   include/net/sock.h             |  3 +++
> >>   include/uapi/linux/bpf.h       |  8 ++++++++
> >>   net/core/filter.c              | 23 +++++++++++++++++++++++
> >>   tools/include/uapi/linux/bpf.h |  1 +
> >>   4 files changed, 35 insertions(+)
> >>
> >> diff --git a/include/net/sock.h b/include/net/sock.h
> >> index 8036b3b79cd8..7916982343c6 100644
> >> --- a/include/net/sock.h
> >> +++ b/include/net/sock.h
> >> @@ -303,6 +303,7 @@ struct sk_filter;
> >>     *    @sk_stamp: time stamp of last packet received
> >>     *    @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
> >>     *    @sk_tsflags: SO_TIMESTAMPING flags
> >> +  *    @sk_bpf_cb_flags: used in bpf_setsockopt()
> >>     *    @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
> >>     *                       Sockets that can be used under memory reclaim should
> >>     *                       set this to false.
> >> @@ -445,6 +446,8 @@ struct sock {
> >>          u32                     sk_reserved_mem;
> >>          int                     sk_forward_alloc;
> >>          u32                     sk_tsflags;
> >> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> >> +       u32                     sk_bpf_cb_flags;
> >>          __cacheline_group_end(sock_write_rxtx);
> >>
> >>          __cacheline_group_begin(sock_write_tx);
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index fff6cdb8d11a..fa666d51dffe 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -6916,6 +6916,13 @@ enum {
> >>          BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
> >>   };
> >>
> >> +/* Definitions for bpf_sk_cb_flags */
> >
> > nit: s/bpf_sk_cb_flags/sk_bpf_cb_flags
> >
> > I will correct it.
> >
> >> +enum {
> >> +       SK_BPF_CB_TX_TIMESTAMPING       = 1<<0,
> >> +       SK_BPF_CB_MASK                  = (SK_BPF_CB_TX_TIMESTAMPING - 1) |
> >> +                                          SK_BPF_CB_TX_TIMESTAMPING
> >> +};
> >
> > Martin, I would like to know if it's necessary to update the above new
> > enum in tools/include/uapi/linux/bpf.h as well?
>
> Yes, the tools/include/uapi/linux/bpf.h should be updated. If you diff them, two
> of them should be exactly the same. This patch should do the same to keep the
> tools bpf.h up-to-date.
>
> For other headers in tools/include/uapi, I guess it depends. e.g. the tcp.h in
> your another RTO patch, the two tcp.h files are very different already and the
> selftest does not need the new macro either.

I learned. Thanks.
Willem de Bruijn Feb. 20, 2025, 2:46 a.m. UTC | #11
> > Can you find a hole further down to place this in, or at least a spot
> > that does not result in 7b of wasted space (in the hotpath cacheline
> > groups of all places).
> 
> There is one place where I can simply insert the flag.
> 
> The diff patch on top of this series is:
> diff --git a/include/net/sock.h b/include/net/sock.h
> index e85d6fb3a2ba..9fa27693fb02 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -446,8 +446,6 @@ struct sock {
>         u32                     sk_reserved_mem;
>         int                     sk_forward_alloc;
>         u32                     sk_tsflags;
> -#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> -       u8                      sk_bpf_cb_flags;
>         __cacheline_group_end(sock_write_rxtx);
> 
>         __cacheline_group_begin(sock_write_tx);
> @@ -528,6 +526,8 @@ struct sock {
>         u8                      sk_txtime_deadline_mode : 1,
>                                 sk_txtime_report_errors : 1,
>                                 sk_txtime_unused : 6;
> +#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
> +       u8                      sk_bpf_cb_flags;
> 
>         void                    *sk_user_data;
>  #ifdef CONFIG_SECURITY
> 
> 
> 1) before applying the whole series:
> ...
>         /* --- cacheline 10 boundary (640 bytes) --- */
>         ktime_t                    sk_stamp;             /* 0x280   0x8 */
>         int                        sk_disconnects;       /* 0x288   0x4 */
>         u8                         sk_txrehash;          /* 0x28c   0x1 */
>         u8                         sk_clockid;           /* 0x28d   0x1 */
>         u8                         sk_txtime_deadline_mode:1; /* 0x28e: 0 0x1 */
>         u8                         sk_txtime_report_errors:1; /*
> 0x28e:0x1 0x1 */
>         u8                         sk_txtime_unused:6;   /* 0x28e:0x2 0x1 */
> 
>         /* XXX 1 byte hole, try to pack */
> 
>         void *                     sk_user_data;         /* 0x290   0x8 */
>         void *                     sk_security;          /* 0x298   0x8 */
>         struct sock_cgroup_data    sk_cgrp_data;         /* 0x2a0  0x10 */
> ...
> /* sum members: 773, holes: 1, sum holes: 1 */
> 
> 
> 2) after applying the series with the above diff patch:
> ...
>         /* --- cacheline 10 boundary (640 bytes) --- */
>         ktime_t                    sk_stamp;             /* 0x280   0x8 */
>         int                        sk_disconnects;       /* 0x288   0x4 */
>         u8                         sk_txrehash;          /* 0x28c   0x1 */
>         u8                         sk_clockid;           /* 0x28d   0x1 */
>         u8                         sk_txtime_deadline_mode:1; /* 0x28e: 0 0x1 */
>         u8                         sk_txtime_report_errors:1; /*
> 0x28e:0x1 0x1 */
>         u8                         sk_txtime_unused:6;   /* 0x28e:0x2 0x1 */
>         u8                         sk_bpf_cb_flags;      /* 0x28f   0x1 */
>         void *                     sk_user_data;         /* 0x290
> 0x8 */
>         void *                     sk_security;          /* 0x298   0x8 */
>         struct sock_cgroup_data    sk_cgrp_data;         /* 0x2a0  0x10 */
> ...
> /* sum members: 774 */
> 
> It turns out that the new sk_bpf_cb_flags fills the hole exactly. The
> new field and some of its nearby fields are quite similar because they
> are only/nearly written during the creation or setsockopt phase.
> 
> I think now it's a good place to insert the new flag?

Thanks. This seems fine to me.
diff mbox series

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 8036b3b79cd8..7916982343c6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -303,6 +303,7 @@  struct sk_filter;
   *	@sk_stamp: time stamp of last packet received
   *	@sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
   *	@sk_tsflags: SO_TIMESTAMPING flags
+  *	@sk_bpf_cb_flags: used in bpf_setsockopt()
   *	@sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
   *			   Sockets that can be used under memory reclaim should
   *			   set this to false.
@@ -445,6 +446,8 @@  struct sock {
 	u32			sk_reserved_mem;
 	int			sk_forward_alloc;
 	u32			sk_tsflags;
+#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
+	u32			sk_bpf_cb_flags;
 	__cacheline_group_end(sock_write_rxtx);
 
 	__cacheline_group_begin(sock_write_tx);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index fff6cdb8d11a..fa666d51dffe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6916,6 +6916,13 @@  enum {
 	BPF_SOCK_OPS_ALL_CB_FLAGS       = 0x7F,
 };
 
+/* Definitions for bpf_sk_cb_flags */
+enum {
+	SK_BPF_CB_TX_TIMESTAMPING	= 1<<0,
+	SK_BPF_CB_MASK			= (SK_BPF_CB_TX_TIMESTAMPING - 1) |
+					   SK_BPF_CB_TX_TIMESTAMPING
+};
+
 /* List of known BPF sock_ops operators.
  * New entries can only be added at the end
  */
@@ -7094,6 +7101,7 @@  enum {
 	TCP_BPF_SYN_IP		= 1006, /* Copy the IP[46] and TCP header */
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+	SK_BPF_CB_FLAGS		= 1009, /* Used to set socket bpf flags */
 };
 
 enum {
diff --git a/net/core/filter.c b/net/core/filter.c
index 2ec162dd83c4..1c6c07507a78 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5222,6 +5222,25 @@  static const struct bpf_func_proto bpf_get_socket_uid_proto = {
 	.arg1_type      = ARG_PTR_TO_CTX,
 };
 
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
+{
+	u32 sk_bpf_cb_flags;
+
+	if (getopt) {
+		*(u32 *)optval = sk->sk_bpf_cb_flags;
+		return 0;
+	}
+
+	sk_bpf_cb_flags = *(u32 *)optval;
+
+	if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+		return -EINVAL;
+
+	sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+	return 0;
+}
+
 static int sol_socket_sockopt(struct sock *sk, int optname,
 			      char *optval, int *optlen,
 			      bool getopt)
@@ -5238,6 +5257,7 @@  static int sol_socket_sockopt(struct sock *sk, int optname,
 	case SO_MAX_PACING_RATE:
 	case SO_BINDTOIFINDEX:
 	case SO_TXREHASH:
+	case SK_BPF_CB_FLAGS:
 		if (*optlen != sizeof(int))
 			return -EINVAL;
 		break;
@@ -5247,6 +5267,9 @@  static int sol_socket_sockopt(struct sock *sk, int optname,
 		return -EINVAL;
 	}
 
+	if (optname == SK_BPF_CB_FLAGS)
+		return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
 	if (getopt) {
 		if (optname == SO_BINDTODEVICE)
 			return -EINVAL;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2acf9b336371..70366f74ef4e 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7091,6 +7091,7 @@  enum {
 	TCP_BPF_SYN_IP		= 1006, /* Copy the IP[46] and TCP header */
 	TCP_BPF_SYN_MAC         = 1007, /* Copy the MAC, IP[46], and TCP header */
 	TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */
+	SK_BPF_CB_FLAGS		= 1009, /* Used to set socket bpf flags */
 };
 
 enum {