Message ID | 20241028110535.82999-11-kerneljasonxing@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net-timestamp: bpf extension to equip applications transparently | expand |
Jason Xing wrote: > From: Jason Xing <kernelxing@tencent.com> > > Use the offset to record the delta value between current socket key > and bpf socket key. > > 1. If there is only bpf feature running, the socket key is bpf socket > key and the offset is zero; > 2. If there is only traditional feature running, and then bpf feature > is turned on, the socket key is still used by the former while the offset > is the delta between them; > 3. if there is only bpf feature running, and then application uses it, > the socket key would be re-init for application and the offset is the > delta. We need to also figure out the rare conflict when one user sets OPT_ID | OPT_ID_TCP while the other only uses OPT_ID. It is so obscure, that perhaps we can punt and say that the BPF program just has to follow the application preference and be aware of the subtle difference. > Signed-off-by: Jason Xing <kernelxing@tencent.com> > --- > include/net/sock.h | 1 + > net/core/skbuff.c | 15 ++++++++--- > net/core/sock.c | 66 ++++++++++++++++++++++++++++++++++++++-------- > 3 files changed, 68 insertions(+), 14 deletions(-) > > diff --git a/include/net/sock.h b/include/net/sock.h > index 91398b20a4a3..41c6c6f78e55 100644 > --- a/include/net/sock.h > +++ b/include/net/sock.h > @@ -469,6 +469,7 @@ struct sock { > unsigned long sk_pacing_rate; /* bytes per second */ > atomic_t sk_zckey; > atomic_t sk_tskey; > + u32 sk_tskey_bpf_offset; > __cacheline_group_end(sock_write_tx); > > __cacheline_group_begin(sock_read_tx); > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index 0b571306f7ea..d1739317b97d 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) > } > > static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, > + struct sk_buff *skb, > struct skb_shared_hwtstamps *hwtstamps) > { > - u32 args[2] = {0, 0}; > + u32 args[3] = {0, 0, 0}; > u32 tsflags, cb_flag; > > tsflags = READ_ONCE(sk->sk_tsflags_bpf); > @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, > args[1] = ts.tv_nsec; > } > > - timestamp_call_bpf(sk, cb_flag, 2, args); > + if (tsflags & SOF_TIMESTAMPING_OPT_ID) { > + args[2] = skb_shinfo(skb)->tskey; > + if (sk_is_tcp(sk)) > + args[2] -= atomic_read(&sk->sk_tskey); > + if (sk->sk_tskey_bpf_offset) > + args[2] += sk->sk_tskey_bpf_offset; > + } > + > + timestamp_call_bpf(sk, cb_flag, 3, args); So the BPF interface is effectively OPT_TSONLY: the packet data is never shared. Then OPT_ID should be mandatory, because it without it the data is not actionable: which byte in the bytestream or packet in the case of datagram sockets does a callback refer to. > +/* Used to track the tskey for bpf extension > + * > + * @sk_tskey: bpf extension can use it only when no application uses. > + * Application can use it directly regardless of bpf extension. > + * > + * There are three strategies: > + * 1) If we've already set through setsockopt() and here we're going to set > + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > + * keep the record of delta between the current "key" and previous key. > + * 2) If we've already set through bpf_setsockopt() and here we're going to > + * set for application use, we will record the delta first and then > + * override/initialize the @sk_tskey. > + * 3) other cases, which means only either of them takes effect, so initialize > + * everything simplely. > + */ Please explain in the commit message that these gymnastics are needed because there can only be one tskey in skb_shared_info. > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > +{ > + u32 tskey; > + > + if (sk_is_tcp(sk)) { > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > + return -EINVAL; > + > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > + tskey = tcp_sk(sk)->write_seq; > + else > + tskey = tcp_sk(sk)->snd_una; > + } else { > + tskey = 0; > + } > + > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > + return 0; > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > + } else { > + sk->sk_tskey_bpf_offset = 0; > + } > + > + return tskey; > +} > + > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > { > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > if (val & SOF_TIMESTAMPING_OPT_ID && > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > - if (sk_is_tcp(sk)) { > - if ((1 << sk->sk_state) & > - (TCPF_CLOSE | TCPF_LISTEN)) > - return -EINVAL; > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > - else > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > - } else { > - atomic_set(&sk->sk_tskey, 0); > - } > + long int ret; > + > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > + if (ret <= 0) > + return ret; > + > + atomic_set(&sk->sk_tskey, ret); > } > > return 0; > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > struct so_timestamping timestamping) > { > u32 flags = timestamping.flags; > + int ret; > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > return -EINVAL; > > + ret = sock_set_tskey(sk, flags, 1); > + if (ret) > + return ret; > + > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > return 0; I'm a bit hazy on when this can be called. We can assume that this new BPF operation cannot race with the existing setsockopt nor with the datapath that might touch the atomic fields, right?
On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > Jason Xing wrote: > > From: Jason Xing <kernelxing@tencent.com> > > > > Use the offset to record the delta value between current socket key > > and bpf socket key. > > > > 1. If there is only bpf feature running, the socket key is bpf socket > > key and the offset is zero; > > 2. If there is only traditional feature running, and then bpf feature > > is turned on, the socket key is still used by the former while the offset > > is the delta between them; > > 3. if there is only bpf feature running, and then application uses it, > > the socket key would be re-init for application and the offset is the > > delta. > > We need to also figure out the rare conflict when one user sets > OPT_ID | OPT_ID_TCP while the other only uses OPT_ID. I think the current patch handles the case because: 1. sock_calculate_tskey_offset() gets the final key first whether the OPT_ID_TCP is set or not. 2. we will use that tskey to calculate the delta. > > It is so obscure, that perhaps we can punt and say that the BPF > program just has to follow the application preference and be aware of > the subtle difference. Right. > > > Signed-off-by: Jason Xing <kernelxing@tencent.com> > > --- > > include/net/sock.h | 1 + > > net/core/skbuff.c | 15 ++++++++--- > > net/core/sock.c | 66 ++++++++++++++++++++++++++++++++++++++-------- > > 3 files changed, 68 insertions(+), 14 deletions(-) > > > > diff --git a/include/net/sock.h b/include/net/sock.h > > index 91398b20a4a3..41c6c6f78e55 100644 > > --- a/include/net/sock.h > > +++ b/include/net/sock.h > > @@ -469,6 +469,7 @@ struct sock { > > unsigned long sk_pacing_rate; /* bytes per second */ > > atomic_t sk_zckey; > > atomic_t sk_tskey; > > + u32 sk_tskey_bpf_offset; > > __cacheline_group_end(sock_write_tx); > > > > __cacheline_group_begin(sock_read_tx); > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index 0b571306f7ea..d1739317b97d 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) > > } > > > > static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, > > + struct sk_buff *skb, > > struct skb_shared_hwtstamps *hwtstamps) > > { > > - u32 args[2] = {0, 0}; > > + u32 args[3] = {0, 0, 0}; > > u32 tsflags, cb_flag; > > > > tsflags = READ_ONCE(sk->sk_tsflags_bpf); > > @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, > > args[1] = ts.tv_nsec; > > } > > > > - timestamp_call_bpf(sk, cb_flag, 2, args); > > + if (tsflags & SOF_TIMESTAMPING_OPT_ID) { > > + args[2] = skb_shinfo(skb)->tskey; > > + if (sk_is_tcp(sk)) > > + args[2] -= atomic_read(&sk->sk_tskey); > > + if (sk->sk_tskey_bpf_offset) > > + args[2] += sk->sk_tskey_bpf_offset; > > + } > > + > > + timestamp_call_bpf(sk, cb_flag, 3, args); > > > So the BPF interface is effectively OPT_TSONLY: the packet data is > never shared. > > Then OPT_ID should be mandatory, because it without it the data is > not actionable: which byte in the bytestream or packet in the case > of datagram sockets does a callback refer to. It does make sense, I think I will implement it when bpf_setsockopt() is called. > > > +/* Used to track the tskey for bpf extension > > + * > > + * @sk_tskey: bpf extension can use it only when no application uses. > > + * Application can use it directly regardless of bpf extension. > > + * > > + * There are three strategies: > > + * 1) If we've already set through setsockopt() and here we're going to set > > + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > > + * keep the record of delta between the current "key" and previous key. > > + * 2) If we've already set through bpf_setsockopt() and here we're going to > > + * set for application use, we will record the delta first and then > > + * override/initialize the @sk_tskey. > > + * 3) other cases, which means only either of them takes effect, so initialize > > + * everything simplely. > > + */ > > Please explain in the commit message that these gymnastics are needed > because there can only be one tskey in skb_shared_info. No problem. > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > +{ > > + u32 tskey; > > + > > + if (sk_is_tcp(sk)) { > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > + return -EINVAL; > > + > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > + tskey = tcp_sk(sk)->write_seq; > > + else > > + tskey = tcp_sk(sk)->snd_una; > > + } else { > > + tskey = 0; > > + } > > + > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > + return 0; > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > + } else { > > + sk->sk_tskey_bpf_offset = 0; > > + } > > + > > + return tskey; > > +} > > + > > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > { > > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > if (val & SOF_TIMESTAMPING_OPT_ID && > > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > - if (sk_is_tcp(sk)) { > > - if ((1 << sk->sk_state) & > > - (TCPF_CLOSE | TCPF_LISTEN)) > > - return -EINVAL; > > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > > - else > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > > - } else { > > - atomic_set(&sk->sk_tskey, 0); > > - } > > + long int ret; > > + > > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > > + if (ret <= 0) > > + return ret; > > + > > + atomic_set(&sk->sk_tskey, ret); > > } > > > > return 0; > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > > struct so_timestamping timestamping) > > { > > u32 flags = timestamping.flags; > > + int ret; > > > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > > return -EINVAL; > > > > + ret = sock_set_tskey(sk, flags, 1); > > + if (ret) > > + return ret; > > + > > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > > > return 0; > > I'm a bit hazy on when this can be called. We can assume that this new > BPF operation cannot race with the existing setsockopt nor with the > datapath that might touch the atomic fields, right? It surely can race with the existing setsockopt. 1) if (only existing setsockopt works) { then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0. } 2) if (only bpf setsockopt works) { then sk->sk_tskey is set through bpf_setsockopt, sk_tskey_bpf_offset is 0. } 3) if (existing setsockopt already started, here we enable the bpf feature) { then sk->sk_tskey will not change, but the sk_tskey_bpf_offset will be calculated. } 4) if (bpf setsockopt already started, here we enable the application feature) { then sk->sk_tskey will re-initialized/overridden by setsockopt, and the sk_tskey_bpf_offset will be calculated. } Then the skb tskey will use the sk->sk_tskey like before. At last, when we are about to print in bpf extension if we're allowed (by testing the sk_tsflags_bpf), we only need to check if sk_tskey_bpf_offset is zero or not. If the value is zero, it means only the bpf program runs; if not, it means the sk->sk_tskey servers for application feature, we need to compute the real bpf tskey. Please see skb_tstamp_tx_output_bpf(). Above makes sure that two features can work parallelly. It's honestly a little bit complicated. Before writing this part, I drew a few pictures to help me understand how it works. Thanks, Jason
Jason Xing wrote: > On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn > <willemdebruijn.kernel@gmail.com> wrote: > > > > Jason Xing wrote: > > > From: Jason Xing <kernelxing@tencent.com> > > > > > > Use the offset to record the delta value between current socket key > > > and bpf socket key. > > > > > > 1. If there is only bpf feature running, the socket key is bpf socket > > > key and the offset is zero; > > > 2. If there is only traditional feature running, and then bpf feature > > > is turned on, the socket key is still used by the former while the offset > > > is the delta between them; > > > 3. if there is only bpf feature running, and then application uses it, > > > the socket key would be re-init for application and the offset is the > > > delta. > > > > We need to also figure out the rare conflict when one user sets > > OPT_ID | OPT_ID_TCP while the other only uses OPT_ID. > > I think the current patch handles the case because: > 1. sock_calculate_tskey_offset() gets the final key first whether the > OPT_ID_TCP is set or not. > 2. we will use that tskey to calculate the delta. Oh yes of course. Great, then this is resolved. > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > > +{ > > > + u32 tskey; > > > + > > > + if (sk_is_tcp(sk)) { > > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > > + return -EINVAL; > > > + > > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > + tskey = tcp_sk(sk)->write_seq; > > > + else > > > + tskey = tcp_sk(sk)->snd_una; > > > + } else { > > > + tskey = 0; > > > + } > > > + > > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > > + return 0; > > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > > + } else { > > > + sk->sk_tskey_bpf_offset = 0; > > > + } > > > + > > > + return tskey; > > > +} > > > + > > > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > { > > > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > > if (val & SOF_TIMESTAMPING_OPT_ID && > > > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > - if (sk_is_tcp(sk)) { > > > - if ((1 << sk->sk_state) & > > > - (TCPF_CLOSE | TCPF_LISTEN)) > > > - return -EINVAL; > > > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > > > - else > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > > > - } else { > > > - atomic_set(&sk->sk_tskey, 0); > > > - } > > > + long int ret; > > > + > > > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > > > + if (ret <= 0) > > > + return ret; > > > + > > > + atomic_set(&sk->sk_tskey, ret); > > > } > > > > > > return 0; > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > > > struct so_timestamping timestamping) > > > { > > > u32 flags = timestamping.flags; > > > + int ret; > > > > > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > > > return -EINVAL; > > > > > > + ret = sock_set_tskey(sk, flags, 1); > > > + if (ret) > > > + return ret; > > > + > > > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > > > > > return 0; > > > > I'm a bit hazy on when this can be called. We can assume that this new > > BPF operation cannot race with the existing setsockopt nor with the > > datapath that might touch the atomic fields, right? > > It surely can race with the existing setsockopt. > > 1) > if (only existing setsockopt works) { > then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0. > } > > 2) > if (only bpf setsockopt works) { > then sk->sk_tskey is set through bpf_setsockopt, > sk_tskey_bpf_offset is 0. > } > > 3) > if (existing setsockopt already started, here we enable the bpf feature) { > then sk->sk_tskey will not change, but the sk_tskey_bpf_offset > will be calculated. > } > > 4) > if (bpf setsockopt already started, here we enable the application feature) { > then sk->sk_tskey will re-initialized/overridden by > setsockopt, and the sk_tskey_bpf_offset will be calculated. > } > > Then the skb tskey will use the sk->sk_tskey like before. I mean race as in the setsockopt and bpf setsockopt and datapath running concurrently. As long as both variants of setsockopt hold the socket lock, that won't happen. The datapath is lockless for UDP, so atomic_inc sk_tskey can race with calculating the difference. But this is a known issue. A process that cares should not run setsockopt and send concurrently. So this is fine too.
On Tue, Oct 29, 2024 at 11:03 PM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > Jason Xing wrote: > > On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn > > <willemdebruijn.kernel@gmail.com> wrote: > > > > > > Jason Xing wrote: > > > > From: Jason Xing <kernelxing@tencent.com> > > > > > > > > Use the offset to record the delta value between current socket key > > > > and bpf socket key. > > > > > > > > 1. If there is only bpf feature running, the socket key is bpf socket > > > > key and the offset is zero; > > > > 2. If there is only traditional feature running, and then bpf feature > > > > is turned on, the socket key is still used by the former while the offset > > > > is the delta between them; > > > > 3. if there is only bpf feature running, and then application uses it, > > > > the socket key would be re-init for application and the offset is the > > > > delta. > > > > > > We need to also figure out the rare conflict when one user sets > > > OPT_ID | OPT_ID_TCP while the other only uses OPT_ID. > > > > I think the current patch handles the case because: > > 1. sock_calculate_tskey_offset() gets the final key first whether the > > OPT_ID_TCP is set or not. > > 2. we will use that tskey to calculate the delta. > > Oh yes of course. Great, then this is resolved. > > > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > > > +{ > > > > + u32 tskey; > > > > + > > > > + if (sk_is_tcp(sk)) { > > > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > > > + return -EINVAL; > > > > + > > > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > + tskey = tcp_sk(sk)->write_seq; > > > > + else > > > > + tskey = tcp_sk(sk)->snd_una; > > > > + } else { > > > > + tskey = 0; > > > > + } > > > > + > > > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > > > + return 0; > > > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > > > + } else { > > > > + sk->sk_tskey_bpf_offset = 0; > > > > + } > > > > + > > > > + return tskey; > > > > +} > > > > + > > > > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > { > > > > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > > > > if (val & SOF_TIMESTAMPING_OPT_ID && > > > > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > - if (sk_is_tcp(sk)) { > > > > - if ((1 << sk->sk_state) & > > > > - (TCPF_CLOSE | TCPF_LISTEN)) > > > > - return -EINVAL; > > > > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > > > > - else > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > > > > - } else { > > > > - atomic_set(&sk->sk_tskey, 0); > > > > - } > > > > + long int ret; > > > > + > > > > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > > > > + if (ret <= 0) > > > > + return ret; > > > > + > > > > + atomic_set(&sk->sk_tskey, ret); > > > > } > > > > > > > > return 0; > > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > > > > struct so_timestamping timestamping) > > > > { > > > > u32 flags = timestamping.flags; > > > > + int ret; > > > > > > > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > > > > return -EINVAL; > > > > > > > > + ret = sock_set_tskey(sk, flags, 1); > > > > + if (ret) > > > > + return ret; > > > > + > > > > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > > > > > > > return 0; > > > > > > I'm a bit hazy on when this can be called. We can assume that this new > > > BPF operation cannot race with the existing setsockopt nor with the > > > datapath that might touch the atomic fields, right? > > > > It surely can race with the existing setsockopt. > > > > 1) > > if (only existing setsockopt works) { > > then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0. > > } > > > > 2) > > if (only bpf setsockopt works) { > > then sk->sk_tskey is set through bpf_setsockopt, > > sk_tskey_bpf_offset is 0. > > } > > > > 3) > > if (existing setsockopt already started, here we enable the bpf feature) { > > then sk->sk_tskey will not change, but the sk_tskey_bpf_offset > > will be calculated. > > } > > > > 4) > > if (bpf setsockopt already started, here we enable the application feature) { > > then sk->sk_tskey will re-initialized/overridden by > > setsockopt, and the sk_tskey_bpf_offset will be calculated. > > } I will copy the above to the commit message next time in order to provide a clear design to future readers. > > > > Then the skb tskey will use the sk->sk_tskey like before. > > I mean race as in the setsockopt and bpf setsockopt and datapath > running concurrently. > > As long as both variants of setsockopt hold the socket lock, that > won't happen. > > The datapath is lockless for UDP, so atomic_inc sk_tskey can race > with calculating the difference. But this is a known issue. A process > that cares should not run setsockopt and send concurrently. So this is > fine too. Oh, now I see. Thanks for the detailed explanation! So Do you feel if we need to take care of this in the future, I mean, after this series gets merged...? Thanks, Jason
> > > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > > > > +{ > > > > > + u32 tskey; > > > > > + > > > > > + if (sk_is_tcp(sk)) { > > > > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > > > > + return -EINVAL; > > > > > + > > > > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > > + tskey = tcp_sk(sk)->write_seq; > > > > > + else > > > > > + tskey = tcp_sk(sk)->snd_una; > > > > > + } else { > > > > > + tskey = 0; > > > > > + } > > > > > + > > > > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > > > > + return 0; > > > > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > > > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > > > > + } else { > > > > > + sk->sk_tskey_bpf_offset = 0; > > > > > + } > > > > > + > > > > > + return tskey; > > > > > +} > > > > > + > > > > > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > { > > > > > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > > > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > > > > > > if (val & SOF_TIMESTAMPING_OPT_ID && > > > > > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > > - if (sk_is_tcp(sk)) { > > > > > - if ((1 << sk->sk_state) & > > > > > - (TCPF_CLOSE | TCPF_LISTEN)) > > > > > - return -EINVAL; > > > > > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > > > > > - else > > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > > > > > - } else { > > > > > - atomic_set(&sk->sk_tskey, 0); > > > > > - } > > > > > + long int ret; > > > > > + > > > > > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > > > > > + if (ret <= 0) > > > > > + return ret; > > > > > + > > > > > + atomic_set(&sk->sk_tskey, ret); > > > > > } > > > > > > > > > > return 0; > > > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > > > > > struct so_timestamping timestamping) > > > > > { > > > > > u32 flags = timestamping.flags; > > > > > + int ret; > > > > > > > > > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > > > > > return -EINVAL; > > > > > > > > > > + ret = sock_set_tskey(sk, flags, 1); > > > > > + if (ret) > > > > > + return ret; > > > > > + > > > > > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > > > > > > > > > return 0; > > > > > > > > I'm a bit hazy on when this can be called. We can assume that this new > > > > BPF operation cannot race with the existing setsockopt nor with the > > > > datapath that might touch the atomic fields, right? > > > > > > It surely can race with the existing setsockopt. > > > > > > 1) > > > if (only existing setsockopt works) { > > > then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0. > > > } > > > > > > 2) > > > if (only bpf setsockopt works) { > > > then sk->sk_tskey is set through bpf_setsockopt, > > > sk_tskey_bpf_offset is 0. > > > } > > > > > > 3) > > > if (existing setsockopt already started, here we enable the bpf feature) { > > > then sk->sk_tskey will not change, but the sk_tskey_bpf_offset > > > will be calculated. > > > } > > > > > > 4) > > > if (bpf setsockopt already started, here we enable the application feature) { > > > then sk->sk_tskey will re-initialized/overridden by > > > setsockopt, and the sk_tskey_bpf_offset will be calculated. > > > } > > I will copy the above to the commit message next time in order to > provide a clear design to future readers. > > > > > > > Then the skb tskey will use the sk->sk_tskey like before. > > > > I mean race as in the setsockopt and bpf setsockopt and datapath > > running concurrently. > > > > As long as both variants of setsockopt hold the socket lock, that > > won't happen. > > > > The datapath is lockless for UDP, so atomic_inc sk_tskey can race > > with calculating the difference. But this is a known issue. A process > > that cares should not run setsockopt and send concurrently. So this is > > fine too. > > Oh, now I see. Thanks for the detailed explanation! So Do you feel if > we need to take care of this in the future, I mean, after this series > gets merged...? If there is a race condition, then that cannot be fixed up later. But from my admittedly brief analysis, it seems that there is nothing here that needs to be fixed: control plane operations (setsockopt) hold the socket lock. A setsockopt that conflicts with a lockless datapath update will have a slightly ambiguous offset. It is under controlof and up to the user to avoid that if they care.
On Wed, Oct 30, 2024 at 3:45 AM Willem de Bruijn <willemdebruijn.kernel@gmail.com> wrote: > > > > > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > > > > > +{ > > > > > > + u32 tskey; > > > > > > + > > > > > > + if (sk_is_tcp(sk)) { > > > > > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > > > > > + return -EINVAL; > > > > > > + > > > > > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > > > + tskey = tcp_sk(sk)->write_seq; > > > > > > + else > > > > > > + tskey = tcp_sk(sk)->snd_una; > > > > > > + } else { > > > > > > + tskey = 0; > > > > > > + } > > > > > > + > > > > > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > > > > > + return 0; > > > > > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > > > > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > > > > > + } else { > > > > > > + sk->sk_tskey_bpf_offset = 0; > > > > > > + } > > > > > > + > > > > > > + return tskey; > > > > > > +} > > > > > > + > > > > > > int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > > { > > > > > > u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; > > > > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) > > > > > > > > > > > > if (val & SOF_TIMESTAMPING_OPT_ID && > > > > > > !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > > > > > - if (sk_is_tcp(sk)) { > > > > > > - if ((1 << sk->sk_state) & > > > > > > - (TCPF_CLOSE | TCPF_LISTEN)) > > > > > > - return -EINVAL; > > > > > > - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); > > > > > > - else > > > > > > - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); > > > > > > - } else { > > > > > > - atomic_set(&sk->sk_tskey, 0); > > > > > > - } > > > > > > + long int ret; > > > > > > + > > > > > > + ret = sock_calculate_tskey_offset(sk, val, bpf_type); > > > > > > + if (ret <= 0) > > > > > > + return ret; > > > > > > + > > > > > > + atomic_set(&sk->sk_tskey, ret); > > > > > > } > > > > > > > > > > > > return 0; > > > > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, > > > > > > struct so_timestamping timestamping) > > > > > > { > > > > > > u32 flags = timestamping.flags; > > > > > > + int ret; > > > > > > > > > > > > if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) > > > > > > return -EINVAL; > > > > > > > > > > > > + ret = sock_set_tskey(sk, flags, 1); > > > > > > + if (ret) > > > > > > + return ret; > > > > > > + > > > > > > WRITE_ONCE(sk->sk_tsflags_bpf, flags); > > > > > > > > > > > > return 0; > > > > > > > > > > I'm a bit hazy on when this can be called. We can assume that this new > > > > > BPF operation cannot race with the existing setsockopt nor with the > > > > > datapath that might touch the atomic fields, right? > > > > > > > > It surely can race with the existing setsockopt. > > > > > > > > 1) > > > > if (only existing setsockopt works) { > > > > then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0. > > > > } > > > > > > > > 2) > > > > if (only bpf setsockopt works) { > > > > then sk->sk_tskey is set through bpf_setsockopt, > > > > sk_tskey_bpf_offset is 0. > > > > } > > > > > > > > 3) > > > > if (existing setsockopt already started, here we enable the bpf feature) { > > > > then sk->sk_tskey will not change, but the sk_tskey_bpf_offset > > > > will be calculated. > > > > } > > > > > > > > 4) > > > > if (bpf setsockopt already started, here we enable the application feature) { > > > > then sk->sk_tskey will re-initialized/overridden by > > > > setsockopt, and the sk_tskey_bpf_offset will be calculated. > > > > } > > > > I will copy the above to the commit message next time in order to > > provide a clear design to future readers. > > > > > > > > > > Then the skb tskey will use the sk->sk_tskey like before. > > > > > > I mean race as in the setsockopt and bpf setsockopt and datapath > > > running concurrently. > > > > > > As long as both variants of setsockopt hold the socket lock, that > > > won't happen. > > > > > > The datapath is lockless for UDP, so atomic_inc sk_tskey can race > > > with calculating the difference. But this is a known issue. A process > > > that cares should not run setsockopt and send concurrently. So this is > > > fine too. > > > > Oh, now I see. Thanks for the detailed explanation! So Do you feel if > > we need to take care of this in the future, I mean, after this series > > gets merged...? > > If there is a race condition, then that cannot be fixed up later. > > But from my admittedly brief analysis, it seems that there is nothing > here that needs to be fixed: control plane operations (setsockopt) > hold the socket lock. A setsockopt that conflicts with a lockless > datapath update will have a slightly ambiguous offset. It is under > controlof and up to the user to avoid that if they care. I got it. Thanks.
On 10/28/24 4:05 AM, Jason Xing wrote: > +/* Used to track the tskey for bpf extension > + * > + * @sk_tskey: bpf extension can use it only when no application uses. > + * Application can use it directly regardless of bpf extension. > + * > + * There are three strategies: > + * 1) If we've already set through setsockopt() and here we're going to set > + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > + * keep the record of delta between the current "key" and previous key. > + * 2) If we've already set through bpf_setsockopt() and here we're going to > + * set for application use, we will record the delta first and then > + * override/initialize the @sk_tskey. > + * 3) other cases, which means only either of them takes effect, so initialize > + * everything simplely. > + */ > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > +{ > + u32 tskey; > + > + if (sk_is_tcp(sk)) { > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > + return -EINVAL; > + > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > + tskey = tcp_sk(sk)->write_seq; > + else > + tskey = tcp_sk(sk)->snd_una; > + } else { > + tskey = 0; > + } > + > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > + return 0; > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > + } else { > + sk->sk_tskey_bpf_offset = 0; > + } > + > + return tskey; > +} Before diving into this route, the bpf prog can peek into the tcp seq no in the skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why those are not enough information for the bpf prog?
On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 10/28/24 4:05 AM, Jason Xing wrote: > > +/* Used to track the tskey for bpf extension > > + * > > + * @sk_tskey: bpf extension can use it only when no application uses. > > + * Application can use it directly regardless of bpf extension. > > + * > > + * There are three strategies: > > + * 1) If we've already set through setsockopt() and here we're going to set > > + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > > + * keep the record of delta between the current "key" and previous key. > > + * 2) If we've already set through bpf_setsockopt() and here we're going to > > + * set for application use, we will record the delta first and then > > + * override/initialize the @sk_tskey. > > + * 3) other cases, which means only either of them takes effect, so initialize > > + * everything simplely. > > + */ > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > +{ > > + u32 tskey; > > + > > + if (sk_is_tcp(sk)) { > > + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > + return -EINVAL; > > + > > + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > + tskey = tcp_sk(sk)->write_seq; > > + else > > + tskey = tcp_sk(sk)->snd_una; > > + } else { > > + tskey = 0; > > + } > > + > > + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > + return 0; > > + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > + } else { > > + sk->sk_tskey_bpf_offset = 0; > > + } > > + > > + return tskey; > > +} > > Before diving into this route, the bpf prog can peek into the tcp seq no in the > skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why > those are not enough information for the bpf prog? Well, it does make sense. It seems we don't need to implement tskey for this bpf feature... Due to lack of enough knowledge of bpf, could you provide more hints that I can follow to write a bpf program to print more information from the skb? Like in the last patch of this series, in tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a feasible way to do that? Thanks, Jason
On 10/29/24 11:50 PM, Jason Xing wrote: > On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: >> >> On 10/28/24 4:05 AM, Jason Xing wrote: >>> +/* Used to track the tskey for bpf extension >>> + * >>> + * @sk_tskey: bpf extension can use it only when no application uses. >>> + * Application can use it directly regardless of bpf extension. >>> + * >>> + * There are three strategies: >>> + * 1) If we've already set through setsockopt() and here we're going to set >>> + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will >>> + * keep the record of delta between the current "key" and previous key. >>> + * 2) If we've already set through bpf_setsockopt() and here we're going to >>> + * set for application use, we will record the delta first and then >>> + * override/initialize the @sk_tskey. >>> + * 3) other cases, which means only either of them takes effect, so initialize >>> + * everything simplely. >>> + */ >>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) >>> +{ >>> + u32 tskey; >>> + >>> + if (sk_is_tcp(sk)) { >>> + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) >>> + return -EINVAL; >>> + >>> + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) >>> + tskey = tcp_sk(sk)->write_seq; >>> + else >>> + tskey = tcp_sk(sk)->snd_una; >>> + } else { >>> + tskey = 0; >>> + } >>> + >>> + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { >>> + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); >>> + return 0; >>> + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { >>> + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; >>> + } else { >>> + sk->sk_tskey_bpf_offset = 0; >>> + } >>> + >>> + return tskey; >>> +} >> >> Before diving into this route, the bpf prog can peek into the tcp seq no in the >> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why >> those are not enough information for the bpf prog? > > Well, it does make sense. It seems we don't need to implement tskey > for this bpf feature... > > Due to lack of enough knowledge of bpf, could you provide more hints > that I can follow to write a bpf program to print more information > from the skb? Like in the last patch of this series, in > tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a > feasible way to do that? The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg(). When running the bpf-prog@sendmsg(), the skb can be set to the "struct bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at bpf_skops_write_hdr_opt(). bpf prog cannot directly access the skops->skb now. It is because the sockops prog sees the uapi "struct bpf_sock_ops" instead of "struct bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops". Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the skops->skb. afaik, the tcb->seq should be available already during sendmsg. it should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take a look at the existing examples of bpf_core_cast. The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not available to skops program now but should be easy to expose. The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED, SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be served as that key. All that said, while looking at tcp_tx_timestamp() again, there is always "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be used directly as-is by the bpf prog. I think now I am missing why the bpf prog needs the sk_tskey in the sk? In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map key-ed by seqno/tskey is probably what the selftest should do. In the future, we can consider allowing the rbtree in the bpf sk local storage for searching seqno. There is shinfo's hwtstamp that can be used also if there is a need.
On Thu, Oct 31, 2024 at 9:17 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 10/29/24 11:50 PM, Jason Xing wrote: > > On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > >> > >> On 10/28/24 4:05 AM, Jason Xing wrote: > >>> +/* Used to track the tskey for bpf extension > >>> + * > >>> + * @sk_tskey: bpf extension can use it only when no application uses. > >>> + * Application can use it directly regardless of bpf extension. > >>> + * > >>> + * There are three strategies: > >>> + * 1) If we've already set through setsockopt() and here we're going to set > >>> + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > >>> + * keep the record of delta between the current "key" and previous key. > >>> + * 2) If we've already set through bpf_setsockopt() and here we're going to > >>> + * set for application use, we will record the delta first and then > >>> + * override/initialize the @sk_tskey. > >>> + * 3) other cases, which means only either of them takes effect, so initialize > >>> + * everything simplely. > >>> + */ > >>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > >>> +{ > >>> + u32 tskey; > >>> + > >>> + if (sk_is_tcp(sk)) { > >>> + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > >>> + return -EINVAL; > >>> + > >>> + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > >>> + tskey = tcp_sk(sk)->write_seq; > >>> + else > >>> + tskey = tcp_sk(sk)->snd_una; > >>> + } else { > >>> + tskey = 0; > >>> + } > >>> + > >>> + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > >>> + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > >>> + return 0; > >>> + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > >>> + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > >>> + } else { > >>> + sk->sk_tskey_bpf_offset = 0; > >>> + } > >>> + > >>> + return tskey; > >>> +} > >> > >> Before diving into this route, the bpf prog can peek into the tcp seq no in the > >> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why > >> those are not enough information for the bpf prog? > > > > Well, it does make sense. It seems we don't need to implement tskey > > for this bpf feature... > > > > Due to lack of enough knowledge of bpf, could you provide more hints > > that I can follow to write a bpf program to print more information > > from the skb? Like in the last patch of this series, in > > tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a > > feasible way to do that? > > The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg(). > When running the bpf-prog@sendmsg(), the skb can be set to the "struct > bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at > bpf_skops_write_hdr_opt(). Thanks. I see the skb field in struct bpf_sock_ops_kern. > > bpf prog cannot directly access the skops->skb now. It is because the sockops > prog sees the uapi "struct bpf_sock_ops" instead of "struct > bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It > is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops". Oh, so it seems we cannot use this way, right? I also noticed a use case that allow users to get the information from one skb: "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in tools/testing/selftests/bpf/progs/netif_receive_skb.c But it requires us to add the tracepoint in __skb_tstamp_tx() first. Two months ago, I was planning to use a tracepoint for some people who find it difficult to deploy bpf. > > Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a > trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the > skops->skb. Let me spend some time on it. Thanks. > afaik, the tcb->seq should be available already during sendmsg. it > should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take > a look at the existing examples of bpf_core_cast. > > The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not > available to skops program now but should be easy to expose. I wonder what the use of skb->data is here. > > The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED, > SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to > co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be > served as that key. > > All that said, while looking at tcp_tx_timestamp() again, there is always > "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be > used directly as-is by the bpf prog. I think now I am missing why the bpf prog > needs the sk_tskey in the sk? As you said, tcp seqno could be treated as the key, but it leaks the information in TCP layer to users. Please see the commit: commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b Author: Willem de Bruijn <willemb@google.com> Date: Mon Aug 4 22:11:49 2014 -0400 net-timestamp: TCP timestamping ... - To avoid leaking the absolute seqno to userspace, the offset returned in ee_data must always be relative. It is an offset between an skb and sk field. It has to be computed in the kernel before reporting to the user space, I think. > > In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the > earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp > at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map > key-ed by seqno/tskey is probably what the selftest should do. In the future, we > can consider allowing the rbtree in the bpf sk local storage for searching > seqno. There is shinfo's hwtstamp that can be used also if there is a need. Thanks for the information! Let me investigate how the bpf map works... I wonder that for the selftests could it be much simpler if we just record each timestamp stored in three variables and calculate them at last since we only send the small packet once instead of using bpf map. I mean, bpf map is really good as far as I know, but I'm a bit worried that implementing such a function could cause more extra work (implementation and review).
On Thu, Oct 31, 2024 at 10:41 AM Jason Xing <kerneljasonxing@gmail.com> wrote: > > On Thu, Oct 31, 2024 at 9:17 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > > > On 10/29/24 11:50 PM, Jason Xing wrote: > > > On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > >> > > >> On 10/28/24 4:05 AM, Jason Xing wrote: > > >>> +/* Used to track the tskey for bpf extension > > >>> + * > > >>> + * @sk_tskey: bpf extension can use it only when no application uses. > > >>> + * Application can use it directly regardless of bpf extension. > > >>> + * > > >>> + * There are three strategies: > > >>> + * 1) If we've already set through setsockopt() and here we're going to set > > >>> + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will > > >>> + * keep the record of delta between the current "key" and previous key. > > >>> + * 2) If we've already set through bpf_setsockopt() and here we're going to > > >>> + * set for application use, we will record the delta first and then > > >>> + * override/initialize the @sk_tskey. > > >>> + * 3) other cases, which means only either of them takes effect, so initialize > > >>> + * everything simplely. > > >>> + */ > > >>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) > > >>> +{ > > >>> + u32 tskey; > > >>> + > > >>> + if (sk_is_tcp(sk)) { > > >>> + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) > > >>> + return -EINVAL; > > >>> + > > >>> + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) > > >>> + tskey = tcp_sk(sk)->write_seq; > > >>> + else > > >>> + tskey = tcp_sk(sk)->snd_una; > > >>> + } else { > > >>> + tskey = 0; > > >>> + } > > >>> + > > >>> + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { > > >>> + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); > > >>> + return 0; > > >>> + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { > > >>> + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; > > >>> + } else { > > >>> + sk->sk_tskey_bpf_offset = 0; > > >>> + } > > >>> + > > >>> + return tskey; > > >>> +} > > >> > > >> Before diving into this route, the bpf prog can peek into the tcp seq no in the > > >> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why > > >> those are not enough information for the bpf prog? > > > > > > Well, it does make sense. It seems we don't need to implement tskey > > > for this bpf feature... > > > > > > Due to lack of enough knowledge of bpf, could you provide more hints > > > that I can follow to write a bpf program to print more information > > > from the skb? Like in the last patch of this series, in > > > tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a > > > feasible way to do that? > > > > The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg(). > > When running the bpf-prog@sendmsg(), the skb can be set to the "struct > > bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at > > bpf_skops_write_hdr_opt(). > > Thanks. I see the skb field in struct bpf_sock_ops_kern. > > > > > bpf prog cannot directly access the skops->skb now. It is because the sockops > > prog sees the uapi "struct bpf_sock_ops" instead of "struct > > bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It > > is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops". > > Oh, so it seems we cannot use this way, right? > > I also noticed a use case that allow users to get the information from one skb: > "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in > tools/testing/selftests/bpf/progs/netif_receive_skb.c > But it requires us to add the tracepoint in __skb_tstamp_tx() first. > Two months ago, I was planning to use a tracepoint for some people who > find it difficult to deploy bpf. > > > > > Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a > > trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the > > skops->skb. > > Let me spend some time on it. Thanks. > > > afaik, the tcb->seq should be available already during sendmsg. it > > should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take > > a look at the existing examples of bpf_core_cast. > > > > The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not > > available to skops program now but should be easy to expose. > > I wonder what the use of skb->data is here. > > > > > The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED, > > SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to > > co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be > > served as that key. > > > > All that said, while looking at tcp_tx_timestamp() again, there is always > > "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be > > used directly as-is by the bpf prog. I think now I am missing why the bpf prog > > needs the sk_tskey in the sk? > > As you said, tcp seqno could be treated as the key, but it leaks the > information in TCP layer to users. Please see the commit: > commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b > Author: Willem de Bruijn <willemb@google.com> > Date: Mon Aug 4 22:11:49 2014 -0400 > > net-timestamp: TCP timestamping > ... > - To avoid leaking the absolute seqno to userspace, the offset > returned in ee_data must always be relative. It is an offset between > an skb and sk field. > > It has to be computed in the kernel before reporting to the user space, I think. Well, I'm thinking since the BPF program can only be used by _admin_, we will not take any risk even if the raw seq is exported to the BPF program. Willem, I would like to know your opinions about this point (about whether we can export the raw seqno or not). Thanks. Thanks, Jason
On 10/30/24 7:41 PM, Jason Xing wrote: >> All that said, while looking at tcp_tx_timestamp() again, there is always >> "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be >> used directly as-is by the bpf prog. I think now I am missing why the bpf prog >> needs the sk_tskey in the sk? > > As you said, tcp seqno could be treated as the key, but it leaks the > information in TCP layer to users. Please see the commit: I don't think it is a concern for bpf prog running in the kernel. The sockops bpf prog can already read the sk, the skb (which has seqno), and many others. The bpf prog is not a print-only logic. Only using bpf prog to do raw data dumping is not fully utilizing its capability, e.g. data aggregation. The bpf prog should aggregate the data first which is to calculate the delay here.
On Thu, Oct 31, 2024 at 1:52 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 10/30/24 7:41 PM, Jason Xing wrote: > > >> All that said, while looking at tcp_tx_timestamp() again, there is always > >> "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be > >> used directly as-is by the bpf prog. I think now I am missing why the bpf prog > >> needs the sk_tskey in the sk? > > > > As you said, tcp seqno could be treated as the key, but it leaks the > > information in TCP layer to users. Please see the commit: > > I don't think it is a concern for bpf prog running in the kernel. The sockops > bpf prog can already read the sk, the skb (which has seqno), and many others. > > The bpf prog is not a print-only logic. Only using bpf prog to do raw data > dumping is not fully utilizing its capability, e.g. data aggregation. The bpf > prog should aggregate the data first which is to calculate the delay here. Agree, I forgot BPF is only for admin, so it's a feasible solution. It saves a lot of energy :) It looks like the thing is getting simpler and simpler, which could be mostly taken over by bpf itself at last. Good news! Thanks, Jason
On 10/30/24 7:41 PM, Jason Xing wrote: >> bpf prog cannot directly access the skops->skb now. It is because the sockops >> prog sees the uapi "struct bpf_sock_ops" instead of "struct >> bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It >> is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops". > > Oh, so it seems we cannot use this way, right? No. don't extend the uapi "struct bpf_sock_ops". Use bpf_cast_to_kern_ctx() instead. > > I also noticed a use case that allow users to get the information from one skb: > "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in > tools/testing/selftests/bpf/progs/netif_receive_skb.c > But it requires us to add the tracepoint in __skb_tstamp_tx() first. > Two months ago, I was planning to use a tracepoint for some people who > find it difficult to deploy bpf. It is a tracing prog instead of sockops prog. The verifier allows accessing different things based on the program type. This patch set is using the sockops bpf prog type which is not a tracing prog. Tracing can do a lot of read-only things but here we need write (e.g. bpf_setsockopt), so tracing prog is not suitable here. > >> >> Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a >> trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the >> skops->skb. > > Let me spend some time on it. Thanks. Take a look at the bpf_cast_to_kern_ctx() examples in selftests/bpf. I think this can be directly used to get to (struct bpf_sock_ops_kern *)skops->skb. Ping back if your selftest bpf prog cannot load. > >> afaik, the tcb->seq should be available already during sendmsg. it >> should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take >> a look at the existing examples of bpf_core_cast. >> >> The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not >> available to skops program now but should be easy to expose. > I wonder what the use of skb->data is here. You are right, not needed. I was thinking it may need to parse the tcp header from the skb at the rx timestamping. It is not needed. The tcp stack should have already parsed it and TCP_SKB_CB can be directly used as long as the sockops prog can get to the skops->skb. >> >> In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the >> earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp >> at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map >> key-ed by seqno/tskey is probably what the selftest should do. In the future, we >> can consider allowing the rbtree in the bpf sk local storage for searching >> seqno. There is shinfo's hwtstamp that can be used also if there is a need. > > Thanks for the information! Let me investigate how the bpf map works... > > I wonder that for the selftests could it be much simpler if we just > record each timestamp stored in three variables and calculate them at > last since we only send the small packet once instead of using bpf > map. I mean, bpf map is really good as far as I know, but I'm a bit > worried that implementing such a function could cause more extra work > (implementation and review). Don't worry on the review side. imo, a closer to the real world selftest prog is actually helping the review process. It needs to test the tskey anyway and it needs to store somewhere. bpf map is pretty simple to use. I don't think it will have much different in term of complexity also.
On Fri, Nov 1, 2024 at 7:50 AM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > On 10/30/24 7:41 PM, Jason Xing wrote: > >> bpf prog cannot directly access the skops->skb now. It is because the sockops > >> prog sees the uapi "struct bpf_sock_ops" instead of "struct > >> bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It > >> is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops". > > > > Oh, so it seems we cannot use this way, right? > > No. don't extend the uapi "struct bpf_sock_ops". Use bpf_cast_to_kern_ctx() instead. Got it! > > > > > I also noticed a use case that allow users to get the information from one skb: > > "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in > > tools/testing/selftests/bpf/progs/netif_receive_skb.c > > But it requires us to add the tracepoint in __skb_tstamp_tx() first. > > Two months ago, I was planning to use a tracepoint for some people who > > find it difficult to deploy bpf. > > > It is a tracing prog instead of sockops prog. The verifier allows accessing > different things based on the program type. This patch set is using the sockops > bpf prog type which is not a tracing prog. Tracing can do a lot of read-only > things but here we need write (e.g. bpf_setsockopt), so tracing prog is not > suitable here. Thanks for the explaination. > > > > >> > >> Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a > >> trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the > >> skops->skb. > > > > Let me spend some time on it. Thanks. > > Take a look at the bpf_cast_to_kern_ctx() examples in selftests/bpf. I think > this can be directly used to get to (struct bpf_sock_ops_kern *)skops->skb. Ping > back if your selftest bpf prog cannot load. No problem :) > > > > >> afaik, the tcb->seq should be available already during sendmsg. it > >> should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take > >> a look at the existing examples of bpf_core_cast. > >> > >> The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not > >> available to skops program now but should be easy to expose. > > I wonder what the use of skb->data is here. > > You are right, not needed. I was thinking it may need to parse the tcp header > from the skb at the rx timestamping. It is not needed. The tcp stack should have > already parsed it and TCP_SKB_CB can be directly used as long as the sockops > prog can get to the skops->skb. Agreed. > > >> > >> In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the > >> earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp > >> at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map > >> key-ed by seqno/tskey is probably what the selftest should do. In the future, we > >> can consider allowing the rbtree in the bpf sk local storage for searching > >> seqno. There is shinfo's hwtstamp that can be used also if there is a need. > > > > Thanks for the information! Let me investigate how the bpf map works... > > > > I wonder that for the selftests could it be much simpler if we just > > record each timestamp stored in three variables and calculate them at > > last since we only send the small packet once instead of using bpf > > map. I mean, bpf map is really good as far as I know, but I'm a bit > > worried that implementing such a function could cause more extra work > > (implementation and review). > > Don't worry on the review side. imo, a closer to the real world selftest prog is > actually helping the review process. It needs to test the tskey anyway and it > needs to store somewhere. bpf map is pretty simple to use. I don't think it will > have much different in term of complexity also. Got it, will do it soon :) Thanks, Jason
diff --git a/include/net/sock.h b/include/net/sock.h index 91398b20a4a3..41c6c6f78e55 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -469,6 +469,7 @@ struct sock { unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; + u32 sk_tskey_bpf_offset; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0b571306f7ea..d1739317b97d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) } static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, + struct sk_buff *skb, struct skb_shared_hwtstamps *hwtstamps) { - u32 args[2] = {0, 0}; + u32 args[3] = {0, 0, 0}; u32 tsflags, cb_flag; tsflags = READ_ONCE(sk->sk_tsflags_bpf); @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype, args[1] = ts.tv_nsec; } - timestamp_call_bpf(sk, cb_flag, 2, args); + if (tsflags & SOF_TIMESTAMPING_OPT_ID) { + args[2] = skb_shinfo(skb)->tskey; + if (sk_is_tcp(sk)) + args[2] -= atomic_read(&sk->sk_tskey); + if (sk->sk_tskey_bpf_offset) + args[2] += sk->sk_tskey_bpf_offset; + } + + timestamp_call_bpf(sk, cb_flag, 3, args); } void __skb_tstamp_tx(struct sk_buff *orig_skb, @@ -5683,7 +5692,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; - skb_tstamp_tx_output_bpf(sk, tstype, hwtstamps); + skb_tstamp_tx_output_bpf(sk, tstype, orig_skb, hwtstamps); skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype); } EXPORT_SYMBOL_GPL(__skb_tstamp_tx); diff --git a/net/core/sock.c b/net/core/sock.c index 42c1aba0b3fe..914ec8046f86 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -891,6 +891,49 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) return 0; } +/* Used to track the tskey for bpf extension + * + * @sk_tskey: bpf extension can use it only when no application uses. + * Application can use it directly regardless of bpf extension. + * + * There are three strategies: + * 1) If we've already set through setsockopt() and here we're going to set + * OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will + * keep the record of delta between the current "key" and previous key. + * 2) If we've already set through bpf_setsockopt() and here we're going to + * set for application use, we will record the delta first and then + * override/initialize the @sk_tskey. + * 3) other cases, which means only either of them takes effect, so initialize + * everything simplely. + */ +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type) +{ + u32 tskey; + + if (sk_is_tcp(sk)) { + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return -EINVAL; + + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + tskey = tcp_sk(sk)->write_seq; + else + tskey = tcp_sk(sk)->snd_una; + } else { + tskey = 0; + } + + if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { + sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey); + return 0; + } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) { + sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey; + } else { + sk->sk_tskey_bpf_offset = 0; + } + + return tskey; +} + int sock_set_tskey(struct sock *sk, int val, int bpf_type) { u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags; @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type) if (val & SOF_TIMESTAMPING_OPT_ID && !(tsflags & SOF_TIMESTAMPING_OPT_ID)) { - if (sk_is_tcp(sk)) { - if ((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN)) - return -EINVAL; - if (val & SOF_TIMESTAMPING_OPT_ID_TCP) - atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); - else - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); - } else { - atomic_set(&sk->sk_tskey, 0); - } + long int ret; + + ret = sock_calculate_tskey_offset(sk, val, bpf_type); + if (ret <= 0) + return ret; + + atomic_set(&sk->sk_tskey, ret); } return 0; @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk, struct so_timestamping timestamping) { u32 flags = timestamping.flags; + int ret; if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK) return -EINVAL; + ret = sock_set_tskey(sk, flags, 1); + if (ret) + return ret; + WRITE_ONCE(sk->sk_tsflags_bpf, flags); return 0;