diff mbox series

[net-next,v3,10/14] net-timestamp: add basic support with tskey offset

Message ID 20241028110535.82999-11-kerneljasonxing@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net-timestamp: bpf extension to equip applications transparently | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 18 this patch: 18
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 1 maintainers not CCed: horms@kernel.org
netdev/build_clang success Errors and warnings before: 27 this patch: 27
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2608 this patch: 2608
netdev/checkpatch warning WARNING: Prefer 'long' over 'long int' as the int is unnecessary WARNING: line length of 81 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 68 this patch: 69
netdev/source_inline success Was 0 now: 0

Commit Message

Jason Xing Oct. 28, 2024, 11:05 a.m. UTC
From: Jason Xing <kernelxing@tencent.com>

Use the offset to record the delta value between current socket key
and bpf socket key.

1. If there is only bpf feature running, the socket key is bpf socket
key and the offset is zero;
2. If there is only traditional feature running, and then bpf feature
is turned on, the socket key is still used by the former while the offset
is the delta between them;
3. If there is only bpf feature running, and then application uses it,
the socket key would be re-init for application and the offset is the
delta.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
---
 include/net/sock.h |  1 +
 net/core/skbuff.c  | 15 ++++++++---
 net/core/sock.c    | 66 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 68 insertions(+), 14 deletions(-)

Comments

Willem de Bruijn Oct. 29, 2024, 1:24 a.m. UTC | #1
Jason Xing wrote:
> From: Jason Xing <kernelxing@tencent.com>
> 
> Use the offset to record the delta value between current socket key
> and bpf socket key.
> 
> 1. If there is only bpf feature running, the socket key is bpf socket
> key and the offset is zero;
> 2. If there is only traditional feature running, and then bpf feature
> is turned on, the socket key is still used by the former while the offset
> is the delta between them;
> 3. if there is only bpf feature running, and then application uses it,
> the socket key would be re-init for application and the offset is the
> delta.

We need to also figure out the rare conflict when one user sets
OPT_ID | OPT_ID_TCP while the other only uses OPT_ID.

It is so obscure, that perhaps we can punt and say that the BPF
program just has to follow the application preference and be aware of
the subtle difference.

> Signed-off-by: Jason Xing <kernelxing@tencent.com>
> ---
>  include/net/sock.h |  1 +
>  net/core/skbuff.c  | 15 ++++++++---
>  net/core/sock.c    | 66 ++++++++++++++++++++++++++++++++++++++--------
>  3 files changed, 68 insertions(+), 14 deletions(-)
> 
> diff --git a/include/net/sock.h b/include/net/sock.h
> index 91398b20a4a3..41c6c6f78e55 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -469,6 +469,7 @@ struct sock {
>  	unsigned long		sk_pacing_rate; /* bytes per second */
>  	atomic_t		sk_zckey;
>  	atomic_t		sk_tskey;
> +	u32			sk_tskey_bpf_offset;
>  	__cacheline_group_end(sock_write_tx);
>  
>  	__cacheline_group_begin(sock_read_tx);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 0b571306f7ea..d1739317b97d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
>  }
>  
>  static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
> +				     struct sk_buff *skb,
>  				     struct skb_shared_hwtstamps *hwtstamps)
>  {
> -	u32 args[2] = {0, 0};
> +	u32 args[3] = {0, 0, 0};
>  	u32 tsflags, cb_flag;
>  
>  	tsflags = READ_ONCE(sk->sk_tsflags_bpf);
> @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
>  		args[1] = ts.tv_nsec;
>  	}
>  
> -	timestamp_call_bpf(sk, cb_flag, 2, args);
> +	if (tsflags & SOF_TIMESTAMPING_OPT_ID) {
> +		args[2] = skb_shinfo(skb)->tskey;
> +		if (sk_is_tcp(sk))
> +			args[2] -= atomic_read(&sk->sk_tskey);
> +		if (sk->sk_tskey_bpf_offset)
> +			args[2] += sk->sk_tskey_bpf_offset;
> +	}
> +
> +	timestamp_call_bpf(sk, cb_flag, 3, args);


So the BPF interface is effectively OPT_TSONLY: the packet data is
never shared.

Then OPT_ID should be mandatory, because without it the data is
not actionable: which byte in the bytestream or packet in the case
of datagram sockets does a callback refer to.

> +/* Used to track the tskey for bpf extension
> + *
> + * @sk_tskey: bpf extension can use it only when no application uses.
> + *            Application can use it directly regardless of bpf extension.
> + *
> + * There are three strategies:
> + * 1) If we've already set through setsockopt() and here we're going to set
> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> + *    keep the record of delta between the current "key" and previous key.
> + * 2) If we've already set through bpf_setsockopt() and here we're going to
> + *    set for application use, we will record the delta first and then
> + *    override/initialize the @sk_tskey.
> + * 3) other cases, which means only either of them takes effect, so initialize
> + *    everything simplely.
> + */

Please explain in the commit message that these gymnastics are needed
because there can only be one tskey in skb_shared_info.

> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> +{
> +	u32 tskey;
> +
> +	if (sk_is_tcp(sk)) {
> +		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> +			return -EINVAL;
> +
> +		if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> +			tskey = tcp_sk(sk)->write_seq;
> +		else
> +			tskey = tcp_sk(sk)->snd_una;
> +	} else {
> +		tskey = 0;
> +	}
> +
> +	if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> +		return 0;
> +	} else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> +	} else {
> +		sk->sk_tskey_bpf_offset = 0;
> +	}
> +
> +	return tskey;
> +}
> +
>  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
>  {
>  	u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
>  
>  	if (val & SOF_TIMESTAMPING_OPT_ID &&
>  	    !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> -		if (sk_is_tcp(sk)) {
> -			if ((1 << sk->sk_state) &
> -			    (TCPF_CLOSE | TCPF_LISTEN))
> -				return -EINVAL;
> -			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> -				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> -			else
> -				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> -		} else {
> -			atomic_set(&sk->sk_tskey, 0);
> -		}
> +		long int ret;
> +
> +		ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> +		if (ret <= 0)
> +			return ret;
> +
> +		atomic_set(&sk->sk_tskey, ret);
>  	}
>  
>  	return 0;
> @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
>  				     struct so_timestamping timestamping)
>  {
>  	u32 flags = timestamping.flags;
> +	int ret;
>  
>  	if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
>  		return -EINVAL;
>  
> +	ret = sock_set_tskey(sk, flags, 1);
> +	if (ret)
> +		return ret;
> +
>  	WRITE_ONCE(sk->sk_tsflags_bpf, flags);
>  
>  	return 0;

I'm a bit hazy on when this can be called. We can assume that this new
BPF operation cannot race with the existing setsockopt nor with the
datapath that might touch the atomic fields, right?
Jason Xing Oct. 29, 2024, 2:41 a.m. UTC | #2
On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > From: Jason Xing <kernelxing@tencent.com>
> >
> > Use the offset to record the delta value between current socket key
> > and bpf socket key.
> >
> > 1. If there is only bpf feature running, the socket key is bpf socket
> > key and the offset is zero;
> > 2. If there is only traditional feature running, and then bpf feature
> > is turned on, the socket key is still used by the former while the offset
> > is the delta between them;
> > 3. if there is only bpf feature running, and then application uses it,
> > the socket key would be re-init for application and the offset is the
> > delta.
>
> We need to also figure out the rare conflict when one user sets
> OPT_ID | OPT_ID_TCP while the other only uses OPT_ID.

I think the current patch handles the case because:
1. sock_calculate_tskey_offset() gets the final key first whether the
OPT_ID_TCP is set or not.
2. we will use that tskey to calculate the delta.

>
> It is so obscure, that perhaps we can punt and say that the BPF
> program just has to follow the application preference and be aware of
> the subtle difference.

Right.

>
> > Signed-off-by: Jason Xing <kernelxing@tencent.com>
> > ---
> >  include/net/sock.h |  1 +
> >  net/core/skbuff.c  | 15 ++++++++---
> >  net/core/sock.c    | 66 ++++++++++++++++++++++++++++++++++++++--------
> >  3 files changed, 68 insertions(+), 14 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h
> > index 91398b20a4a3..41c6c6f78e55 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -469,6 +469,7 @@ struct sock {
> >       unsigned long           sk_pacing_rate; /* bytes per second */
> >       atomic_t                sk_zckey;
> >       atomic_t                sk_tskey;
> > +     u32                     sk_tskey_bpf_offset;
> >       __cacheline_group_end(sock_write_tx);
> >
> >       __cacheline_group_begin(sock_read_tx);
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 0b571306f7ea..d1739317b97d 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -5641,9 +5641,10 @@ void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
> >  }
> >
> >  static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
> > +                                  struct sk_buff *skb,
> >                                    struct skb_shared_hwtstamps *hwtstamps)
> >  {
> > -     u32 args[2] = {0, 0};
> > +     u32 args[3] = {0, 0, 0};
> >       u32 tsflags, cb_flag;
> >
> >       tsflags = READ_ONCE(sk->sk_tsflags_bpf);
> > @@ -5672,7 +5673,15 @@ static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
> >               args[1] = ts.tv_nsec;
> >       }
> >
> > -     timestamp_call_bpf(sk, cb_flag, 2, args);
> > +     if (tsflags & SOF_TIMESTAMPING_OPT_ID) {
> > +             args[2] = skb_shinfo(skb)->tskey;
> > +             if (sk_is_tcp(sk))
> > +                     args[2] -= atomic_read(&sk->sk_tskey);
> > +             if (sk->sk_tskey_bpf_offset)
> > +                     args[2] += sk->sk_tskey_bpf_offset;
> > +     }
> > +
> > +     timestamp_call_bpf(sk, cb_flag, 3, args);
>
>
> So the BPF interface is effectively OPT_TSONLY: the packet data is
> never shared.
>
> Then OPT_ID should be mandatory, because it without it the data is
> not actionable: which byte in the bytestream or packet in the case
> of datagram sockets does a callback refer to.

It does make sense, I think I will implement it when bpf_setsockopt() is called.

>
> > +/* Used to track the tskey for bpf extension
> > + *
> > + * @sk_tskey: bpf extension can use it only when no application uses.
> > + *            Application can use it directly regardless of bpf extension.
> > + *
> > + * There are three strategies:
> > + * 1) If we've already set through setsockopt() and here we're going to set
> > + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> > + *    keep the record of delta between the current "key" and previous key.
> > + * 2) If we've already set through bpf_setsockopt() and here we're going to
> > + *    set for application use, we will record the delta first and then
> > + *    override/initialize the @sk_tskey.
> > + * 3) other cases, which means only either of them takes effect, so initialize
> > + *    everything simplely.
> > + */
>
> Please explain in the commit message that these gymnastics are needed
> because there can only be one tskey in skb_shared_info.

No problem.

>
> > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > +{
> > +     u32 tskey;
> > +
> > +     if (sk_is_tcp(sk)) {
> > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > +                     return -EINVAL;
> > +
> > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > +                     tskey = tcp_sk(sk)->write_seq;
> > +             else
> > +                     tskey = tcp_sk(sk)->snd_una;
> > +     } else {
> > +             tskey = 0;
> > +     }
> > +
> > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > +             return 0;
> > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > +     } else {
> > +             sk->sk_tskey_bpf_offset = 0;
> > +     }
> > +
> > +     return tskey;
> > +}
> > +
> >  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> >  {
> >       u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> >
> >       if (val & SOF_TIMESTAMPING_OPT_ID &&
> >           !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > -             if (sk_is_tcp(sk)) {
> > -                     if ((1 << sk->sk_state) &
> > -                         (TCPF_CLOSE | TCPF_LISTEN))
> > -                             return -EINVAL;
> > -                     if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> > -                     else
> > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> > -             } else {
> > -                     atomic_set(&sk->sk_tskey, 0);
> > -             }
> > +             long int ret;
> > +
> > +             ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> > +             if (ret <= 0)
> > +                     return ret;
> > +
> > +             atomic_set(&sk->sk_tskey, ret);
> >       }
> >
> >       return 0;
> > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
> >                                    struct so_timestamping timestamping)
> >  {
> >       u32 flags = timestamping.flags;
> > +     int ret;
> >
> >       if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
> >               return -EINVAL;
> >
> > +     ret = sock_set_tskey(sk, flags, 1);
> > +     if (ret)
> > +             return ret;
> > +
> >       WRITE_ONCE(sk->sk_tsflags_bpf, flags);
> >
> >       return 0;
>
> I'm a bit hazy on when this can be called. We can assume that this new
> BPF operation cannot race with the existing setsockopt nor with the
> datapath that might touch the atomic fields, right?

It surely can race with the existing setsockopt.

1)
if (only existing setsockopt works) {
        then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0.
}

2)
if (only bpf setsockopt works) {
        then sk->sk_tskey is set through bpf_setsockopt,
sk_tskey_bpf_offset is 0.
}

3)
if (existing setsockopt already started, here we enable the bpf feature) {
        then sk->sk_tskey will not change, but the sk_tskey_bpf_offset
will be calculated.
}

4)
if (bpf setsockopt already started, here we enable the application feature) {
        then sk->sk_tskey will re-initialized/overridden by
setsockopt, and the sk_tskey_bpf_offset will be calculated.
}

Then the skb tskey will use the sk->sk_tskey like before.

At last, when we are about to print in bpf extension if we're allowed
(by testing the sk_tsflags_bpf), we only need to check if
sk_tskey_bpf_offset is zero or not. If the value is zero, it means
only the bpf program runs; if not, it means the sk->sk_tskey serves
for application feature, we need to compute the real bpf tskey. Please
see skb_tstamp_tx_output_bpf().

Above makes sure that the two features can work in parallel. It's honestly
a little bit complicated. Before writing this part, I drew a few
pictures to help me understand how it works.

Thanks,
Jason
Willem de Bruijn Oct. 29, 2024, 3:03 p.m. UTC | #3
Jason Xing wrote:
> On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Jason Xing wrote:
> > > From: Jason Xing <kernelxing@tencent.com>
> > >
> > > Use the offset to record the delta value between current socket key
> > > and bpf socket key.
> > >
> > > 1. If there is only bpf feature running, the socket key is bpf socket
> > > key and the offset is zero;
> > > 2. If there is only traditional feature running, and then bpf feature
> > > is turned on, the socket key is still used by the former while the offset
> > > is the delta between them;
> > > 3. if there is only bpf feature running, and then application uses it,
> > > the socket key would be re-init for application and the offset is the
> > > delta.
> >
> > We need to also figure out the rare conflict when one user sets
> > OPT_ID | OPT_ID_TCP while the other only uses OPT_ID.
> 
> I think the current patch handles the case because:
> 1. sock_calculate_tskey_offset() gets the final key first whether the
> OPT_ID_TCP is set or not.
> 2. we will use that tskey to calculate the delta.

Oh yes of course. Great, then this is resolved.

> > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > > +{
> > > +     u32 tskey;
> > > +
> > > +     if (sk_is_tcp(sk)) {
> > > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > > +                     return -EINVAL;
> > > +
> > > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > +                     tskey = tcp_sk(sk)->write_seq;
> > > +             else
> > > +                     tskey = tcp_sk(sk)->snd_una;
> > > +     } else {
> > > +             tskey = 0;
> > > +     }
> > > +
> > > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > > +             return 0;
> > > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > > +     } else {
> > > +             sk->sk_tskey_bpf_offset = 0;
> > > +     }
> > > +
> > > +     return tskey;
> > > +}
> > > +
> > >  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > >  {
> > >       u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > >
> > >       if (val & SOF_TIMESTAMPING_OPT_ID &&
> > >           !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > -             if (sk_is_tcp(sk)) {
> > > -                     if ((1 << sk->sk_state) &
> > > -                         (TCPF_CLOSE | TCPF_LISTEN))
> > > -                             return -EINVAL;
> > > -                     if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> > > -                     else
> > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> > > -             } else {
> > > -                     atomic_set(&sk->sk_tskey, 0);
> > > -             }
> > > +             long int ret;
> > > +
> > > +             ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> > > +             if (ret <= 0)
> > > +                     return ret;
> > > +
> > > +             atomic_set(&sk->sk_tskey, ret);
> > >       }
> > >
> > >       return 0;
> > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
> > >                                    struct so_timestamping timestamping)
> > >  {
> > >       u32 flags = timestamping.flags;
> > > +     int ret;
> > >
> > >       if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
> > >               return -EINVAL;
> > >
> > > +     ret = sock_set_tskey(sk, flags, 1);
> > > +     if (ret)
> > > +             return ret;
> > > +
> > >       WRITE_ONCE(sk->sk_tsflags_bpf, flags);
> > >
> > >       return 0;
> >
> > I'm a bit hazy on when this can be called. We can assume that this new
> > BPF operation cannot race with the existing setsockopt nor with the
> > datapath that might touch the atomic fields, right?
> 
> It surely can race with the existing setsockopt.
> 
> 1)
> if (only existing setsockopt works) {
>         then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0.
> }
> 
> 2)
> if (only bpf setsockopt works) {
>         then sk->sk_tskey is set through bpf_setsockopt,
> sk_tskey_bpf_offset is 0.
> }
> 
> 3)
> if (existing setsockopt already started, here we enable the bpf feature) {
>         then sk->sk_tskey will not change, but the sk_tskey_bpf_offset
> will be calculated.
> }
> 
> 4)
> if (bpf setsockopt already started, here we enable the application feature) {
>         then sk->sk_tskey will re-initialized/overridden by
> setsockopt, and the sk_tskey_bpf_offset will be calculated.
> }
> 
> Then the skb tskey will use the sk->sk_tskey like before.

I mean race as in the setsockopt and bpf setsockopt and datapath
running concurrently.

As long as both variants of setsockopt hold the socket lock, that
won't happen.

The datapath is lockless for UDP, so atomic_inc sk_tskey can race
with calculating the difference. But this is a known issue. A process
that cares should not run setsockopt and send concurrently. So this is
fine too.
Jason Xing Oct. 29, 2024, 3:50 p.m. UTC | #4
On Tue, Oct 29, 2024 at 11:03 PM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Jason Xing wrote:
> > On Tue, Oct 29, 2024 at 9:24 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Jason Xing wrote:
> > > > From: Jason Xing <kernelxing@tencent.com>
> > > >
> > > > Use the offset to record the delta value between current socket key
> > > > and bpf socket key.
> > > >
> > > > 1. If there is only bpf feature running, the socket key is bpf socket
> > > > key and the offset is zero;
> > > > 2. If there is only traditional feature running, and then bpf feature
> > > > is turned on, the socket key is still used by the former while the offset
> > > > is the delta between them;
> > > > 3. if there is only bpf feature running, and then application uses it,
> > > > the socket key would be re-init for application and the offset is the
> > > > delta.
> > >
> > > We need to also figure out the rare conflict when one user sets
> > > OPT_ID | OPT_ID_TCP while the other only uses OPT_ID.
> >
> > I think the current patch handles the case because:
> > 1. sock_calculate_tskey_offset() gets the final key first whether the
> > OPT_ID_TCP is set or not.
> > 2. we will use that tskey to calculate the delta.
>
> Oh yes of course. Great, then this is resolved.
>
> > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > > > +{
> > > > +     u32 tskey;
> > > > +
> > > > +     if (sk_is_tcp(sk)) {
> > > > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > > > +                     return -EINVAL;
> > > > +
> > > > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > +                     tskey = tcp_sk(sk)->write_seq;
> > > > +             else
> > > > +                     tskey = tcp_sk(sk)->snd_una;
> > > > +     } else {
> > > > +             tskey = 0;
> > > > +     }
> > > > +
> > > > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > > > +             return 0;
> > > > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > > > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > > > +     } else {
> > > > +             sk->sk_tskey_bpf_offset = 0;
> > > > +     }
> > > > +
> > > > +     return tskey;
> > > > +}
> > > > +
> > > >  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > >  {
> > > >       u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > >
> > > >       if (val & SOF_TIMESTAMPING_OPT_ID &&
> > > >           !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > -             if (sk_is_tcp(sk)) {
> > > > -                     if ((1 << sk->sk_state) &
> > > > -                         (TCPF_CLOSE | TCPF_LISTEN))
> > > > -                             return -EINVAL;
> > > > -                     if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> > > > -                     else
> > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> > > > -             } else {
> > > > -                     atomic_set(&sk->sk_tskey, 0);
> > > > -             }
> > > > +             long int ret;
> > > > +
> > > > +             ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> > > > +             if (ret <= 0)
> > > > +                     return ret;
> > > > +
> > > > +             atomic_set(&sk->sk_tskey, ret);
> > > >       }
> > > >
> > > >       return 0;
> > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
> > > >                                    struct so_timestamping timestamping)
> > > >  {
> > > >       u32 flags = timestamping.flags;
> > > > +     int ret;
> > > >
> > > >       if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
> > > >               return -EINVAL;
> > > >
> > > > +     ret = sock_set_tskey(sk, flags, 1);
> > > > +     if (ret)
> > > > +             return ret;
> > > > +
> > > >       WRITE_ONCE(sk->sk_tsflags_bpf, flags);
> > > >
> > > >       return 0;
> > >
> > > I'm a bit hazy on when this can be called. We can assume that this new
> > > BPF operation cannot race with the existing setsockopt nor with the
> > > datapath that might touch the atomic fields, right?
> >
> > It surely can race with the existing setsockopt.
> >
> > 1)
> > if (only existing setsockopt works) {
> >         then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0.
> > }
> >
> > 2)
> > if (only bpf setsockopt works) {
> >         then sk->sk_tskey is set through bpf_setsockopt,
> > sk_tskey_bpf_offset is 0.
> > }
> >
> > 3)
> > if (existing setsockopt already started, here we enable the bpf feature) {
> >         then sk->sk_tskey will not change, but the sk_tskey_bpf_offset
> > will be calculated.
> > }
> >
> > 4)
> > if (bpf setsockopt already started, here we enable the application feature) {
> >         then sk->sk_tskey will re-initialized/overridden by
> > setsockopt, and the sk_tskey_bpf_offset will be calculated.
> > }

I will copy the above to the commit message next time in order to
provide a clear design to future readers.

> >
> > Then the skb tskey will use the sk->sk_tskey like before.
>
> I mean race as in the setsockopt and bpf setsockopt and datapath
> running concurrently.
>
> As long as both variants of setsockopt hold the socket lock, that
> won't happen.
>
> The datapath is lockless for UDP, so atomic_inc sk_tskey can race
> with calculating the difference. But this is a known issue. A process
> that cares should not run setsockopt and send concurrently. So this is
> fine too.

Oh, now I see. Thanks for the detailed explanation! So do you feel if
we need to take care of this in the future, I mean, after this series
gets merged...?

Thanks,
Jason
Willem de Bruijn Oct. 29, 2024, 7:45 p.m. UTC | #5
> > > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > > > > +{
> > > > > +     u32 tskey;
> > > > > +
> > > > > +     if (sk_is_tcp(sk)) {
> > > > > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > > > > +                     return -EINVAL;
> > > > > +
> > > > > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > > +                     tskey = tcp_sk(sk)->write_seq;
> > > > > +             else
> > > > > +                     tskey = tcp_sk(sk)->snd_una;
> > > > > +     } else {
> > > > > +             tskey = 0;
> > > > > +     }
> > > > > +
> > > > > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > > > > +             return 0;
> > > > > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > > > > +     } else {
> > > > > +             sk->sk_tskey_bpf_offset = 0;
> > > > > +     }
> > > > > +
> > > > > +     return tskey;
> > > > > +}
> > > > > +
> > > > >  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > > >  {
> > > > >       u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> > > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > > >
> > > > >       if (val & SOF_TIMESTAMPING_OPT_ID &&
> > > > >           !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > -             if (sk_is_tcp(sk)) {
> > > > > -                     if ((1 << sk->sk_state) &
> > > > > -                         (TCPF_CLOSE | TCPF_LISTEN))
> > > > > -                             return -EINVAL;
> > > > > -                     if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> > > > > -                     else
> > > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> > > > > -             } else {
> > > > > -                     atomic_set(&sk->sk_tskey, 0);
> > > > > -             }
> > > > > +             long int ret;
> > > > > +
> > > > > +             ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> > > > > +             if (ret <= 0)
> > > > > +                     return ret;
> > > > > +
> > > > > +             atomic_set(&sk->sk_tskey, ret);
> > > > >       }
> > > > >
> > > > >       return 0;
> > > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
> > > > >                                    struct so_timestamping timestamping)
> > > > >  {
> > > > >       u32 flags = timestamping.flags;
> > > > > +     int ret;
> > > > >
> > > > >       if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
> > > > >               return -EINVAL;
> > > > >
> > > > > +     ret = sock_set_tskey(sk, flags, 1);
> > > > > +     if (ret)
> > > > > +             return ret;
> > > > > +
> > > > >       WRITE_ONCE(sk->sk_tsflags_bpf, flags);
> > > > >
> > > > >       return 0;
> > > >
> > > > I'm a bit hazy on when this can be called. We can assume that this new
> > > > BPF operation cannot race with the existing setsockopt nor with the
> > > > datapath that might touch the atomic fields, right?
> > >
> > > It surely can race with the existing setsockopt.
> > >
> > > 1)
> > > if (only existing setsockopt works) {
> > >         then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0.
> > > }
> > >
> > > 2)
> > > if (only bpf setsockopt works) {
> > >         then sk->sk_tskey is set through bpf_setsockopt,
> > > sk_tskey_bpf_offset is 0.
> > > }
> > >
> > > 3)
> > > if (existing setsockopt already started, here we enable the bpf feature) {
> > >         then sk->sk_tskey will not change, but the sk_tskey_bpf_offset
> > > will be calculated.
> > > }
> > >
> > > 4)
> > > if (bpf setsockopt already started, here we enable the application feature) {
> > >         then sk->sk_tskey will re-initialized/overridden by
> > > setsockopt, and the sk_tskey_bpf_offset will be calculated.
> > > }
> 
> I will copy the above to the commit message next time in order to
> provide a clear design to future readers.
> 
> > >
> > > Then the skb tskey will use the sk->sk_tskey like before.
> >
> > I mean race as in the setsockopt and bpf setsockopt and datapath
> > running concurrently.
> >
> > As long as both variants of setsockopt hold the socket lock, that
> > won't happen.
> >
> > The datapath is lockless for UDP, so atomic_inc sk_tskey can race
> > with calculating the difference. But this is a known issue. A process
> > that cares should not run setsockopt and send concurrently. So this is
> > fine too.
> 
> Oh, now I see. Thanks for the detailed explanation! So Do you feel if
> we need to take care of this in the future, I mean, after this series
> gets merged...?

If there is a race condition, then that cannot be fixed up later.

But from my admittedly brief analysis, it seems that there is nothing
here that needs to be fixed: control plane operations (setsockopt)
hold the socket lock. A setsockopt that conflicts with a lockless
datapath update will have a slightly ambiguous offset. It is under
control of, and up to, the user to avoid that if they care.
Jason Xing Oct. 30, 2024, 3:27 a.m. UTC | #6
On Wed, Oct 30, 2024 at 3:45 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> > > > > > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > > > > > +{
> > > > > > +     u32 tskey;
> > > > > > +
> > > > > > +     if (sk_is_tcp(sk)) {
> > > > > > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > > > > > +                     return -EINVAL;
> > > > > > +
> > > > > > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > > > +                     tskey = tcp_sk(sk)->write_seq;
> > > > > > +             else
> > > > > > +                     tskey = tcp_sk(sk)->snd_una;
> > > > > > +     } else {
> > > > > > +             tskey = 0;
> > > > > > +     }
> > > > > > +
> > > > > > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > > > > > +             return 0;
> > > > > > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > > > > > +     } else {
> > > > > > +             sk->sk_tskey_bpf_offset = 0;
> > > > > > +     }
> > > > > > +
> > > > > > +     return tskey;
> > > > > > +}
> > > > > > +
> > > > > >  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > > > >  {
> > > > > >       u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
> > > > > > @@ -901,17 +944,13 @@ int sock_set_tskey(struct sock *sk, int val, int bpf_type)
> > > > > >
> > > > > >       if (val & SOF_TIMESTAMPING_OPT_ID &&
> > > > > >           !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > > > > > -             if (sk_is_tcp(sk)) {
> > > > > > -                     if ((1 << sk->sk_state) &
> > > > > > -                         (TCPF_CLOSE | TCPF_LISTEN))
> > > > > > -                             return -EINVAL;
> > > > > > -                     if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > > > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
> > > > > > -                     else
> > > > > > -                             atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
> > > > > > -             } else {
> > > > > > -                     atomic_set(&sk->sk_tskey, 0);
> > > > > > -             }
> > > > > > +             long int ret;
> > > > > > +
> > > > > > +             ret = sock_calculate_tskey_offset(sk, val, bpf_type);
> > > > > > +             if (ret <= 0)
> > > > > > +                     return ret;
> > > > > > +
> > > > > > +             atomic_set(&sk->sk_tskey, ret);
> > > > > >       }
> > > > > >
> > > > > >       return 0;
> > > > > > @@ -956,10 +995,15 @@ static int sock_set_timestamping_bpf(struct sock *sk,
> > > > > >                                    struct so_timestamping timestamping)
> > > > > >  {
> > > > > >       u32 flags = timestamping.flags;
> > > > > > +     int ret;
> > > > > >
> > > > > >       if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
> > > > > >               return -EINVAL;
> > > > > >
> > > > > > +     ret = sock_set_tskey(sk, flags, 1);
> > > > > > +     if (ret)
> > > > > > +             return ret;
> > > > > > +
> > > > > >       WRITE_ONCE(sk->sk_tsflags_bpf, flags);
> > > > > >
> > > > > >       return 0;
> > > > >
> > > > > I'm a bit hazy on when this can be called. We can assume that this new
> > > > > BPF operation cannot race with the existing setsockopt nor with the
> > > > > datapath that might touch the atomic fields, right?
> > > >
> > > > It surely can race with the existing setsockopt.
> > > >
> > > > 1)
> > > > if (only existing setsockopt works) {
> > > >         then sk->sk_tskey is set through setsockopt, sk_tskey_bpf_offset is 0.
> > > > }
> > > >
> > > > 2)
> > > > if (only bpf setsockopt works) {
> > > >         then sk->sk_tskey is set through bpf_setsockopt,
> > > > sk_tskey_bpf_offset is 0.
> > > > }
> > > >
> > > > 3)
> > > > if (existing setsockopt already started, here we enable the bpf feature) {
> > > >         then sk->sk_tskey will not change, but the sk_tskey_bpf_offset
> > > > will be calculated.
> > > > }
> > > >
> > > > 4)
> > > > if (bpf setsockopt already started, here we enable the application feature) {
> > > >         then sk->sk_tskey will re-initialized/overridden by
> > > > setsockopt, and the sk_tskey_bpf_offset will be calculated.
> > > > }
> >
> > I will copy the above to the commit message next time in order to
> > provide a clear design to future readers.
> >
> > > >
> > > > Then the skb tskey will use the sk->sk_tskey like before.
> > >
> > > I mean race as in the setsockopt and bpf setsockopt and datapath
> > > running concurrently.
> > >
> > > As long as both variants of setsockopt hold the socket lock, that
> > > won't happen.
> > >
> > > The datapath is lockless for UDP, so atomic_inc sk_tskey can race
> > > with calculating the difference. But this is a known issue. A process
> > > that cares should not run setsockopt and send concurrently. So this is
> > > fine too.
> >
> > Oh, now I see. Thanks for the detailed explanation! So Do you feel if
> > we need to take care of this in the future, I mean, after this series
> > gets merged...?
>
> If there is a race condition, then that cannot be fixed up later.
>
> But from my admittedly brief analysis, it seems that there is nothing
> here that needs to be fixed: control plane operations (setsockopt)
> hold the socket lock. A setsockopt that conflicts with a lockless
> datapath update will have a slightly ambiguous offset. It is under
> controlof and up to the user to avoid that if they care.

I got it. Thanks.
Martin KaFai Lau Oct. 30, 2024, 5:42 a.m. UTC | #7
On 10/28/24 4:05 AM, Jason Xing wrote:
> +/* Used to track the tskey for bpf extension
> + *
> + * @sk_tskey: bpf extension can use it only when no application uses.
> + *            Application can use it directly regardless of bpf extension.
> + *
> + * There are three strategies:
> + * 1) If we've already set through setsockopt() and here we're going to set
> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> + *    keep the record of delta between the current "key" and previous key.
> + * 2) If we've already set through bpf_setsockopt() and here we're going to
> + *    set for application use, we will record the delta first and then
> + *    override/initialize the @sk_tskey.
> + * 3) other cases, which means only either of them takes effect, so initialize
> + *    everything simplely.
> + */
> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> +{
> +	u32 tskey;
> +
> +	if (sk_is_tcp(sk)) {
> +		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> +			return -EINVAL;
> +
> +		if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> +			tskey = tcp_sk(sk)->write_seq;
> +		else
> +			tskey = tcp_sk(sk)->snd_una;
> +	} else {
> +		tskey = 0;
> +	}
> +
> +	if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> +		return 0;
> +	} else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> +		sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> +	} else {
> +		sk->sk_tskey_bpf_offset = 0;
> +	}
> +
> +	return tskey;
> +}

Before diving into this route, the bpf prog can peek into the tcp seq no in the 
skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why 
those are not enough information for the bpf prog?
Jason Xing Oct. 30, 2024, 6:50 a.m. UTC | #8
On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/28/24 4:05 AM, Jason Xing wrote:
> > +/* Used to track the tskey for bpf extension
> > + *
> > + * @sk_tskey: bpf extension can use it only when no application uses.
> > + *            Application can use it directly regardless of bpf extension.
> > + *
> > + * There are three strategies:
> > + * 1) If we've already set through setsockopt() and here we're going to set
> > + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> > + *    keep the record of delta between the current "key" and previous key.
> > + * 2) If we've already set through bpf_setsockopt() and here we're going to
> > + *    set for application use, we will record the delta first and then
> > + *    override/initialize the @sk_tskey.
> > + * 3) other cases, which means only either of them takes effect, so initialize
> > + *    everything simplely.
> > + */
> > +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > +{
> > +     u32 tskey;
> > +
> > +     if (sk_is_tcp(sk)) {
> > +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > +                     return -EINVAL;
> > +
> > +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > +                     tskey = tcp_sk(sk)->write_seq;
> > +             else
> > +                     tskey = tcp_sk(sk)->snd_una;
> > +     } else {
> > +             tskey = 0;
> > +     }
> > +
> > +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > +             return 0;
> > +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > +     } else {
> > +             sk->sk_tskey_bpf_offset = 0;
> > +     }
> > +
> > +     return tskey;
> > +}
>
> Before diving into this route, the bpf prog can peek into the tcp seq no in the
> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why
> those are not enough information for the bpf prog?

Well, it does make sense. It seems we don't need to implement tskey
for this bpf feature...

Due to lack of enough knowledge of bpf, could you provide more hints
that I can follow to write a bpf program to print more information
from the skb? Like in the last patch of this series, in
tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a
feasible way to do that?

Thanks,
Jason
Martin KaFai Lau Oct. 31, 2024, 1:17 a.m. UTC | #9
On 10/29/24 11:50 PM, Jason Xing wrote:
> On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 10/28/24 4:05 AM, Jason Xing wrote:
>>> +/* Used to track the tskey for bpf extension
>>> + *
>>> + * @sk_tskey: bpf extension can use it only when no application uses.
>>> + *            Application can use it directly regardless of bpf extension.
>>> + *
>>> + * There are three strategies:
>>> + * 1) If we've already set through setsockopt() and here we're going to set
>>> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
>>> + *    keep the record of delta between the current "key" and previous key.
>>> + * 2) If we've already set through bpf_setsockopt() and here we're going to
>>> + *    set for application use, we will record the delta first and then
>>> + *    override/initialize the @sk_tskey.
>>> + * 3) other cases, which means only either of them takes effect, so initialize
>>> + *    everything simplely.
>>> + */
>>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
>>> +{
>>> +     u32 tskey;
>>> +
>>> +     if (sk_is_tcp(sk)) {
>>> +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
>>> +                     return -EINVAL;
>>> +
>>> +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
>>> +                     tskey = tcp_sk(sk)->write_seq;
>>> +             else
>>> +                     tskey = tcp_sk(sk)->snd_una;
>>> +     } else {
>>> +             tskey = 0;
>>> +     }
>>> +
>>> +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
>>> +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
>>> +             return 0;
>>> +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
>>> +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
>>> +     } else {
>>> +             sk->sk_tskey_bpf_offset = 0;
>>> +     }
>>> +
>>> +     return tskey;
>>> +}
>>
>> Before diving into this route, the bpf prog can peek into the tcp seq no in the
>> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why
>> those are not enough information for the bpf prog?
> 
> Well, it does make sense. It seems we don't need to implement tskey
> for this bpf feature...
> 
> Due to lack of enough knowledge of bpf, could you provide more hints
> that I can follow to write a bpf program to print more information
> from the skb? Like in the last patch of this series, in
> tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a
> feasible way to do that?

The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg().
When running the bpf-prog@sendmsg(), the skb can be set to the "struct 
bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at 
bpf_skops_write_hdr_opt().

bpf prog cannot directly access the skops->skb now. It is because the sockops 
prog sees the uapi "struct bpf_sock_ops" instead of "struct 
bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It 
is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops".

Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a 
trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the 
skops->skb. afaik, the tcb->seq should be available already during sendmsg. it 
should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take 
a look at the existing examples of bpf_core_cast.

The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not 
available to skops program now but should be easy to expose.

The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED, 
SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to 
co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be 
served as that key.

All that said, while looking at tcp_tx_timestamp() again, there is always 
"shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be 
used directly as-is by the bpf prog. I think now I am missing why the bpf prog 
needs the sk_tskey in the sk?

In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the 
earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp 
at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map 
key-ed by seqno/tskey is probably what the selftest should do. In the future, we 
can consider allowing the rbtree in the bpf sk local storage for searching 
seqno. There is shinfo's hwtstamp that can be used also if there is a need.
Jason Xing Oct. 31, 2024, 2:41 a.m. UTC | #10
On Thu, Oct 31, 2024 at 9:17 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/29/24 11:50 PM, Jason Xing wrote:
> > On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >>
> >> On 10/28/24 4:05 AM, Jason Xing wrote:
> >>> +/* Used to track the tskey for bpf extension
> >>> + *
> >>> + * @sk_tskey: bpf extension can use it only when no application uses.
> >>> + *            Application can use it directly regardless of bpf extension.
> >>> + *
> >>> + * There are three strategies:
> >>> + * 1) If we've already set through setsockopt() and here we're going to set
> >>> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> >>> + *    keep the record of delta between the current "key" and previous key.
> >>> + * 2) If we've already set through bpf_setsockopt() and here we're going to
> >>> + *    set for application use, we will record the delta first and then
> >>> + *    override/initialize the @sk_tskey.
> >>> + * 3) other cases, which means only either of them takes effect, so initialize
> >>> + *    everything simplely.
> >>> + */
> >>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> >>> +{
> >>> +     u32 tskey;
> >>> +
> >>> +     if (sk_is_tcp(sk)) {
> >>> +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> >>> +                     return -EINVAL;
> >>> +
> >>> +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> >>> +                     tskey = tcp_sk(sk)->write_seq;
> >>> +             else
> >>> +                     tskey = tcp_sk(sk)->snd_una;
> >>> +     } else {
> >>> +             tskey = 0;
> >>> +     }
> >>> +
> >>> +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> >>> +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> >>> +             return 0;
> >>> +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> >>> +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> >>> +     } else {
> >>> +             sk->sk_tskey_bpf_offset = 0;
> >>> +     }
> >>> +
> >>> +     return tskey;
> >>> +}
> >>
> >> Before diving into this route, the bpf prog can peek into the tcp seq no in the
> >> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why
> >> those are not enough information for the bpf prog?
> >
> > Well, it does make sense. It seems we don't need to implement tskey
> > for this bpf feature...
> >
> > Due to lack of enough knowledge of bpf, could you provide more hints
> > that I can follow to write a bpf program to print more information
> > from the skb? Like in the last patch of this series, in
> > tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a
> > feasible way to do that?
>
> The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg().
> When running the bpf-prog@sendmsg(), the skb can be set to the "struct
> bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at
> bpf_skops_write_hdr_opt().

Thanks. I see the skb field in struct bpf_sock_ops_kern.

>
> bpf prog cannot directly access the skops->skb now. It is because the sockops
> prog sees the uapi "struct bpf_sock_ops" instead of "struct
> bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It
> is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops".

Oh, so it seems we cannot use this way, right?

I also noticed a use case that allow users to get the information from one skb:
"int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in
tools/testing/selftests/bpf/progs/netif_receive_skb.c
But it requires us to add the tracepoint in __skb_tstamp_tx() first.
Two months ago, I was planning to use a tracepoint for some people who
find it difficult to deploy bpf.

>
> Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a
> trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the
> skops->skb.

Let me spend some time on it. Thanks.

> afaik, the tcb->seq should be available already during sendmsg. it
> should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take
> a look at the existing examples of bpf_core_cast.
>
> The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not
> available to skops program now but should be easy to expose.

I wonder what the use of skb->data is here.

>
> The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED,
> SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to
> co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be
> served as that key.
>
> All that said, while looking at tcp_tx_timestamp() again, there is always
> "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be
> used directly as-is by the bpf prog. I think now I am missing why the bpf prog
> needs the sk_tskey in the sk?

As you said, tcp seqno could be treated as the key, but it leaks the
information in TCP layer to users. Please see the commit:
commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b
Author: Willem de Bruijn <willemb@google.com>
Date:   Mon Aug 4 22:11:49 2014 -0400

    net-timestamp: TCP timestamping
...
    - To avoid leaking the absolute seqno to userspace, the offset
    returned in ee_data must always be relative. It is an offset between
    an skb and sk field.

It has to be computed in the kernel before reporting to the user space, I think.

>
> In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the
> earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp
> at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map
> key-ed by seqno/tskey is probably what the selftest should do. In the future, we
> can consider allowing the rbtree in the bpf sk local storage for searching
> seqno. There is shinfo's hwtstamp that can be used also if there is a need.

Thanks for the information! Let me investigate how the bpf map works...

I wonder that for the selftests could it be much simpler if we just
record each timestamp stored in three variables and calculate them at
last since we only send the small packet once instead of using bpf
map. I mean, bpf map is really good as far as I know, but I'm a bit
worried that implementing such a function could cause more extra work
(implementation and review).
Jason Xing Oct. 31, 2024, 3:27 a.m. UTC | #11
On Thu, Oct 31, 2024 at 10:41 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Thu, Oct 31, 2024 at 9:17 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> >
> > On 10/29/24 11:50 PM, Jason Xing wrote:
> > > On Wed, Oct 30, 2024 at 1:42 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
> > >>
> > >> On 10/28/24 4:05 AM, Jason Xing wrote:
> > >>> +/* Used to track the tskey for bpf extension
> > >>> + *
> > >>> + * @sk_tskey: bpf extension can use it only when no application uses.
> > >>> + *            Application can use it directly regardless of bpf extension.
> > >>> + *
> > >>> + * There are three strategies:
> > >>> + * 1) If we've already set through setsockopt() and here we're going to set
> > >>> + *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
> > >>> + *    keep the record of delta between the current "key" and previous key.
> > >>> + * 2) If we've already set through bpf_setsockopt() and here we're going to
> > >>> + *    set for application use, we will record the delta first and then
> > >>> + *    override/initialize the @sk_tskey.
> > >>> + * 3) other cases, which means only either of them takes effect, so initialize
> > >>> + *    everything simplely.
> > >>> + */
> > >>> +static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
> > >>> +{
> > >>> +     u32 tskey;
> > >>> +
> > >>> +     if (sk_is_tcp(sk)) {
> > >>> +             if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
> > >>> +                     return -EINVAL;
> > >>> +
> > >>> +             if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
> > >>> +                     tskey = tcp_sk(sk)->write_seq;
> > >>> +             else
> > >>> +                     tskey = tcp_sk(sk)->snd_una;
> > >>> +     } else {
> > >>> +             tskey = 0;
> > >>> +     }
> > >>> +
> > >>> +     if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
> > >>> +             sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
> > >>> +             return 0;
> > >>> +     } else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
> > >>> +             sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
> > >>> +     } else {
> > >>> +             sk->sk_tskey_bpf_offset = 0;
> > >>> +     }
> > >>> +
> > >>> +     return tskey;
> > >>> +}
> > >>
> > >> Before diving into this route, the bpf prog can peek into the tcp seq no in the
> > >> skb. It can also look at the sk->sk_tskey for UDP socket. Can you explain why
> > >> those are not enough information for the bpf prog?
> > >
> > > Well, it does make sense. It seems we don't need to implement tskey
> > > for this bpf feature...
> > >
> > > Due to lack of enough knowledge of bpf, could you provide more hints
> > > that I can follow to write a bpf program to print more information
> > > from the skb? Like in the last patch of this series, in
> > > tools/testing/selftests/bpf/prog_tests/so_timestamping.c, do we have a
> > > feasible way to do that?
> >
> > The bpf-prog@sendmsg() will be run to capture a timestamp for sendmsg().
> > When running the bpf-prog@sendmsg(), the skb can be set to the "struct
> > bpf_sock_ops_kern sock_ops;" which is passed to the sockops prog. Take a look at
> > bpf_skops_write_hdr_opt().
>
> Thanks. I see the skb field in struct bpf_sock_ops_kern.
>
> >
> > bpf prog cannot directly access the skops->skb now. It is because the sockops
> > prog sees the uapi "struct bpf_sock_ops" instead of "struct
> > bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It
> > is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops".
>
> Oh, so it seems we cannot use this way, right?
>
> I also noticed a use case that allow users to get the information from one skb:
> "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in
> tools/testing/selftests/bpf/progs/netif_receive_skb.c
> But it requires us to add the tracepoint in __skb_tstamp_tx() first.
> Two months ago, I was planning to use a tracepoint for some people who
> find it difficult to deploy bpf.
>
> >
> > Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a
> > trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the
> > skops->skb.
>
> Let me spend some time on it. Thanks.
>
> > afaik, the tcb->seq should be available already during sendmsg. it
> > should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take
> > a look at the existing examples of bpf_core_cast.
> >
> > The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not
> > available to skops program now but should be easy to expose.
>
> I wonder what the use of skb->data is here.
>
> >
> > The bpf prog wants to calculate the delay between [sendmsg, SCHED], [SCHED,
> > SND], [SND, ACK]. It is why (at least in my mental model) a key is needed to
> > co-relate the sendmsg, SCHED, SND, and ACK timestamp. The tcp seqno could be
> > served as that key.
> >
> > All that said, while looking at tcp_tx_timestamp() again, there is always
> > "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be
> > used directly as-is by the bpf prog. I think now I am missing why the bpf prog
> > needs the sk_tskey in the sk?
>
> As you said, tcp seqno could be treated as the key, but it leaks the
> information in TCP layer to users. Please see the commit:
> commit 4ed2d765dfaccff5ebdac68e2064b59125033a3b
> Author: Willem de Bruijn <willemb@google.com>
> Date:   Mon Aug 4 22:11:49 2014 -0400
>
>     net-timestamp: TCP timestamping
> ...
>     - To avoid leaking the absolute seqno to userspace, the offset
>     returned in ee_data must always be relative. It is an offset between
>     an skb and sk field.
>
> It has to be computed in the kernel before reporting to the user space, I think.

Well, I'm thinking since the BPF program can only be used by _admin_,
we will not take any risk even if the raw seq is exported to the BPF
program.

Willem, I would like to know your opinions about this point (about
whether we can export the raw seqno or not). Thanks.

Thanks,
Jason
Martin KaFai Lau Oct. 31, 2024, 5:52 a.m. UTC | #12
On 10/30/24 7:41 PM, Jason Xing wrote:

>> All that said, while looking at tcp_tx_timestamp() again, there is always
>> "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be
>> used directly as-is by the bpf prog. I think now I am missing why the bpf prog
>> needs the sk_tskey in the sk?
> 
> As you said, tcp seqno could be treated as the key, but it leaks the
> information in TCP layer to users. Please see the commit:

I don't think it is a concern for bpf prog running in the kernel. The sockops 
bpf prog can already read the sk, the skb (which has seqno), and many others.

The bpf prog is not a print-only logic. Only using bpf prog to do raw data 
dumping is not fully utilizing its capability, e.g. data aggregation. The bpf 
prog should aggregate the data first which is to calculate the delay here.
Jason Xing Oct. 31, 2024, 6:16 a.m. UTC | #13
On Thu, Oct 31, 2024 at 1:52 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/30/24 7:41 PM, Jason Xing wrote:
>
> >> All that said, while looking at tcp_tx_timestamp() again, there is always
> >> "shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;". shinfo->tskey can be
> >> used directly as-is by the bpf prog. I think now I am missing why the bpf prog
> >> needs the sk_tskey in the sk?
> >
> > As you said, tcp seqno could be treated as the key, but it leaks the
> > information in TCP layer to users. Please see the commit:
>
> I don't think it is a concern for bpf prog running in the kernel. The sockops
> bpf prog can already read the sk, the skb (which has seqno), and many others.
>
> The bpf prog is not a print-only logic. Only using bpf prog to do raw data
> dumping is not fully utilizing its capability, e.g. data aggregation. The bpf
> prog should aggregate the data first which is to calculate the delay here.

Agree, I forgot BPF is only for admin, so it's a feasible solution. It
saves a lot of energy :)

It looks like the thing is getting simpler and simpler, which could be
mostly taken over by bpf itself at last. Good news!

Thanks,
Jason
Martin KaFai Lau Oct. 31, 2024, 11:50 p.m. UTC | #14
On 10/30/24 7:41 PM, Jason Xing wrote:
>> bpf prog cannot directly access the skops->skb now. It is because the sockops
>> prog sees the uapi "struct bpf_sock_ops" instead of "struct
>> bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It
>> is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops".
> 
> Oh, so it seems we cannot use this way, right?

No. don't extend the uapi "struct bpf_sock_ops". Use bpf_cast_to_kern_ctx() instead.

> 
> I also noticed a use case that allow users to get the information from one skb:
> "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in
> tools/testing/selftests/bpf/progs/netif_receive_skb.c
> But it requires us to add the tracepoint in __skb_tstamp_tx() first.
> Two months ago, I was planning to use a tracepoint for some people who
> find it difficult to deploy bpf.


It is a tracing prog instead of sockops prog. The verifier allows accessing 
different things based on the program type. This patch set is using the sockops 
bpf prog type which is not a tracing prog. Tracing can do a lot of read-only 
things but here we need write (e.g. bpf_setsockopt), so tracing prog is not 
suitable here.

> 
>>
>> Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a
>> trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the
>> skops->skb.
> 
> Let me spend some time on it. Thanks.

Take a look at the bpf_cast_to_kern_ctx() examples in selftests/bpf. I think 
this can be directly used to get to (struct bpf_sock_ops_kern *)skops->skb. Ping 
back if your selftest bpf prog cannot load.

> 
>> afaik, the tcb->seq should be available already during sendmsg. it
>> should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take
>> a look at the existing examples of bpf_core_cast.
>>
>> The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not
>> available to skops program now but should be easy to expose.
 > I wonder what the use of skb->data is here.

You are right, not needed. I was thinking it may need to parse the tcp header 
from the skb at the rx timestamping. It is not needed. The tcp stack should have 
already parsed it and TCP_SKB_CB can be directly used as long as the sockops 
prog can get to the skops->skb.

>>
>> In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the
>> earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp
>> at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map
>> key-ed by seqno/tskey is probably what the selftest should do. In the future, we
>> can consider allowing the rbtree in the bpf sk local storage for searching
>> seqno. There is shinfo's hwtstamp that can be used also if there is a need.
> 
> Thanks for the information! Let me investigate how the bpf map works...
> 
> I wonder whether, for the selftests, it could be much simpler if we just
> record each timestamp stored in three variables and calculate them at
> last since we only send the small packet once instead of using bpf
> map. I mean, bpf map is really good as far as I know, but I'm a bit
> worried that implementing such a function could cause more extra work
> (implementation and review).

Don't worry on the review side. imo, a closer to the real world selftest prog is 
actually helping the review process. It needs to test the tskey anyway and it 
needs to store somewhere. bpf map is pretty simple to use. I don't think it will 
make much difference in terms of complexity either.
Jason Xing Nov. 1, 2024, 6:33 a.m. UTC | #15
On Fri, Nov 1, 2024 at 7:50 AM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 10/30/24 7:41 PM, Jason Xing wrote:
> >> bpf prog cannot directly access the skops->skb now. It is because the sockops
> >> prog sees the uapi "struct bpf_sock_ops" instead of "struct
> >> bpf_sock_ops(_kern)". The conversion is done in sock_ops_convert_ctx_access. It
> >> is an old way before BTF. I don't want to extend the uapi "struct bpf_sock_ops".
> >
> > Oh, so it seems we cannot use this way, right?
>
> No. don't extend the uapi "struct bpf_sock_ops". Use bpf_cast_to_kern_ctx() instead.

Got it!

>
> >
> > I also noticed a use case that allows users to get the information from one skb:
> > "int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)" in
> > tools/testing/selftests/bpf/progs/netif_receive_skb.c
> > But it requires us to add the tracepoint in __skb_tstamp_tx() first.
> > Two months ago, I was planning to use a tracepoint for some people who
> > find it difficult to deploy bpf.
>
>
> It is a tracing prog instead of sockops prog. The verifier allows accessing
> different things based on the program type. This patch set is using the sockops
> bpf prog type which is not a tracing prog. Tracing can do a lot of read-only
> things but here we need write (e.g. bpf_setsockopt), so tracing prog is not
> suitable here.

Thanks for the explanation.

>
> >
> >>
> >> Instead, use bpf_cast_to_kern_ctx((struct bpf_sock_ops *)skops_ctx) to get a
> >> trusted "struct bpf_sock_ops(_kern) *skops" pointer. Then it can access the
> >> skops->skb.
> >
> > Let me spend some time on it. Thanks.
>
> Take a look at the bpf_cast_to_kern_ctx() examples in selftests/bpf. I think
> this can be directly used to get to (struct bpf_sock_ops_kern *)skops->skb. Ping
> back if your selftest bpf prog cannot load.

No problem :)

>
> >
> >> afaik, the tcb->seq should be available already during sendmsg. it
> >> should be able to get it from TCP_SKB_CB(skb)->seq with the bpf_core_cast. Take
> >> a look at the existing examples of bpf_core_cast.
> >>
> >> The same goes for the skb->data. It can use the bpf_dynptr_from_skb(). It is not
> >> available to skops program now but should be easy to expose.
>  > I wonder what the use of skb->data is here.
>
> You are right, not needed. I was thinking it may need to parse the tcp header
> from the skb at the rx timestamping. It is not needed. The tcp stack should have
> already parsed it and TCP_SKB_CB can be directly used as long as the sockops
> prog can get to the skops->skb.

Agreed.

>
> >>
> >> In the bpf prog, when the SCHED/SND/ACK timestamp comes back, it has to find the
> >> earlier sendmsg timestamp. One option is to store the earlier sendmsg timestamp
> >> at the bpf map key-ed by seqno or the shinfo's tskey. Storing in a bpf map
> >> key-ed by seqno/tskey is probably what the selftest should do. In the future, we
> >> can consider allowing the rbtree in the bpf sk local storage for searching
> >> seqno. There is shinfo's hwtstamp that can be used also if there is a need.
> >
> > Thanks for the information! Let me investigate how the bpf map works...
> >
> > I wonder whether, for the selftests, it could be much simpler if we just
> > record each timestamp stored in three variables and calculate them at
> > last since we only send the small packet once instead of using bpf
> > map. I mean, bpf map is really good as far as I know, but I'm a bit
> > worried that implementing such a function could cause more extra work
> > (implementation and review).
>
> Don't worry on the review side. imo, a closer to the real world selftest prog is
> actually helping the review process. It needs to test the tskey anyway and it
> needs to store somewhere. bpf map is pretty simple to use. I don't think it will
> make much difference in terms of complexity either.

Got it, will do it soon :)

Thanks,
Jason
diff mbox series

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 91398b20a4a3..41c6c6f78e55 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -469,6 +469,7 @@  struct sock {
 	unsigned long		sk_pacing_rate; /* bytes per second */
 	atomic_t		sk_zckey;
 	atomic_t		sk_tskey;
+	u32			sk_tskey_bpf_offset;
 	__cacheline_group_end(sock_write_tx);
 
 	__cacheline_group_begin(sock_read_tx);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0b571306f7ea..d1739317b97d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -5641,9 +5641,10 @@  void timestamp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args)
 }
 
 static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
+				     struct sk_buff *skb,
 				     struct skb_shared_hwtstamps *hwtstamps)
 {
-	u32 args[2] = {0, 0};
+	u32 args[3] = {0, 0, 0};
 	u32 tsflags, cb_flag;
 
 	tsflags = READ_ONCE(sk->sk_tsflags_bpf);
@@ -5672,7 +5673,15 @@  static void skb_tstamp_tx_output_bpf(struct sock *sk, int tstype,
 		args[1] = ts.tv_nsec;
 	}
 
-	timestamp_call_bpf(sk, cb_flag, 2, args);
+	if (tsflags & SOF_TIMESTAMPING_OPT_ID) {
+		args[2] = skb_shinfo(skb)->tskey;
+		if (sk_is_tcp(sk))
+			args[2] -= atomic_read(&sk->sk_tskey);
+		if (sk->sk_tskey_bpf_offset)
+			args[2] += sk->sk_tskey_bpf_offset;
+	}
+
+	timestamp_call_bpf(sk, cb_flag, 3, args);
 }
 
 void __skb_tstamp_tx(struct sk_buff *orig_skb,
@@ -5683,7 +5692,7 @@  void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!sk)
 		return;
 
-	skb_tstamp_tx_output_bpf(sk, tstype, hwtstamps);
+	skb_tstamp_tx_output_bpf(sk, tstype, orig_skb, hwtstamps);
 	skb_tstamp_tx_output(orig_skb, ack_skb, hwtstamps, sk, tstype);
 }
 EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
diff --git a/net/core/sock.c b/net/core/sock.c
index 42c1aba0b3fe..914ec8046f86 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -891,6 +891,49 @@  static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 	return 0;
 }
 
+/* Used to track the tskey for bpf extension
+ *
+ * @sk_tskey: bpf extension can use it only when no application uses it.
+ *            Application can use it directly regardless of bpf extension.
+ *
+ * There are three strategies:
+ * 1) If we've already set through setsockopt() and here we're going to set
+ *    OPT_ID for bpf use, we will not re-initialize the @sk_tskey and will
+ *    keep the record of delta between the current "key" and previous key.
+ * 2) If we've already set through bpf_setsockopt() and here we're going to
+ *    set for application use, we will record the delta first and then
+ *    override/initialize the @sk_tskey.
+ * 3) other cases, where only one of them takes effect, so initialize
+ *    everything simply.
+ */
+static long int sock_calculate_tskey_offset(struct sock *sk, int val, int bpf_type)
+{
+	u32 tskey;
+
+	if (sk_is_tcp(sk)) {
+		if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+			return -EINVAL;
+
+		if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
+			tskey = tcp_sk(sk)->write_seq;
+		else
+			tskey = tcp_sk(sk)->snd_una;
+	} else {
+		tskey = 0;
+	}
+
+	if (bpf_type && (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
+		sk->sk_tskey_bpf_offset = tskey - atomic_read(&sk->sk_tskey);
+		return 0;
+	} else if (!bpf_type && (sk->sk_tsflags_bpf & SOF_TIMESTAMPING_OPT_ID)) {
+		sk->sk_tskey_bpf_offset = atomic_read(&sk->sk_tskey) - tskey;
+	} else {
+		sk->sk_tskey_bpf_offset = 0;
+	}
+
+	return tskey;
+}
+
 int sock_set_tskey(struct sock *sk, int val, int bpf_type)
 {
 	u32 tsflags = bpf_type ? sk->sk_tsflags_bpf : sk->sk_tsflags;
@@ -901,17 +944,13 @@  int sock_set_tskey(struct sock *sk, int val, int bpf_type)
 
 	if (val & SOF_TIMESTAMPING_OPT_ID &&
 	    !(tsflags & SOF_TIMESTAMPING_OPT_ID)) {
-		if (sk_is_tcp(sk)) {
-			if ((1 << sk->sk_state) &
-			    (TCPF_CLOSE | TCPF_LISTEN))
-				return -EINVAL;
-			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
-				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
-			else
-				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
-		} else {
-			atomic_set(&sk->sk_tskey, 0);
-		}
+		long int ret;
+
+		ret = sock_calculate_tskey_offset(sk, val, bpf_type);
+		if (ret <= 0)
+			return ret;
+
+		atomic_set(&sk->sk_tskey, ret);
 	}
 
 	return 0;
@@ -956,10 +995,15 @@  static int sock_set_timestamping_bpf(struct sock *sk,
 				     struct so_timestamping timestamping)
 {
 	u32 flags = timestamping.flags;
+	int ret;
 
 	if (flags & ~SOF_TIMESTAMPING_BPF_SUPPPORTED_MASK)
 		return -EINVAL;
 
+	ret = sock_set_tskey(sk, flags, 1);
+	if (ret)
+		return ret;
+
 	WRITE_ONCE(sk->sk_tsflags_bpf, flags);
 
 	return 0;