
[net-next] tcp: annotate data-races around tp->window_clamp

Message ID 20240404114231.2195171-1-edumazet@google.com (mailing list archive)
State Accepted
Commit f410cbea9f3d2675b4c8e52af1d1985b11b387d1
Delegated to: Netdev Maintainers
Series [net-next] tcp: annotate data-races around tp->window_clamp

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 948 this patch: 948
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 5 maintainers not CCed: mptcp@lists.linux.dev dsahern@kernel.org matttbe@kernel.org martineau@kernel.org geliang@kernel.org
netdev/build_clang success Errors and warnings before: 957 this patch: 957
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 959 this patch: 959
netdev/checkpatch fail ERROR: spaces required around that ':' (ctx:WxV) WARNING: line length of 84 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2024-04-05--03-00 (tests: 951)

Commit Message

Eric Dumazet April 4, 2024, 11:42 a.m. UTC
tp->window_clamp can be read locklessly, add READ_ONCE()
and WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/ipv4/syncookies.c |  3 ++-
 net/ipv4/tcp.c        |  8 ++++----
 net/ipv4/tcp_input.c  | 17 ++++++++++-------
 net/ipv4/tcp_output.c | 18 ++++++++++--------
 net/ipv6/syncookies.c |  2 +-
 net/mptcp/protocol.c  |  2 +-
 net/mptcp/sockopt.c   |  2 +-
 7 files changed, 29 insertions(+), 23 deletions(-)
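
For readers unfamiliar with the pattern applied throughout the diff, here is a
minimal, self-contained sketch of the READ_ONCE()/WRITE_ONCE() pairing. The
names (example_sock, example_set_clamp, example_get_clamp) are made up for
illustration and are not kernel APIs:

#include <linux/compiler.h>     /* READ_ONCE() / WRITE_ONCE() */
#include <linux/types.h>        /* u32 */

struct example_sock {
        u32 window_clamp;       /* written under a lock, read locklessly */
};

/* Writer side: callers are assumed to hold the lock serializing updates. */
static void example_set_clamp(struct example_sock *es, u32 val)
{
        WRITE_ONCE(es->window_clamp, val);
}

/* Reader side: may run without that lock, so the load is annotated too. */
static u32 example_get_clamp(const struct example_sock *es)
{
        return READ_ONCE(es->window_clamp);
}

Both halves matter: a lockless READ_ONCE() only fully documents the data race
when every writer of the field also uses WRITE_ONCE(), which is why the patch
annotates stores that themselves run under the socket lock.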

Comments

Jason Xing April 5, 2024, 2:29 p.m. UTC | #1
On Thu, Apr 4, 2024 at 7:53 PM Eric Dumazet <edumazet@google.com> wrote:
>
> tp->window_clamp can be read locklessly, add READ_ONCE()
> and WRITE_ONCE() annotations.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> [...]
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 1b6cd384001202df5f8e8e8c73adff0db89ece63..8d44ab5671eacd4bc06647c7cca387a79e346618 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -563,19 +563,20 @@ static void tcp_init_buffer_space(struct sock *sk)
>         maxwin = tcp_full_space(sk);
>
>         if (tp->window_clamp >= maxwin) {

I wonder if it is necessary to protect the above line with READ_ONCE()
as well, because I saw full reader-side protection in
tcp_select_initial_window(). There are some other places like this.
Any special reason?

Thanks,
Jason

Eric Dumazet April 5, 2024, 2:49 p.m. UTC | #2
On Fri, Apr 5, 2024 at 4:29 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Thu, Apr 4, 2024 at 7:53 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > [...]
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 1b6cd384001202df5f8e8e8c73adff0db89ece63..8d44ab5671eacd4bc06647c7cca387a79e346618 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -563,19 +563,20 @@ static void tcp_init_buffer_space(struct sock *sk)
> >         maxwin = tcp_full_space(sk);
> >
> >         if (tp->window_clamp >= maxwin) {
>
> I wonder if it is necessary to protect the above line with READ_ONCE()
> as well, because I saw full reader-side protection in
> tcp_select_initial_window(). There are some other places like this.
> Any special reason?

We hold the socket lock at this point.

READ_ONCE() is only needed if another thread can potentially change
the value under us.
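
A hypothetical sketch of the distinction Eric describes, with made-up function
names (locked_update, lockless_peek); this is not code from the patch:

#include <net/tcp.h>    /* struct tcp_sock, tcp_sk() */

/* Caller holds the socket lock: no writer can race with us, so plain reads
 * of tp->window_clamp are fine; only the store is annotated, for the
 * benefit of lockless readers elsewhere.
 */
static void locked_update(struct sock *sk, u32 maxwin)
{
        struct tcp_sock *tp = tcp_sk(sk);

        if (tp->window_clamp >= maxwin)
                WRITE_ONCE(tp->window_clamp, maxwin);
}

/* Runs without the socket lock: a locked writer may change the field under
 * us at any time, so the load must be annotated.
 */
static u32 lockless_peek(const struct sock *sk)
{
        return READ_ONCE(tcp_sk(sk)->window_clamp);
}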
Jason Xing April 5, 2024, 2:57 p.m. UTC | #3
On Fri, Apr 5, 2024 at 10:49 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Fri, Apr 5, 2024 at 4:29 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > On Thu, Apr 4, 2024 at 7:53 PM Eric Dumazet <edumazet@google.com> wrote:
> > > [...]
> > > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > > index 1b6cd384001202df5f8e8e8c73adff0db89ece63..8d44ab5671eacd4bc06647c7cca387a79e346618 100644
> > > --- a/net/ipv4/tcp_input.c
> > > +++ b/net/ipv4/tcp_input.c
> > > @@ -563,19 +563,20 @@ static void tcp_init_buffer_space(struct sock *sk)
> > >         maxwin = tcp_full_space(sk);
> > >
> > >         if (tp->window_clamp >= maxwin) {
> >
> > I wonder if it is necessary to protect the above line with READ_ONCE()
> > as well, because I saw full reader-side protection in
> > tcp_select_initial_window(). There are some other places like this.
> > Any special reason?
>
> We hold the socket lock at this point.
>
> READ_ONCE() is only needed if another thread can potentially change
> the value under us.

Oh right, thanks. The socket will be locked as soon as the skb enters
the TCP layer.
Jason Xing April 5, 2024, 2:58 p.m. UTC | #4
On Thu, Apr 4, 2024 at 7:53 PM Eric Dumazet <edumazet@google.com> wrote:
>
> tp->window_clamp can be read locklessly, add READ_ONCE()
> and WRITE_ONCE() annotations.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>

Thanks!
patchwork-bot+netdevbpf@kernel.org April 6, 2024, 6:10 a.m. UTC | #5
Hello:

This patch was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:

On Thu,  4 Apr 2024 11:42:31 +0000 you wrote:
> tp->window_clamp can be read locklessly, add READ_ONCE()
> and WRITE_ONCE() annotations.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  net/ipv4/syncookies.c |  3 ++-
>  net/ipv4/tcp.c        |  8 ++++----
>  net/ipv4/tcp_input.c  | 17 ++++++++++-------
>  net/ipv4/tcp_output.c | 18 ++++++++++--------
>  net/ipv6/syncookies.c |  2 +-
>  net/mptcp/protocol.c  |  2 +-
>  net/mptcp/sockopt.c   |  2 +-
>  7 files changed, 29 insertions(+), 23 deletions(-)

Here is the summary with links:
  - [net-next] tcp: annotate data-races around tp->window_clamp
    https://git.kernel.org/netdev/net-next/c/f410cbea9f3d

You are awesome, thank you!

Patch

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 500f665f98cbce4a3d681f8e39ecd368fe4013b1..b61d36810fe3fd62b1e5c5885bbaf20185f1abf0 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -462,7 +462,8 @@  struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	}
 
 	/* Try to redo what tcp_v4_send_synack did. */
-	req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+	req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :
+				dst_metric(&rt->dst, RTAX_WINDOW);
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	full_space = tcp_full_space(sk);
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e767721b3a588b5d56567ae7badf5dffcd35a76a..92ee60492314a1483cfbfa2f73d32fcad5632773 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1721,7 +1721,7 @@  int tcp_set_rcvlowat(struct sock *sk, int val)
 	space = tcp_space_from_win(sk, val);
 	if (space > sk->sk_rcvbuf) {
 		WRITE_ONCE(sk->sk_rcvbuf, space);
-		tcp_sk(sk)->window_clamp = val;
+		WRITE_ONCE(tcp_sk(sk)->window_clamp, val);
 	}
 	return 0;
 }
@@ -3379,7 +3379,7 @@  int tcp_set_window_clamp(struct sock *sk, int val)
 	if (!val) {
 		if (sk->sk_state != TCP_CLOSE)
 			return -EINVAL;
-		tp->window_clamp = 0;
+		WRITE_ONCE(tp->window_clamp, 0);
 	} else {
 		u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp;
 		u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
@@ -3388,7 +3388,7 @@  int tcp_set_window_clamp(struct sock *sk, int val)
 		if (new_window_clamp == old_window_clamp)
 			return 0;
 
-		tp->window_clamp = new_window_clamp;
+		WRITE_ONCE(tp->window_clamp, new_window_clamp);
 		if (new_window_clamp < old_window_clamp) {
 			/* need to apply the reserved mem provisioning only
 			 * when shrinking the window clamp
@@ -4057,7 +4057,7 @@  int do_tcp_getsockopt(struct sock *sk, int level,
 				      TCP_RTO_MAX / HZ);
 		break;
 	case TCP_WINDOW_CLAMP:
-		val = tp->window_clamp;
+		val = READ_ONCE(tp->window_clamp);
 		break;
 	case TCP_INFO: {
 		struct tcp_info info;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1b6cd384001202df5f8e8e8c73adff0db89ece63..8d44ab5671eacd4bc06647c7cca387a79e346618 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -563,19 +563,20 @@  static void tcp_init_buffer_space(struct sock *sk)
 	maxwin = tcp_full_space(sk);
 
 	if (tp->window_clamp >= maxwin) {
-		tp->window_clamp = maxwin;
+		WRITE_ONCE(tp->window_clamp, maxwin);
 
 		if (tcp_app_win && maxwin > 4 * tp->advmss)
-			tp->window_clamp = max(maxwin -
-					       (maxwin >> tcp_app_win),
-					       4 * tp->advmss);
+			WRITE_ONCE(tp->window_clamp,
+				   max(maxwin - (maxwin >> tcp_app_win),
+				       4 * tp->advmss));
 	}
 
 	/* Force reservation of one segment. */
 	if (tcp_app_win &&
 	    tp->window_clamp > 2 * tp->advmss &&
 	    tp->window_clamp + tp->advmss > maxwin)
-		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+		WRITE_ONCE(tp->window_clamp,
+			   max(2 * tp->advmss, maxwin - tp->advmss));
 
 	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
 	tp->snd_cwnd_stamp = tcp_jiffies32;
@@ -773,7 +774,8 @@  void tcp_rcv_space_adjust(struct sock *sk)
 			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
 
 			/* Make the window clamp follow along.  */
-			tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
+			WRITE_ONCE(tp->window_clamp,
+				   tcp_win_from_space(sk, rcvbuf));
 		}
 	}
 	tp->rcvq_space.space = copied;
@@ -6426,7 +6428,8 @@  static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		if (!tp->rx_opt.wscale_ok) {
 			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
-			tp->window_clamp = min(tp->window_clamp, 65535U);
+			WRITE_ONCE(tp->window_clamp,
+				   min(tp->window_clamp, 65535U));
 		}
 
 		if (tp->rx_opt.saw_tstamp) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad965676facaacd8f82848c52cf966f97c3..9282fafc0e6109f3ac86d1641740f24588b2d75d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -203,16 +203,17 @@  static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
  * This MUST be enforced by all callers.
  */
 void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
-			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       __u32 *rcv_wnd, __u32 *__window_clamp,
 			       int wscale_ok, __u8 *rcv_wscale,
 			       __u32 init_rcv_wnd)
 {
 	unsigned int space = (__space < 0 ? 0 : __space);
+	u32 window_clamp = READ_ONCE(*__window_clamp);
 
 	/* If no clamp set the clamp to the max possible scaled window */
-	if (*window_clamp == 0)
-		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
-	space = min(*window_clamp, space);
+	if (window_clamp == 0)
+		window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+	space = min(window_clamp, space);
 
 	/* Quantize space offering to a multiple of mss if possible. */
 	if (space > mss)
@@ -239,12 +240,13 @@  void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
 		/* Set window scaling on max possible window */
 		space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
 		space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
-		space = min_t(u32, space, *window_clamp);
+		space = min_t(u32, space, window_clamp);
 		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
 				      0, TCP_MAX_WSCALE);
 	}
 	/* Set the clamp no higher than max representable value */
-	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+	WRITE_ONCE(*__window_clamp,
+		   min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
 }
 EXPORT_SYMBOL(tcp_select_initial_window);
 
@@ -3855,7 +3857,7 @@  static void tcp_connect_init(struct sock *sk)
 	tcp_ca_dst_init(sk, dst);
 
 	if (!tp->window_clamp)
-		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+		WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
 	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
 
 	tcp_initialize_rcv_mss(sk);
@@ -3863,7 +3865,7 @@  static void tcp_connect_init(struct sock *sk)
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
 	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
-		tp->window_clamp = tcp_full_space(sk);
+		WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
 
 	rcv_wnd = tcp_rwnd_init_bpf(sk);
 	if (rcv_wnd == 0)
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 6d8286c299c9d139938ef6751d9958c80d3031e9..bfad1e89b6a6bb99c28b9ef14c142a6c4aeae54b 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -246,7 +246,7 @@  struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
-	req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
+	req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW);
 	/* limit the window selection if the user enforce a smaller rx buffer */
 	full_space = tcp_full_space(sk);
 	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 3a1967bc7bad63d5a8a628b3f3b868e3a27baaca..3897a03bb8cb88f7869180b5ec261158e8e5d027 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2056,7 +2056,7 @@  static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
 				ssk = mptcp_subflow_tcp_sock(subflow);
 				slow = lock_sock_fast(ssk);
 				WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-				tcp_sk(ssk)->window_clamp = window_clamp;
+				WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
 				tcp_cleanup_rbuf(ssk, 1);
 				unlock_sock_fast(ssk, slow);
 			}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index dcd1c76d2a3ba1ccc31a3e9279f725cd6d433782..b702e994633788183ad95b2e12859ee6b60bf208 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1519,7 +1519,7 @@  int mptcp_set_rcvlowat(struct sock *sk, int val)
 
 		slow = lock_sock_fast(ssk);
 		WRITE_ONCE(ssk->sk_rcvbuf, space);
-		tcp_sk(ssk)->window_clamp = val;
+		WRITE_ONCE(tcp_sk(ssk)->window_clamp, val);
 		unlock_sock_fast(ssk, slow);
 	}
 	return 0;