[net,1/4] tcp: add tcp_done_with_error() helper

Message ID 20240524193630.2007563-2-edumazet@google.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series tcp: fix tcp_poll() races | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 946 this patch: 946
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang fail Errors and warnings before: 54 this patch: 54
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 971 this patch: 971
netdev/checkpatch warning CHECK: Please don't use multiple blank lines WARNING: 'carefuly' may be misspelled - perhaps 'carefully'?
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 4 this patch: 4
netdev/source_inline success Was 0 now: 0

Commit Message

Eric Dumazet May 24, 2024, 7:36 p.m. UTC
tcp_reset() ends with a sequence that is carefully ordered.

We need to fix [e]poll bugs in the following patches;
it makes sense to use a common helper.

Suggested-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/tcp.h    |  1 +
 net/ipv4/tcp.c       |  2 +-
 net/ipv4/tcp_input.c | 25 +++++++++++++++++--------
 3 files changed, 19 insertions(+), 9 deletions(-)

Comments

Neal Cardwell May 25, 2024, 2:14 p.m. UTC | #1
On Fri, May 24, 2024 at 3:36 PM Eric Dumazet <edumazet@google.com> wrote:
> [...]

Thanks, Eric!

Thinking about this more, I wonder if there is another aspect to this issue.

I am thinking about this part of tcp_done():

void tcp_done(struct sock *sk)
{
...
        sk->sk_shutdown = SHUTDOWN_MASK;

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_state_change(sk);

The tcp_poll() code reads sk->sk_shutdown to decide whether to set
EPOLLHUP and other bits. However, sk->sk_shutdown is not set until
here in tcp_done(). And in the tcp_done() code there is no smp_wmb()
to ensure that the sk->sk_shutdown write is visible to other CPUs before
tcp_done() calls sk->sk_state_change() to wake up threads sleeping on
sk->sk_wq.

So AFAICT we could have cases where this sk->sk_state_change() (or the
later sk_error_report()?) wakes a thread doing a tcp_poll() on another
CPU, and the tcp_poll() code may correctly see the sk->sk_err because
it was updated before the smp_wmb() in tcp_done_with_error(), but may
fail to see the "sk->sk_shutdown = SHUTDOWN_MASK" write because that
happened after the smp_wmb() in tcp_done_with_error().

So AFAICT maybe we need two changes?

(1) AFAICT the call to smp_wmb() should instead be inside
tcp_done(), after we set sk->sk_shutdown?

void tcp_done(struct sock *sk)
{
        ...
        sk->sk_shutdown = SHUTDOWN_MASK;

        /* Ensure previous writes to sk->sk_err, sk->sk_state,
         * sk->sk_shutdown are visible to others.
         * This barrier is coupled with smp_rmb() in tcp_poll()
         */
        smp_wmb();

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_state_change(sk);

(2) Correspondingly, AFAICT the tcp_poll() call to smp_rmb() should be
before tcp_poll() first reads sk->sk_shutdown, rather than right
before it reads sk->sk_err?

thanks,
neal
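
To make the interleaving Neal describes concrete, here is a minimal
userspace sketch (C11 atomics stand in for the kernel primitives; the
names, values and helper functions are illustrative, not taken from the
kernel sources):

#include <stdatomic.h>

static _Atomic int sk_err;       /* stands in for sk->sk_err */
static _Atomic int sk_shutdown;  /* stands in for sk->sk_shutdown */

static void writer(void)  /* ~ tcp_done_with_error() + tcp_done() */
{
        atomic_store_explicit(&sk_err, 104, memory_order_relaxed);
        /* ~ smp_wmb(): orders the sk_err store before later stores */
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&sk_shutdown, 3, memory_order_relaxed);
        /* the wakeup (~ sk->sk_state_change()) happens after this */
}

static void reader(void)  /* ~ tcp_poll() running after the wakeup */
{
        int shutdown = atomic_load_explicit(&sk_shutdown,
                                            memory_order_relaxed);
        atomic_thread_fence(memory_order_acquire);  /* ~ smp_rmb() */
        int err = atomic_load_explicit(&sk_err, memory_order_relaxed);

        /* err can be observed non-zero while shutdown is still 0:
         * the barrier pair only covers sk_err, because sk_shutdown
         * is stored after the release fence on the writer side.
         */
        (void)shutdown; (void)err;
}
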
Jason Xing May 27, 2024, 8:06 a.m. UTC | #2
Hi Neal,

On Sat, May 25, 2024 at 10:14 PM Neal Cardwell <ncardwell@google.com> wrote:
>
> On Fri, May 24, 2024 at 3:36 PM Eric Dumazet <edumazet@google.com> wrote:
> > [...]
>
> Thanks, Eric!
>
> Thinking about this more, I wonder if there is another aspect to this issue.
>
> I am thinking about this part of tcp_done():
>
> void tcp_done(struct sock *sk)
> {
> ...
>         sk->sk_shutdown = SHUTDOWN_MASK;
>
>         if (!sock_flag(sk, SOCK_DEAD))
>                 sk->sk_state_change(sk);
>
> The tcp_poll() code reads sk->sk_shutdown to decide whether to set
> EPOLLHUP and other bits. However, sk->sk_shutdown is not set until
> here in tcp_done(). And in the tcp_done() code there is no smp_wmb()
> to ensure that the sk->sk_shutdown is visible to other CPUs before
> tcp_done() calls sk->sk_state_change() to wake up threads sleeping on
> sk->sk_wq.
>
> So AFAICT we could have cases where this sk->sk_state_change() (or the
> later sk_error_report()?) wakes a thread doing a tcp_poll() on another
> CPU, and the tcp_poll() code may correctly see the sk->sk_err because
> it was updated before the smp_wmb() in tcp_done_with_error(), but may
> fail to see the "sk->sk_shutdown = SHUTDOWN_MASK" write because that
> happened after the smp_wmb() in tcp_done_with_error().

I agree. Accessing sk_shutdown with a pair of smp barriers ensures
that another CPU sees a consistent view of both sk_shutdown and
sk_err in tcp_poll().

>
> So AFAICT maybe we need two changes?
>
> (1) AFAICT the call to smp_wmb() should instead be inside
> tcp_done(), after we set sk->sk_shutdown?
>
> void tcp_done(struct sock *sk)
> {
>         ...
>         sk->sk_shutdown = SHUTDOWN_MASK;
>
>         /* Ensure previous writes to sk->sk_err, sk->sk_state,
>          * sk->sk_shutdown are visible to others.
>          * This barrier is coupled with smp_rmb() in tcp_poll()
>          */
>         smp_wmb();

I wonder if it would affect those callers that have no interest in
paired smp barriers, like tcp_v4_syn_recv_sock(). For those callers,
WRITE_ONCE()/READ_ONCE() alone is enough protection.

Thanks,
Jason

>
>         if (!sock_flag(sk, SOCK_DEAD))
>                 sk->sk_state_change(sk);
>
> (2) Correspondingly, AFAICT the tcp_poll() call to smp_rmb() should be
> before tcp_poll() first reads sk->sk_shutdown, rather than right
> before it reads sk->sk_err?
>
> thanks,
> neal
>
Eric Dumazet May 27, 2024, 8:53 a.m. UTC | #3
On Sat, May 25, 2024 at 4:14 PM Neal Cardwell <ncardwell@google.com> wrote:
>
> On Fri, May 24, 2024 at 3:36 PM Eric Dumazet <edumazet@google.com> wrote:
> > [...]
>
> Thanks, Eric!
>
> Thinking about this more, I wonder if there is another aspect to this issue.
>
> I am thinking about this part of tcp_done():
>
> void tcp_done(struct sock *sk)
> {
> ...
>         sk->sk_shutdown = SHUTDOWN_MASK;
>
>         if (!sock_flag(sk, SOCK_DEAD))
>                 sk->sk_state_change(sk);
>
> The tcp_poll() code reads sk->sk_shutdown to decide whether to set
> EPOLLHUP and other bits. However, sk->sk_shutdown is not set until
> here in tcp_done(). And in the tcp_done() code there is no smp_wmb()
> to ensure that the sk->sk_shutdown write is visible to other CPUs before
> tcp_done() calls sk->sk_state_change() to wake up threads sleeping on
> sk->sk_wq.
>
> So AFAICT we could have cases where this sk->sk_state_change() (or the
> later sk_error_report()?) wakes a thread doing a tcp_poll() on another
> CPU, and the tcp_poll() code may correctly see the sk->sk_err because
> it was updated before the smp_wmb() in tcp_done_with_error(), but may
> fail to see the "sk->sk_shutdown = SHUTDOWN_MASK" write because that
> happened after the smp_wmb() in tcp_done_with_error().
>
> So AFAICT maybe we need two changes?

This seems orthogonal, and should be discussed in a different patch series ?
I am afraid of the additional implications of your proposal.

Applications react to EPOLLERR by getting the (termination) error
code, they don't really _need_ EPOLLHUP at this stage to behave
correctly.

And EPOLLERR got a fix more than a decade ago; nobody complained
there was an issue with sk_shutdown.

commit a4d258036ed9b2a1811c3670c6099203a0f284a0
Author: Tom Marshall <tdm.code@gmail.com>
Date:   Mon Sep 20 15:42:05 2010 -0700

    tcp: Fix race in tcp_poll

Notice how Tom moved the sk_err read to the end of tcp_poll().
It is not possible with smp_wmb() and smp_rmb() alone to make
sure tcp_poll() gets a consistent view.
There are too many variables to snapshot in a consistent way.

Only packetdrill has an issue here, because it expects a precise
combination of flags.

Would you prefer to change packetdrill to accept both possible values?

   +0 epoll_ctl(5, EPOLL_CTL_ADD, 4, {events=EPOLLERR, fd=4}) = 0

// This is the part that would need a change:
// something like +0...11 epoll_wait(5, {events=EPOLLERR [| EPOLLHUP], fd=4}, 1, 15000) = 1 ??
  +0...11 epoll_wait(5, {events=EPOLLERR|EPOLLHUP, fd=4}, 1, 15000) = 1
// Verify keepalive behavior looks correct, given the parameters above:
// Start sending keepalive probes after 3 seconds of idle.
   +3 > . 0:0(0) ack 1
// Send keepalive probes every 2 seconds.
   +2 > . 0:0(0) ack 1
   +2 > . 0:0(0) ack 1
   +2 > . 0:0(0) ack 1
   +2 > R. 1:1(0) ack 1
// Sent 4 keepalive probes and then gave up and reset the connection.
// Verify that we get the expected error when we try to use the socket:
   +0 read(4, ..., 1000) = -1 ETIMEDOUT (Connection timed out)

In 100 runs, I get only 1 flake (but I could probably get more if I
added an ndelay(1000) before tcp_done() in an unpatched kernel):
keepalive-with-ts.pkt:44: runtime error in epoll_wait call:
epoll_event->events does not match script: expected: 0x18 actual: 0x8


>
> (1) AFAICT the call to smp_wmb() should instead be inside
> tcp_done(), after we set sk->sk_shutdown?
>
> void tcp_done(struct sock *sk)
> {
>         ...
>         sk->sk_shutdown = SHUTDOWN_MASK;
>
>         /* Ensure previous writes to sk->sk_err, sk->sk_state,
>          * sk->sk_shutdown are visible to others.
>          * This barrier is coupled with smp_rmb() in tcp_poll()
>          */
>         smp_wmb();

Not adding an smp_wmb() _right_ after the write to sk_err
might bring back the race that Tom wanted to fix in 2010.

(That was a poll() system call, not a callback initiated from the TCP
stack itself with sk_error_report(); the latter does not need barriers,
because sock_def_error_report() implies a lot of barriers before the
user thread can call tcp_poll().)

>
>         if (!sock_flag(sk, SOCK_DEAD))
>                 sk->sk_state_change(sk);
>
> (2) Correspondingly, AFAICT the tcp_poll() call to smp_rmb() should be
> before tcp_poll() first reads sk->sk_shutdown, rather than right
> before it reads sk->sk_err?
>
> thanks,
> neal
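
For readers without the 2010 commit at hand, the shape of Tom's fix,
which tcp_poll() still has today, is roughly the following (heavily
simplified from net/ipv4/tcp.c; the readable/writable logic and most
state handling are elided):

static __poll_t tcp_poll_sketch(struct sock *sk)
{
        __poll_t mask = 0;
        u8 shutdown = READ_ONCE(sk->sk_shutdown);

        if (shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
                mask |= EPOLLHUP;
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;

        /* ... readable/writable checks ... */

        /* coupled with smp_wmb() in tcp_done_with_error() */
        smp_rmb();
        if (READ_ONCE(sk->sk_err) ||
            !skb_queue_empty_lockless(&sk->sk_error_queue))
                mask |= EPOLLERR;

        return mask;
}

The point of sampling sk_err last, after the barrier, is that a writer
which stores sk_err and then wakes the poller cannot have its error
missed; Neal's observation is that sk_shutdown gets no such treatment.
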
Eric Dumazet May 27, 2024, 8:56 a.m. UTC | #4
On Mon, May 27, 2024 at 10:07 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> Hi Neal,
>
> On Sat, May 25, 2024 at 10:14 PM Neal Cardwell <ncardwell@google.com> wrote:
> > [...]
> >
> > So AFAICT maybe we need two changes?
> >
> > (1) AFAICT the call to smp_wmb() should instead be inside
> > tcp_done(), after we set sk->sk_shutdown?
> >
> > void tcp_done(struct sock *sk)
> > {
> >         ...
> >         sk->sk_shutdown = SHUTDOWN_MASK;
> >
> >         /* Ensure previous writes to sk->sk_err, sk->sk_state,
> >          * sk->sk_shutdown are visible to others.
> >          * This barrier is coupled with smp_rmb() in tcp_poll()
> >          */
> >         smp_wmb();
>
> I wonder if it would affect those callers that have no interest in
> paired smp barriers, like tcp_v4_syn_recv_sock(). For those callers,
> WRITE_ONCE()/READ_ONCE() alone is enough protection.

WRITE_ONCE()/READ_ONCE() and smp_rmb()/smp_wmb() have different purposes.

smp_rmb()/smp_wmb() are an order of magnitude more expensive than
WRITE_ONCE()/READ_ONCE().

You should use them only when absolutely needed.
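
For reference, the distinction can be sketched like this (simplified;
the real definitions live in include/asm-generic/rwonce.h and the
per-arch barrier.h files, and the macro names below are made up for
illustration):

/* Compiler-level only: forces a single, non-torn access and stops
 * the compiler from caching or refetching the value. No fence
 * instruction is emitted.
 */
#define ONCE_STORE(x, val)  (*(volatile typeof(x) *)&(x) = (val))
#define ONCE_LOAD(x)        (*(volatile typeof(x) *)&(x))

/* smp_wmb()/smp_rmb() constrain the CPU's store/load ordering: on
 * strongly ordered x86 they reduce to a compiler barrier, but on
 * weakly ordered CPUs such as arm64 they emit real fence
 * instructions (dmb ishst / dmb ishld), which is where the cost
 * Eric mentions comes from.
 */

So a lone WRITE_ONCE() is essentially free, and a barrier pair is only
worth paying for when several fields must be observed consistently, as
with sk_err and its neighbours in tcp_poll().
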
Jason Xing May 27, 2024, 12:50 p.m. UTC | #5
On Mon, May 27, 2024 at 4:56 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, May 27, 2024 at 10:07 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
> > [...]
> >
> > I wonder if it would affect those callers that have no interest in
> > paired smp barriers, like tcp_v4_syn_recv_sock(). For those callers,
> > WRITE_ONCE()/READ_ONCE() alone is enough protection.
>
> WRITE_ONCE()/READ_ONCE() and smp_rmb()/smp_wmb() have different purposes.
>
> smp_rmb()/smp_wmb() are an order of magnitude more expensive than
> WRITE_ONCE()/READ_ONCE().
>
> You should use them only when absolutely needed.

Sure, I know them. What I was trying to say is that putting an
smp_wmb() into tcp_done() is not appropriate, because other callers
don't need this expensive protection for sk_shutdown, which can be
protected with WRITE_ONCE()/READ_ONCE().

Thanks,
Jason
Paolo Abeni May 28, 2024, 10:41 a.m. UTC | #6
On Mon, 2024-05-27 at 10:53 +0200, Eric Dumazet wrote:
> [...]

Waiting for Neal's ack.

FTR I think the new helper introduction is worthwhile even just for
the consistency it brings.

IIRC there is some extra complexity in the MPTCP code to correctly
handle receiving the sk_error_report/sk_state_change callback pair in
both possible orders.

Cheers,

Paolo
Eric Dumazet May 28, 2024, 11:31 a.m. UTC | #7
On Tue, May 28, 2024 at 12:41 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
>
> Waiting for Neal's ack.
>
> FTR I think the new helper introduction is worthwhile even just for
> the consistency it brings.
>
> IIRC there is some extra complexity in the MPTCP code to correctly
> handle receiving the sk_error_report/sk_state_change callback pair in
> both possible orders.

Would you prefer me to base the series on net-next then?
Paolo Abeni May 28, 2024, 11:50 a.m. UTC | #8
On Tue, 2024-05-28 at 13:31 +0200, Eric Dumazet wrote:
> On Tue, May 28, 2024 at 12:41 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > [...]
> 
> Would you prefer me to base the series on net-next then?

Now that you make me think about it, net-next would be preferable to
handle possible MPTCP follow-ups (if any). And the addressed issues
are so old that they should not make a difference in practice, right?

So if it's not a problem for you to move the patches to a different
tree, net-next would be a good option, thanks!

Paolo
Eric Dumazet May 28, 2024, 12:22 p.m. UTC | #9
On Tue, May 28, 2024 at 1:50 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Tue, 2024-05-28 at 13:31 +0200, Eric Dumazet wrote:
> > On Tue, May 28, 2024 at 12:41 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > > [...]
> >
> > Would you prefer me to base the series on net-next then?
>
> Now that you make me think about it, net-next would be preferable to
> handle possible MPTCP follow-ups (if any). And the addressed issues
> are so old that they should not make a difference in practice, right?
>
> So if it's not a problem for you to move the patches to a different
> tree, net-next would be a good option, thanks!

Sure, these are minor changes I think, only nice to have.

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 060e95b331a286ad7c355be11dc03250d2944920..2e7150f6755a5f5bf7b45454da0b33c5fac78183 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -677,6 +677,7 @@  void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 /* tcp_input.c */
 void tcp_rearm_rto(struct sock *sk);
 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
+void tcp_done_with_error(struct sock *sk);
 void tcp_reset(struct sock *sk, struct sk_buff *skb);
 void tcp_fin(struct sock *sk);
 void tcp_check_space(struct sock *sk);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 681b54e1f3a64387787738ab6495531b8abe1771..2a8f8d8676ff1d30ea9f8cd47ccf9236940eb299 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -598,7 +598,7 @@  __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 		 */
 		mask |= EPOLLOUT | EPOLLWRNORM;
 	}
-	/* This barrier is coupled with smp_wmb() in tcp_reset() */
+	/* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
 	smp_rmb();
 	if (READ_ONCE(sk->sk_err) ||
 	    !skb_queue_empty_lockless(&sk->sk_error_queue))
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9c04a9c8be9dfaa0ec2437b3748284e57588b216..5af716f1bc74e095d22f64d605624decfe27cefe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4436,6 +4436,22 @@  static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp,
 	return SKB_NOT_DROPPED_YET;
 }
 
+
+void tcp_done_with_error(struct sock *sk)
+{
+	/* Our caller wrote a value into sk->sk_err.
+	 * This barrier is coupled with smp_rmb() in tcp_poll()
+	 */
+	smp_wmb();
+
+	tcp_write_queue_purge(sk);
+	tcp_done(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk_error_report(sk);
+}
+EXPORT_SYMBOL(tcp_done_with_error);
+
 /* When we get a reset we do this. */
 void tcp_reset(struct sock *sk, struct sk_buff *skb)
 {
@@ -4460,14 +4476,7 @@  void tcp_reset(struct sock *sk, struct sk_buff *skb)
 	default:
 		WRITE_ONCE(sk->sk_err, ECONNRESET);
 	}
-	/* This barrier is coupled with smp_rmb() in tcp_poll() */
-	smp_wmb();
-
-	tcp_write_queue_purge(sk);
-	tcp_done(sk);
-
-	if (!sock_flag(sk, SOCK_DEAD))
-		sk_error_report(sk);
+	tcp_done_with_error(sk);
 }
 
 /*
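
As a usage note for the follow-up fixes this series promises, a
hypothetical caller of the new helper would look like the sketch below
(the function name is made up; the pattern simply mirrors what
tcp_reset() does after this patch):

/* Publish the error first, then let the helper supply the smp_wmb(),
 * the write-queue purge, tcp_done() and sk_error_report() in the
 * carefully ordered sequence.
 */
static void example_abort_with_error(struct sock *sk, int err)
{
        WRITE_ONCE(sk->sk_err, err);
        tcp_done_with_error(sk);
}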