diff mbox series

[net-next,2/9] mptcp: track some aggregate data counters

Message ID 20230620-upstream-net-next-20230620-mptcp-expose-more-info-and-misc-v1-2-62b9444bfd48@tessares.net (mailing list archive)
State Accepted
Commit 38967f424b5be79c4c676712e5640d846efd07e3
Delegated to: Netdev Maintainers
Headers show
Series mptcp: expose more info and small improvements | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 13 this patch: 13
netdev/cc_maintainers success CCed 8 of 8 maintainers
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 13 this patch: 13
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 145 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Matthieu Baerts June 20, 2023, 4:30 p.m. UTC
From: Paolo Abeni <pabeni@redhat.com>

Currently there are no data transfer counters accounting for all
the subflows used by a given MPTCP socket. The user-space can compute
such figures aggregating the subflow info, but that is inaccurate
if any subflow is closed before the MPTCP socket itself.

Add the new counters in the MPTCP socket itself and expose them
via the existing diag and sockopt. While touching mptcp_diag_fill_info(),
acquire the relevant locks before fetching the msk data, to ensure
better data consistency

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
---
 include/uapi/linux/mptcp.h |  5 +++++
 net/mptcp/options.c        | 10 ++++++++--
 net/mptcp/protocol.c       | 11 ++++++++++-
 net/mptcp/protocol.h       |  4 ++++
 net/mptcp/sockopt.c        | 25 ++++++++++++++++++++-----
 5 files changed, 47 insertions(+), 8 deletions(-)

Comments

Eric Dumazet June 23, 2023, 2:25 p.m. UTC | #1
On Tue, Jun 20, 2023 at 6:30 PM Matthieu Baerts
<matthieu.baerts@tessares.net> wrote:
>
> From: Paolo Abeni <pabeni@redhat.com>
>
> Currently there are no data transfer counters accounting for all
> the subflows used by a given MPTCP socket. The user-space can compute
> such figures aggregating the subflow info, but that is inaccurate
> if any subflow is closed before the MPTCP socket itself.
>
> Add the new counters in the MPTCP socket itself and expose them
> via the existing diag and sockopt. While touching mptcp_diag_fill_info(),
> acquire the relevant locks before fetching the msk data, to ensure
> better data consistency
>
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> ---
>  include/uapi/linux/mptcp.h |  5 +++++
>  net/mptcp/options.c        | 10 ++++++++--
>  net/mptcp/protocol.c       | 11 ++++++++++-
>  net/mptcp/protocol.h       |  4 ++++
>  net/mptcp/sockopt.c        | 25 ++++++++++++++++++++-----
>  5 files changed, 47 insertions(+), 8 deletions(-)
>
> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> index 32af2d278cb4..a124be6ebbba 100644
> --- a/include/uapi/linux/mptcp.h
> +++ b/include/uapi/linux/mptcp.h
> @@ -123,6 +123,11 @@ struct mptcp_info {
>         __u8    mptcpi_local_addr_used;
>         __u8    mptcpi_local_addr_max;
>         __u8    mptcpi_csum_enabled;
> +       __u32   mptcpi_retransmits;
> +       __u64   mptcpi_bytes_retrans;
> +       __u64   mptcpi_bytes_sent;
> +       __u64   mptcpi_bytes_received;
> +       __u64   mptcpi_bytes_acked;
>  };
>
>  /*
> diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> index 4bdcd2b326bd..c254accb14de 100644
> --- a/net/mptcp/options.c
> +++ b/net/mptcp/options.c
> @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
>         return cur_seq;
>  }
>
> +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
> +{
> +       msk->bytes_acked += new_snd_una - msk->snd_una;
> +       msk->snd_una = new_snd_una;
> +}
> +
>  static void ack_update_msk(struct mptcp_sock *msk,
>                            struct sock *ssk,
>                            struct mptcp_options_received *mp_opt)
> @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
>                 __mptcp_check_push(sk, ssk);
>
>         if (after64(new_snd_una, old_snd_una)) {
> -               msk->snd_una = new_snd_una;
> +               __mptcp_snd_una_update(msk, new_snd_una);
>                 __mptcp_data_acked(sk);
>         }
>         mptcp_data_unlock(sk);
> @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
>                 /* on fallback we just need to ignore the msk-level snd_una, as
>                  * this is really plain TCP
>                  */
> -               msk->snd_una = READ_ONCE(msk->snd_nxt);
> +               __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
>
>                 __mptcp_data_acked(subflow->conn);
>                 mptcp_data_unlock(subflow->conn);
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 9c756d675d4d..d5b8e488bce1 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
>
>         if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
>                 /* in sequence */
> +               msk->bytes_received += copy_len;
>                 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
>                 tail = skb_peek_tail(&sk->sk_receive_queue);
>                 if (tail && mptcp_try_coalesce(sk, tail, skb))
> @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
>                         MPTCP_SKB_CB(skb)->map_seq += delta;
>                         __skb_queue_tail(&sk->sk_receive_queue, skb);
>                 }
> +               msk->bytes_received += end_seq - msk->ack_seq;
>                 msk->ack_seq = end_seq;
>                 moved = true;
>         }
> @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
>          * that has been handed to the subflow for transmission
>          * and skip update in case it was old dfrag.
>          */
> -       if (likely(after64(snd_nxt_new, msk->snd_nxt)))
> +       if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
> +               msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
>                 msk->snd_nxt = snd_nxt_new;
> +       }
>  }
>
>  void mptcp_check_and_set_pending(struct sock *sk)
> @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk)
>         }
>         if (copied) {
>                 dfrag->already_sent = max(dfrag->already_sent, info.sent);
> +               msk->bytes_retrans += copied;
>                 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
>                          info.size_goal);
>                 WRITE_ONCE(msk->allow_infinite_fallback, false);
> @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
>         WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
>         mptcp_pm_data_reset(msk);
>         mptcp_ca_reset(sk);
> +       msk->bytes_acked = 0;
> +       msk->bytes_received = 0;
> +       msk->bytes_sent = 0;
> +       msk->bytes_retrans = 0;
>
>         WRITE_ONCE(sk->sk_shutdown, 0);
>         sk_error_report(sk);
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 47b46602870e..27adfcc5aaa2 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -262,10 +262,13 @@ struct mptcp_sock {
>         u64             local_key;
>         u64             remote_key;
>         u64             write_seq;
> +       u64             bytes_sent;
>         u64             snd_nxt;
> +       u64             bytes_received;
>         u64             ack_seq;
>         atomic64_t      rcv_wnd_sent;
>         u64             rcv_data_fin_seq;
> +       u64             bytes_retrans;
>         int             rmem_fwd_alloc;
>         struct sock     *last_snd;
>         int             snd_burst;
> @@ -274,6 +277,7 @@ struct mptcp_sock {
>                                                  * recovery related fields are under data_lock
>                                                  * protection
>                                                  */
> +       u64             bytes_acked;
>         u64             snd_una;
>         u64             wnd_end;
>         unsigned long   timer_ival;
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index e172a5848b0d..fa5055d5b029 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -889,7 +889,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
>
>  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>  {
> +       struct sock *sk = (struct sock *)msk;
>         u32 flags = 0;
> +       bool slow;
>
>         memset(info, 0, sizeof(*info));
>
> @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>         info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
>         info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
>
> +       if (inet_sk_state_load(sk) == TCP_LISTEN)
> +               return;
> +
>         /* The following limits only make sense for the in-kernel PM */
>         if (mptcp_pm_is_kernel(msk)) {
>                 info->mptcpi_subflows_max =
> @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>         if (READ_ONCE(msk->can_ack))
>                 flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
>         info->mptcpi_flags = flags;
> -       info->mptcpi_token = READ_ONCE(msk->token);
> -       info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> -       info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
> -       info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> -       info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
> +       mptcp_data_lock(sk);
> +       info->mptcpi_snd_una = msk->snd_una;
> +       info->mptcpi_rcv_nxt = msk->ack_seq;
> +       info->mptcpi_bytes_acked = msk->bytes_acked;
> +       mptcp_data_unlock(sk);
> +
> +       slow = lock_sock_fast(sk);

This causes a lockdep issue.

lock_sock_fast(sk) could sleep, if socket lock is owned by another process.

But we are called from a context where both a spin lock and
rcu_read_lock() are held.

BUG: sleeping function called from invalid context at net/core/sock.c:3549
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 316, name: syz-executor.4
preempt_count: 1, expected: 0
RCU nest depth: 1, expected: 0
7 locks held by syz-executor.4/316:
#0: ffffffff8e125408 (sock_diag_mutex){+.+.}-{3:3}, at:
sock_diag_rcv+0x1b/0x40 net/core/sock_diag.c:279
#1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
sock_diag_rcv_msg net/core/sock_diag.c:259 [inline]
#1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
sock_diag_rcv_msg+0x2d2/0x440 net/core/sock_diag.c:248
#2: ffff8880232bb688 (nlk_cb_mutex-SOCK_DIAG){+.+.}-{3:3}, at:
netlink_dump+0xbe/0xc50 net/netlink/af_netlink.c:2215
#3: ffffffff8e29a628 (inet_diag_table_mutex){+.+.}-{3:3}, at:
inet_diag_lock_handler+0x6e/0x100 net/ipv4/inet_diag.c:63
#4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:95 [inline]
#4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
mptcp_diag_dump+0x7c8/0x1330 net/mptcp/mptcp_diag.c:197
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
include/linux/spinlock.h:350 [inline]
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:98 [inline]
#5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
mptcp_diag_dump+0x838/0x1330 net/mptcp/mptcp_diag.c:197
#6: ffff88802c42a5f0 (msk_lock-AF_INET){+.+.}-{0:0}, at:
mptcp_diag_get_info+0x1ae/0x380 net/mptcp/mptcp_diag.c:224
Preemption disabled at:
[<0000000000000000>] 0x0

> +       info->mptcpi_csum_enabled = msk->csum_enabled;
> +       info->mptcpi_token = msk->token;
> +       info->mptcpi_write_seq = msk->write_seq;
> +       info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
> +       info->mptcpi_bytes_sent = msk->bytes_sent;
> +       info->mptcpi_bytes_received = msk->bytes_received;
> +       info->mptcpi_bytes_retrans = msk->bytes_retrans;
> +       unlock_sock_fast(sk, slow);
>  }
>  EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
>
>
> --
> 2.40.1
>
Paolo Abeni June 23, 2023, 2:41 p.m. UTC | #2
Hi,

On Fri, 2023-06-23 at 16:25 +0200, Eric Dumazet wrote:
> On Tue, Jun 20, 2023 at 6:30 PM Matthieu Baerts
> <matthieu.baerts@tessares.net> wrote:
> > 
> > From: Paolo Abeni <pabeni@redhat.com>
> > 
> > Currently there are no data transfer counters accounting for all
> > the subflows used by a given MPTCP socket. The user-space can compute
> > such figures aggregating the subflow info, but that is inaccurate
> > if any subflow is closed before the MPTCP socket itself.
> > 
> > Add the new counters in the MPTCP socket itself and expose them
> > via the existing diag and sockopt. While touching mptcp_diag_fill_info(),
> > acquire the relevant locks before fetching the msk data, to ensure
> > better data consistency
> > 
> > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/385
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> > Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
> > ---
> >  include/uapi/linux/mptcp.h |  5 +++++
> >  net/mptcp/options.c        | 10 ++++++++--
> >  net/mptcp/protocol.c       | 11 ++++++++++-
> >  net/mptcp/protocol.h       |  4 ++++
> >  net/mptcp/sockopt.c        | 25 ++++++++++++++++++++-----
> >  5 files changed, 47 insertions(+), 8 deletions(-)
> > 
> > diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> > index 32af2d278cb4..a124be6ebbba 100644
> > --- a/include/uapi/linux/mptcp.h
> > +++ b/include/uapi/linux/mptcp.h
> > @@ -123,6 +123,11 @@ struct mptcp_info {
> >         __u8    mptcpi_local_addr_used;
> >         __u8    mptcpi_local_addr_max;
> >         __u8    mptcpi_csum_enabled;
> > +       __u32   mptcpi_retransmits;
> > +       __u64   mptcpi_bytes_retrans;
> > +       __u64   mptcpi_bytes_sent;
> > +       __u64   mptcpi_bytes_received;
> > +       __u64   mptcpi_bytes_acked;
> >  };
> > 
> >  /*
> > diff --git a/net/mptcp/options.c b/net/mptcp/options.c
> > index 4bdcd2b326bd..c254accb14de 100644
> > --- a/net/mptcp/options.c
> > +++ b/net/mptcp/options.c
> > @@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
> >         return cur_seq;
> >  }
> > 
> > +static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
> > +{
> > +       msk->bytes_acked += new_snd_una - msk->snd_una;
> > +       msk->snd_una = new_snd_una;
> > +}
> > +
> >  static void ack_update_msk(struct mptcp_sock *msk,
> >                            struct sock *ssk,
> >                            struct mptcp_options_received *mp_opt)
> > @@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
> >                 __mptcp_check_push(sk, ssk);
> > 
> >         if (after64(new_snd_una, old_snd_una)) {
> > -               msk->snd_una = new_snd_una;
> > +               __mptcp_snd_una_update(msk, new_snd_una);
> >                 __mptcp_data_acked(sk);
> >         }
> >         mptcp_data_unlock(sk);
> > @@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
> >                 /* on fallback we just need to ignore the msk-level snd_una, as
> >                  * this is really plain TCP
> >                  */
> > -               msk->snd_una = READ_ONCE(msk->snd_nxt);
> > +               __mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
> > 
> >                 __mptcp_data_acked(subflow->conn);
> >                 mptcp_data_unlock(subflow->conn);
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 9c756d675d4d..d5b8e488bce1 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -377,6 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > 
> >         if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
> >                 /* in sequence */
> > +               msk->bytes_received += copy_len;
> >                 WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
> >                 tail = skb_peek_tail(&sk->sk_receive_queue);
> >                 if (tail && mptcp_try_coalesce(sk, tail, skb))
> > @@ -760,6 +761,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
> >                         MPTCP_SKB_CB(skb)->map_seq += delta;
> >                         __skb_queue_tail(&sk->sk_receive_queue, skb);
> >                 }
> > +               msk->bytes_received += end_seq - msk->ack_seq;
> >                 msk->ack_seq = end_seq;
> >                 moved = true;
> >         }
> > @@ -1531,8 +1533,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
> >          * that has been handed to the subflow for transmission
> >          * and skip update in case it was old dfrag.
> >          */
> > -       if (likely(after64(snd_nxt_new, msk->snd_nxt)))
> > +       if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
> > +               msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
> >                 msk->snd_nxt = snd_nxt_new;
> > +       }
> >  }
> > 
> >  void mptcp_check_and_set_pending(struct sock *sk)
> > @@ -2590,6 +2594,7 @@ static void __mptcp_retrans(struct sock *sk)
> >         }
> >         if (copied) {
> >                 dfrag->already_sent = max(dfrag->already_sent, info.sent);
> > +               msk->bytes_retrans += copied;
> >                 tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
> >                          info.size_goal);
> >                 WRITE_ONCE(msk->allow_infinite_fallback, false);
> > @@ -3102,6 +3107,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
> >         WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
> >         mptcp_pm_data_reset(msk);
> >         mptcp_ca_reset(sk);
> > +       msk->bytes_acked = 0;
> > +       msk->bytes_received = 0;
> > +       msk->bytes_sent = 0;
> > +       msk->bytes_retrans = 0;
> > 
> >         WRITE_ONCE(sk->sk_shutdown, 0);
> >         sk_error_report(sk);
> > diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> > index 47b46602870e..27adfcc5aaa2 100644
> > --- a/net/mptcp/protocol.h
> > +++ b/net/mptcp/protocol.h
> > @@ -262,10 +262,13 @@ struct mptcp_sock {
> >         u64             local_key;
> >         u64             remote_key;
> >         u64             write_seq;
> > +       u64             bytes_sent;
> >         u64             snd_nxt;
> > +       u64             bytes_received;
> >         u64             ack_seq;
> >         atomic64_t      rcv_wnd_sent;
> >         u64             rcv_data_fin_seq;
> > +       u64             bytes_retrans;
> >         int             rmem_fwd_alloc;
> >         struct sock     *last_snd;
> >         int             snd_burst;
> > @@ -274,6 +277,7 @@ struct mptcp_sock {
> >                                                  * recovery related fields are under data_lock
> >                                                  * protection
> >                                                  */
> > +       u64             bytes_acked;
> >         u64             snd_una;
> >         u64             wnd_end;
> >         unsigned long   timer_ival;
> > diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> > index e172a5848b0d..fa5055d5b029 100644
> > --- a/net/mptcp/sockopt.c
> > +++ b/net/mptcp/sockopt.c
> > @@ -889,7 +889,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
> > 
> >  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> >  {
> > +       struct sock *sk = (struct sock *)msk;
> >         u32 flags = 0;
> > +       bool slow;
> > 
> >         memset(info, 0, sizeof(*info));
> > 
> > @@ -898,6 +900,9 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> >         info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
> >         info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
> > 
> > +       if (inet_sk_state_load(sk) == TCP_LISTEN)
> > +               return;
> > +
> >         /* The following limits only make sense for the in-kernel PM */
> >         if (mptcp_pm_is_kernel(msk)) {
> >                 info->mptcpi_subflows_max =
> > @@ -915,11 +920,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> >         if (READ_ONCE(msk->can_ack))
> >                 flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
> >         info->mptcpi_flags = flags;
> > -       info->mptcpi_token = READ_ONCE(msk->token);
> > -       info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> > -       info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
> > -       info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> > -       info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
> > +       mptcp_data_lock(sk);
> > +       info->mptcpi_snd_una = msk->snd_una;
> > +       info->mptcpi_rcv_nxt = msk->ack_seq;
> > +       info->mptcpi_bytes_acked = msk->bytes_acked;
> > +       mptcp_data_unlock(sk);
> > +
> > +       slow = lock_sock_fast(sk);
> 
> This causes a lockdep issue.
> 
> lock_sock_fast(sk) could sleep, if socket lock is owned by another process.
> 
> But we are called from a context where both a spin lock and
> rcu_read_lock() are held.
> 
> BUG: sleeping function called from invalid context at net/core/sock.c:3549
> in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 316, name: syz-executor.4
> preempt_count: 1, expected: 0
> RCU nest depth: 1, expected: 0
> 7 locks held by syz-executor.4/316:
> #0: ffffffff8e125408 (sock_diag_mutex){+.+.}-{3:3}, at:
> sock_diag_rcv+0x1b/0x40 net/core/sock_diag.c:279
> #1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
> sock_diag_rcv_msg net/core/sock_diag.c:259 [inline]
> #1: ffffffff8e125588 (sock_diag_table_mutex){+.+.}-{3:3}, at:
> sock_diag_rcv_msg+0x2d2/0x440 net/core/sock_diag.c:248
> #2: ffff8880232bb688 (nlk_cb_mutex-SOCK_DIAG){+.+.}-{3:3}, at:
> netlink_dump+0xbe/0xc50 net/netlink/af_netlink.c:2215
> #3: ffffffff8e29a628 (inet_diag_table_mutex){+.+.}-{3:3}, at:
> inet_diag_lock_handler+0x6e/0x100 net/ipv4/inet_diag.c:63
> #4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
> mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:95 [inline]
> #4: ffffffff8c7990c0 (rcu_read_lock){....}-{1:2}, at:
> mptcp_diag_dump+0x7c8/0x1330 net/mptcp/mptcp_diag.c:197
> #5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at: spin_lock
> include/linux/spinlock.h:350 [inline]
> #5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
> mptcp_diag_dump_listeners net/mptcp/mptcp_diag.c:98 [inline]
> #5: ffffc90001316bf0 (&h->lhash2[i].lock){+.+.}-{2:2}, at:
> mptcp_diag_dump+0x838/0x1330 net/mptcp/mptcp_diag.c:197
> #6: ffff88802c42a5f0 (msk_lock-AF_INET){+.+.}-{0:0}, at:
> mptcp_diag_get_info+0x1ae/0x380 net/mptcp/mptcp_diag.c:224
> Preemption disabled at:
> [<0000000000000000>] 0x0

Thank you for the report.

out-of-order patches here. "mptcp: track some aggregate data counters"
should have landed to net-next only after:

57fc0f1ceaa4 mptcp: ensure listener is unhashed before updating the sk
status

The latter should explicitly avoid the critical scenario above. Anyhow
the current net-next tree (after merging back net) should be ok (at
least I can't repro the issue here).

Thanks,

Paolo
diff mbox series

Patch

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 32af2d278cb4..a124be6ebbba 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -123,6 +123,11 @@  struct mptcp_info {
 	__u8	mptcpi_local_addr_used;
 	__u8	mptcpi_local_addr_max;
 	__u8	mptcpi_csum_enabled;
+	__u32	mptcpi_retransmits;
+	__u64	mptcpi_bytes_retrans;
+	__u64	mptcpi_bytes_sent;
+	__u64	mptcpi_bytes_received;
+	__u64	mptcpi_bytes_acked;
 };
 
 /*
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 4bdcd2b326bd..c254accb14de 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1026,6 +1026,12 @@  u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
 	return cur_seq;
 }
 
+static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
+{
+	msk->bytes_acked += new_snd_una - msk->snd_una;
+	msk->snd_una = new_snd_una;
+}
+
 static void ack_update_msk(struct mptcp_sock *msk,
 			   struct sock *ssk,
 			   struct mptcp_options_received *mp_opt)
@@ -1057,7 +1063,7 @@  static void ack_update_msk(struct mptcp_sock *msk,
 		__mptcp_check_push(sk, ssk);
 
 	if (after64(new_snd_una, old_snd_una)) {
-		msk->snd_una = new_snd_una;
+		__mptcp_snd_una_update(msk, new_snd_una);
 		__mptcp_data_acked(sk);
 	}
 	mptcp_data_unlock(sk);
@@ -1123,7 +1129,7 @@  bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		/* on fallback we just need to ignore the msk-level snd_una, as
 		 * this is really plain TCP
 		 */
-		msk->snd_una = READ_ONCE(msk->snd_nxt);
+		__mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
 
 		__mptcp_data_acked(subflow->conn);
 		mptcp_data_unlock(subflow->conn);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 9c756d675d4d..d5b8e488bce1 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -377,6 +377,7 @@  static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 
 	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
 		/* in sequence */
+		msk->bytes_received += copy_len;
 		WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		if (tail && mptcp_try_coalesce(sk, tail, skb))
@@ -760,6 +761,7 @@  static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 			MPTCP_SKB_CB(skb)->map_seq += delta;
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
 		}
+		msk->bytes_received += end_seq - msk->ack_seq;
 		msk->ack_seq = end_seq;
 		moved = true;
 	}
@@ -1531,8 +1533,10 @@  static void mptcp_update_post_push(struct mptcp_sock *msk,
 	 * that has been handed to the subflow for transmission
 	 * and skip update in case it was old dfrag.
 	 */
-	if (likely(after64(snd_nxt_new, msk->snd_nxt)))
+	if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
+		msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
 		msk->snd_nxt = snd_nxt_new;
+	}
 }
 
 void mptcp_check_and_set_pending(struct sock *sk)
@@ -2590,6 +2594,7 @@  static void __mptcp_retrans(struct sock *sk)
 	}
 	if (copied) {
 		dfrag->already_sent = max(dfrag->already_sent, info.sent);
+		msk->bytes_retrans += copied;
 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
 			 info.size_goal);
 		WRITE_ONCE(msk->allow_infinite_fallback, false);
@@ -3102,6 +3107,10 @@  static int mptcp_disconnect(struct sock *sk, int flags)
 	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
 	mptcp_pm_data_reset(msk);
 	mptcp_ca_reset(sk);
+	msk->bytes_acked = 0;
+	msk->bytes_received = 0;
+	msk->bytes_sent = 0;
+	msk->bytes_retrans = 0;
 
 	WRITE_ONCE(sk->sk_shutdown, 0);
 	sk_error_report(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 47b46602870e..27adfcc5aaa2 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -262,10 +262,13 @@  struct mptcp_sock {
 	u64		local_key;
 	u64		remote_key;
 	u64		write_seq;
+	u64		bytes_sent;
 	u64		snd_nxt;
+	u64		bytes_received;
 	u64		ack_seq;
 	atomic64_t	rcv_wnd_sent;
 	u64		rcv_data_fin_seq;
+	u64		bytes_retrans;
 	int		rmem_fwd_alloc;
 	struct sock	*last_snd;
 	int		snd_burst;
@@ -274,6 +277,7 @@  struct mptcp_sock {
 						 * recovery related fields are under data_lock
 						 * protection
 						 */
+	u64		bytes_acked;
 	u64		snd_una;
 	u64		wnd_end;
 	unsigned long	timer_ival;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index e172a5848b0d..fa5055d5b029 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -889,7 +889,9 @@  static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
 
 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 {
+	struct sock *sk = (struct sock *)msk;
 	u32 flags = 0;
+	bool slow;
 
 	memset(info, 0, sizeof(*info));
 
@@ -898,6 +900,9 @@  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
 	info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
 
+	if (inet_sk_state_load(sk) == TCP_LISTEN)
+		return;
+
 	/* The following limits only make sense for the in-kernel PM */
 	if (mptcp_pm_is_kernel(msk)) {
 		info->mptcpi_subflows_max =
@@ -915,11 +920,21 @@  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	if (READ_ONCE(msk->can_ack))
 		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
 	info->mptcpi_flags = flags;
-	info->mptcpi_token = READ_ONCE(msk->token);
-	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
-	info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
-	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
-	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
+	mptcp_data_lock(sk);
+	info->mptcpi_snd_una = msk->snd_una;
+	info->mptcpi_rcv_nxt = msk->ack_seq;
+	info->mptcpi_bytes_acked = msk->bytes_acked;
+	mptcp_data_unlock(sk);
+
+	slow = lock_sock_fast(sk);
+	info->mptcpi_csum_enabled = msk->csum_enabled;
+	info->mptcpi_token = msk->token;
+	info->mptcpi_write_seq = msk->write_seq;
+	info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
+	info->mptcpi_bytes_sent = msk->bytes_sent;
+	info->mptcpi_bytes_received = msk->bytes_received;
+	info->mptcpi_bytes_retrans = msk->bytes_retrans;
+	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);