diff mbox series

[v1,net,11/16] net: Fix a data-race around sysctl_mem.

Message ID 20220706052130.16368-12-kuniyu@amazon.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series sysctl: Fix data-races around ipv4_table. | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net, async
netdev/fixes_present success Fixes tag present in non-next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count fail Series longer than 15 patches (and no cover letter)
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2860 this patch: 2860
netdev/cc_maintainers fail 1 blamed authors not CCed: nhorman@tuxdriver.com; 8 maintainers not CCed: marcelo.leitner@gmail.com vyasevich@gmail.com mingo@redhat.com nhorman@tuxdriver.com linux-decnet-user@lists.sourceforge.net yoshfuji@linux-ipv6.org linux-sctp@vger.kernel.org dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 631 this patch: 631
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 2990 this patch: 2990
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 52 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Kuniyuki Iwashima July 6, 2022, 5:21 a.m. UTC
While reading .sysctl_mem, it can be changed concurrently.  So, we need to
add READ_ONCE().  Then we can set proc_doulongvec_minmax_lockless() as the
handler to mark it safe.

Fixes: 3847ce32aea9 ("core: add tracepoints for queueing skb to rcvbuf")
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
CC: Satoru Moriya <satoru.moriya@hds.com>
CC: Steven Rostedt <rostedt@goodmis.org>
---
 include/net/sock.h             | 2 +-
 include/trace/events/sock.h    | 6 +++---
 net/decnet/sysctl_net_decnet.c | 2 +-
 net/ipv4/sysctl_net_ipv4.c     | 4 ++--
 net/sctp/sysctl.c              | 2 +-
 5 files changed, 8 insertions(+), 8 deletions(-)

Comments

Steven Rostedt July 6, 2022, 1:17 p.m. UTC | #1
On Tue, 5 Jul 2022 22:21:25 -0700
Kuniyuki Iwashima <kuniyu@amazon.com> wrote:

> --- a/include/trace/events/sock.h
> +++ b/include/trace/events/sock.h
> @@ -122,9 +122,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
>  
>  	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
>  		__entry->name,
> -		__entry->sysctl_mem[0],
> -		__entry->sysctl_mem[1],
> -		__entry->sysctl_mem[2],
> +		READ_ONCE(__entry->sysctl_mem[0]),
> +		READ_ONCE(__entry->sysctl_mem[1]),
> +		READ_ONCE(__entry->sysctl_mem[2]),

This is not reading anything to do with sysctl. It's reading the content of
what was recorded in the ring buffer.

That is, the READ_ONCE() here is not necessary, and if anything will break
user space parsing, as this is exported to user space to tell it how to
read the binary format in the ring buffer.

-- Steve


>  		__entry->allocated,
>  		__entry->sysctl_rmem,
>  		__entry->rmem_alloc,
Steven Rostedt July 6, 2022, 1:27 p.m. UTC | #2
On Wed, 6 Jul 2022 09:17:07 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Tue, 5 Jul 2022 22:21:25 -0700
> Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> 
> > --- a/include/trace/events/sock.h
> > +++ b/include/trace/events/sock.h
> > @@ -122,9 +122,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
> >  
> >  	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
> >  		__entry->name,
> > -		__entry->sysctl_mem[0],
> > -		__entry->sysctl_mem[1],
> > -		__entry->sysctl_mem[2],
> > +		READ_ONCE(__entry->sysctl_mem[0]),
> > +		READ_ONCE(__entry->sysctl_mem[1]),
> > +		READ_ONCE(__entry->sysctl_mem[2]),  
> 
> This is not reading anything to do with sysctl. It's reading the content of
> what was recorded in the ring buffer.
> 
> That is, the READ_ONCE() here is not necessary, and if anything will break
> user space parsing, as this is exported to user space to tell it how to
> read the binary format in the ring buffer.

I take that back. Looking at the actual trace event, it is pointing to
sysctl memory, which is a major bug.

TRACE_EVENT(sock_exceed_buf_limit,

        TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

        TP_ARGS(sk, prot, allocated, kind),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long *, sysctl_mem)

sysctl_mem is a pointer.

                __field(long, allocated)
                __field(int, sysctl_rmem)
                __field(int, rmem_alloc)
                __field(int, sysctl_wmem)
                __field(int, wmem_alloc)
                __field(int, wmem_queued)
                __field(int, kind)
        ),

        TP_fast_assign(
                strncpy(__entry->name, prot->name, 32);

                __entry->sysctl_mem = prot->sysctl_mem;


They save the pointer **IN THE RING BUFFER**!!!

                __entry->allocated = allocated;
                __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
                __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
                __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
                __entry->kind = kind;
        ),

        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
                __entry->name,
                __entry->sysctl_mem[0],
                __entry->sysctl_mem[1],
                __entry->sysctl_mem[2],

They are now reading a stale pointer, which can be read at any time. That
is, you get the information of what is in sysctl_mem at the time the ring
buffer is read (which is useless from user space), and not at the time of
the event.

Thanks for pointing this out. This needs to be fixed.

-- Steve


                __entry->allocated,
                __entry->sysctl_rmem,
                __entry->rmem_alloc,
                __entry->sysctl_wmem,
                __entry->wmem_alloc,
                __entry->wmem_queued,
                show_skmem_kind_names(__entry->kind)
        )
Kuniyuki Iwashima July 6, 2022, 4:27 p.m. UTC | #3
From:   Steven Rostedt <rostedt@goodmis.org>
Date:   Wed, 6 Jul 2022 09:27:11 -0400
> On Wed, 6 Jul 2022 09:17:07 -0400
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > On Tue, 5 Jul 2022 22:21:25 -0700
> > Kuniyuki Iwashima <kuniyu@amazon.com> wrote:
> > 
> > > --- a/include/trace/events/sock.h
> > > +++ b/include/trace/events/sock.h
> > > @@ -122,9 +122,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
> > >  
> > >  	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
> > >  		__entry->name,
> > > -		__entry->sysctl_mem[0],
> > > -		__entry->sysctl_mem[1],
> > > -		__entry->sysctl_mem[2],
> > > +		READ_ONCE(__entry->sysctl_mem[0]),
> > > +		READ_ONCE(__entry->sysctl_mem[1]),
> > > +		READ_ONCE(__entry->sysctl_mem[2]),  
> > 
> > This is not reading anything to do with sysctl. It's reading the content of
> > what was recorded in the ring buffer.
> > 
> > That is, the READ_ONCE() here is not necessary, and if anything will break
> > user space parsing, as this is exported to user space to tell it how to
> > read the binary format in the ring buffer.
> 
> I take that back. Looking at the actual trace event, it is pointing to
> sysctl memory, which is a major bug.
> 
> TRACE_EVENT(sock_exceed_buf_limit,
> 
>         TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),
> 
>         TP_ARGS(sk, prot, allocated, kind),
> 
>         TP_STRUCT__entry(
>                 __array(char, name, 32)
>                 __field(long *, sysctl_mem)
> 
> sysctl_mem is a pointer.
> 
>                 __field(long, allocated)
>                 __field(int, sysctl_rmem)
>                 __field(int, rmem_alloc)
>                 __field(int, sysctl_wmem)
>                 __field(int, wmem_alloc)
>                 __field(int, wmem_queued)
>                 __field(int, kind)
>         ),
> 
>         TP_fast_assign(
>                 strncpy(__entry->name, prot->name, 32);
> 
>                 __entry->sysctl_mem = prot->sysctl_mem;
> 
> 
> They save the pointer **IN THE RING BUFFER**!!!
> 
>                 __entry->allocated = allocated;
>                 __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
>                 __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
>                 __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
>                 __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
>                 __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
>                 __entry->kind = kind;
>         ),
> 
>         TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
>                 __entry->name,
>                 __entry->sysctl_mem[0],
>                 __entry->sysctl_mem[1],
>                 __entry->sysctl_mem[2],
> 
> They are now reading a stale pointer, which can be read at any time. That
> is, you get the information of what is in sysctl_mem at the time the ring
> buffer is read (which is useless from user space), and not at the time of
> the event.
> 
> Thanks for pointing this out. This needs to be fixed.

For the record, Steve fixed this properly here, so I'll drop the tracing
part in v2.
https://lore.kernel.org/netdev/20220706105040.54fc03b0@gandalf.local.home/
diff mbox series

Patch

diff --git a/include/net/sock.h b/include/net/sock.h
index 72ca97ccb460..9fa54762e077 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1529,7 +1529,7 @@  void __sk_mem_reclaim(struct sock *sk, int amount);
 /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */
 static inline long sk_prot_mem_limits(const struct sock *sk, int index)
 {
-	long val = sk->sk_prot->sysctl_mem[index];
+	long val = READ_ONCE(sk->sk_prot->sysctl_mem[index]);
 
 #if PAGE_SIZE > SK_MEM_QUANTUM
 	val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT;
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 12c315782766..3c36c2812782 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -122,9 +122,9 @@  TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
 		__entry->name,
-		__entry->sysctl_mem[0],
-		__entry->sysctl_mem[1],
-		__entry->sysctl_mem[2],
+		READ_ONCE(__entry->sysctl_mem[0]),
+		READ_ONCE(__entry->sysctl_mem[1]),
+		READ_ONCE(__entry->sysctl_mem[2]),
 		__entry->allocated,
 		__entry->sysctl_rmem,
 		__entry->rmem_alloc,
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 67b5ab2657b7..e7e658f1ba67 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -315,7 +315,7 @@  static struct ctl_table dn_table[] = {
 		.data = &sysctl_decnet_mem,
 		.maxlen = sizeof(sysctl_decnet_mem),
 		.mode = 0644,
-		.proc_handler = proc_doulongvec_minmax
+		.proc_handler = proc_doulongvec_minmax_lockless,
 	},
 	{
 		.procname = "decnet_rmem",
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index eea11218a663..b14931ca5c85 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -504,7 +504,7 @@  static struct ctl_table ipv4_table[] = {
 		.maxlen		= sizeof(sysctl_tcp_mem),
 		.data		= &sysctl_tcp_mem,
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax_lockless,
 	},
 	{
 		.procname	= "tcp_low_latency",
@@ -570,7 +570,7 @@  static struct ctl_table ipv4_table[] = {
 		.data		= &sysctl_udp_mem,
 		.maxlen		= sizeof(sysctl_udp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax_lockless,
 	},
 	{
 		.procname	= "fib_sync_mem",
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index b46a416787ec..fa79bf4059d1 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -64,7 +64,7 @@  static struct ctl_table sctp_table[] = {
 		.data		= &sysctl_sctp_mem,
 		.maxlen		= sizeof(sysctl_sctp_mem),
 		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax
+		.proc_handler	= proc_doulongvec_minmax_lockless,
 	},
 	{
 		.procname	= "sctp_rmem",