diff mbox series

[net-next] net: add a refcount tracker for kernel sockets

Message ID 20221020232018.3333414-1-edumazet@google.com (mailing list archive)
State Accepted
Commit 0cafd77dcd032d1687efaba5598cf07bce85997f
Delegated to: Netdev Maintainers
Headers show
Series [net-next] net: add a refcount tracker for kernel sockets | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Single patches do not need cover letters
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 4540 this patch: 4540
netdev/cc_maintainers warning 7 maintainers not CCed: rds-devel@oss.oracle.com linux-rdma@vger.kernel.org shakeelb@google.com roman.gushchin@linux.dev vasily.averin@linux.dev martin.lau@kernel.org santosh.shilimkar@oracle.com
netdev/build_clang success Errors and warnings before: 1106 this patch: 1106
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4723 this patch: 4723
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: please, no spaces at the start of a line
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Eric Dumazet Oct. 20, 2022, 11:20 p.m. UTC
Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock")
added a tracker to sockets, but did not track kernel sockets.

We still have syzbot reports hinting about netns being destroyed
while some kernel TCP sockets had not been dismantled.

This patch tracks kernel sockets, and adds a ref_tracker_dir_print()
call to net_free() right before the netns is freed.

Normally, each layer is responsible for properly releasing its
kernel sockets before last call to net_free().

This debugging facility is enabled with CONFIG_NET_NS_REFCNT_TRACKER=y

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/net_namespace.h | 30 ++++++++++++++++++++++--------
 net/core/net_namespace.c    |  5 +++++
 net/core/sock.c             | 14 ++++++++++++++
 net/netlink/af_netlink.c    | 11 +++++++++++
 net/rds/tcp.c               |  3 +++
 5 files changed, 55 insertions(+), 8 deletions(-)

Comments

Kuniyuki Iwashima Oct. 21, 2022, 5:01 p.m. UTC | #1
Date:   Thu, 20 Oct 2022 23:20:18 +0000
From:   Eric Dumazet <edumazet@google.com>
> Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock")
> added a tracker to sockets, but did not track kernel sockets.
> 
> We still have syzbot reports hinting about netns being destroyed
> while some kernel TCP sockets had not been dismantled.
> 
> This patch tracks kernel sockets, and adds a ref_tracker_dir_print()
> call to net_free() right before the netns is freed.
> 
> Normally, each layer is responsible for properly releasing its
> kernel sockets before last call to net_free().
> 
> This debugging facility is enabled with CONFIG_NET_NS_REFCNT_TRACKER=y
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Tested-by: Kuniyuki Iwashima <kuniyu@amazon.com>

Thanks for the patch!
I confirmed it worked with a buggy module :)

---8<---
$ head -n 100 *
==> kern_sk.c <==
#include <linux/module.h>
#include <net/tcp.h>

MODULE_LICENSE("GPL");

struct socket *sock = NULL;

static int __init init_kern_sk(void)
{
	struct net *net = current->nsproxy->net_ns;
	int ret;

	ret = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);

	return ret;
}

static void __exit exit_kern_sk(void)
{
	sock_release(sock);
}


module_init(init_kern_sk);
module_exit(exit_kern_sk);

==> Makefile <==
obj-m := kern_sk.o
SRC := /mnt/ec2-user/kernel/kern_sk_reftracker
PWD := $(shell pwd)

default:
	$(MAKE) -C $(SRC) M=$(PWD) modules

clean:
	$(MAKE) -C $(SRC) M=$(PWD) clean

---8<---

---8<---
[root@localhost ~]# unshare -n insmod ./kern_sk.ko
[   22.650224] kern_sk: loading out-of-tree module taints kernel.
[root@localhost ~]# [   22.693636] leaked reference.
[   22.693836]  sk_alloc+0x1f3/0x210
[   22.694009]  inet_create+0xca/0x370
[   22.694194]  __sock_create+0x106/0x1c0
[   22.694388]  do_one_initcall+0x3c/0x1f0
[   22.694588]  do_init_module+0x46/0x1c0
[   22.694781]  __do_sys_finit_module+0xa6/0x100
[   22.695000]  do_syscall_64+0x38/0x90
[   22.695183]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   22.695470] ------------[ cut here ]------------
[   22.695709] WARNING: CPU: 2 PID: 58 at lib/ref_tracker.c:39 ref_tracker_dir_exit.cold+0x62/0x6e
[   22.696142] Modules linked in: kern_sk(O)
[   22.696344] CPU: 2 PID: 58 Comm: kworker/u8:2 Tainted: G           O       6.0.0-11828-g86ae4a5d11bc #4
[   22.696811] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014
[   22.697285] Workqueue: netns cleanup_net
[   22.697487] RIP: 0010:ref_tracker_dir_exit.cold+0x62/0x6e
[   22.697767] Code: 00 00 00 00 ad de 48 89 45 08 4c 89 6d 00 4c 89 e5 e8 7a f0 45 ff 49 8b 04 24 49 39 dc 75 12 4c 89 f6 4c 89 ff e8 16 18 05 00 <0f> 0b e9 23 94 7e ff 49 89 c4 eb 9c 48 c7 c7 80 f1 58 82 48 89 04
[   22.698671] RSP: 0018:ffffc90000a1fe08 EFLAGS: 00010246
[   22.698931] RAX: 0000000000000000 RBX: ffff888107838ea0 RCX: 0000000000000000
[   22.699285] RDX: 0000000000000001 RSI: 0000000000000282 RDI: 00000000ffffffff
[   22.699650] RBP: ffff888107838ea0 R08: 0000000000000001 R09: ffffffff81d65800
[   22.700000] R10: ffffffff82856080 R11: ffffffff82906080 R12: ffff888107838ea0
[   22.700351] R13: dead000000000100 R14: 0000000000000282 R15: ffff888107838e88
[   22.700710] FS:  0000000000000000(0000) GS:ffff88842fd00000(0000) knlGS:0000000000000000
[   22.701106] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   22.701391] CR2: 00005578b9031148 CR3: 0000000102358001 CR4: 0000000000770ee0
[   22.701760] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   22.702108] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   22.702456] PKRU: 55555554
[   22.702613] Call Trace:
[   22.702750]  <TASK>
[   22.702890]  net_free+0x34/0x50
[   22.703077]  cleanup_net+0x2cd/0x330
[   22.703260]  process_one_work+0x1d4/0x3a0
[   22.703466]  worker_thread+0x48/0x3c0
[   22.703669]  ? process_one_work+0x3a0/0x3a0
[   22.703881]  kthread+0xe0/0x110
[   22.704049]  ? kthread_complete_and_exit+0x20/0x20
[   22.704287]  ret_from_fork+0x1f/0x30
[   22.704472]  </TASK>
[   22.704640] ---[ end trace 0000000000000000 ]---
---8<---
patchwork-bot+netdevbpf@kernel.org Oct. 24, 2022, 10:30 a.m. UTC | #2
Hello:

This patch was applied to netdev/net-next.git (master)
by David S. Miller <davem@davemloft.net>:

On Thu, 20 Oct 2022 23:20:18 +0000 you wrote:
> Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock")
> added a tracker to sockets, but did not track kernel sockets.
> 
> We still have syzbot reports hinting about netns being destroyed
> while some kernel TCP sockets had not been dismantled.
> 
> This patch tracks kernel sockets, and adds a ref_tracker_dir_print()
> call to net_free() right before the netns is freed.
> 
> [...]

Here is the summary with links:
  - [net-next] net: add a refcount tracker for kernel sockets
    https://git.kernel.org/netdev/net-next/c/0cafd77dcd03

You are awesome, thank you!
diff mbox series

Patch

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 8c3587d5c308f048a3fbd681125f40ec86a00eb4..78beaa765c733665c035e99c5923334740a18770 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -92,7 +92,9 @@  struct net {
 
 	struct ns_common	ns;
 	struct ref_tracker_dir  refcnt_tracker;
-
+	struct ref_tracker_dir  notrefcnt_tracker; /* tracker for objects not
+						    * refcounted against netns
+						    */
 	struct list_head 	dev_base_head;
 	struct proc_dir_entry 	*proc_net;
 	struct proc_dir_entry 	*proc_net_stat;
@@ -320,19 +322,31 @@  static inline int check_net(const struct net *net)
 #endif
 
 
-static inline void netns_tracker_alloc(struct net *net,
-				       netns_tracker *tracker, gfp_t gfp)
+static inline void __netns_tracker_alloc(struct net *net,
+					 netns_tracker *tracker,
+					 bool refcounted,
+					 gfp_t gfp)
 {
 #ifdef CONFIG_NET_NS_REFCNT_TRACKER
-	ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp);
+	ref_tracker_alloc(refcounted ? &net->refcnt_tracker :
+				       &net->notrefcnt_tracker,
+			  tracker, gfp);
 #endif
 }
 
-static inline void netns_tracker_free(struct net *net,
-				      netns_tracker *tracker)
+static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker,
+				       gfp_t gfp)
+{
+	__netns_tracker_alloc(net, tracker, true, gfp);
+}
+
+static inline void __netns_tracker_free(struct net *net,
+					netns_tracker *tracker,
+					bool refcounted)
 {
 #ifdef CONFIG_NET_NS_REFCNT_TRACKER
-       ref_tracker_free(&net->refcnt_tracker, tracker);
+       ref_tracker_free(refcounted ? &net->refcnt_tracker :
+				     &net->notrefcnt_tracker, tracker);
 #endif
 }
 
@@ -346,7 +360,7 @@  static inline struct net *get_net_track(struct net *net,
 
 static inline void put_net_track(struct net *net, netns_tracker *tracker)
 {
-	netns_tracker_free(net, tracker);
+	__netns_tracker_free(net, tracker, true);
 	put_net(net);
 }
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 0ec2f5906a27c7f930e832835682d69a32e3c8e1..12c68edf768283e4f3ba74f0818618766c8f8d67 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -309,6 +309,7 @@  static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 
 	refcount_set(&net->ns.count, 1);
 	ref_tracker_dir_init(&net->refcnt_tracker, 128);
+	ref_tracker_dir_init(&net->notrefcnt_tracker, 128);
 
 	refcount_set(&net->passive, 1);
 	get_random_bytes(&net->hash_mix, sizeof(u32));
@@ -429,6 +430,10 @@  static void net_free(struct net *net)
 {
 	if (refcount_dec_and_test(&net->passive)) {
 		kfree(rcu_access_pointer(net->gen));
+
+		/* There should not be any trackers left there. */
+		ref_tracker_dir_exit(&net->notrefcnt_tracker);
+
 		kmem_cache_free(net_cachep, net);
 	}
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index a3ba0358c77c0e44db1cfbaeb420f8b80ad7cf98..aa608dc0930b51ced01af4ff59ae5430da84dd9a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2094,6 +2094,9 @@  struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		if (likely(sk->sk_net_refcnt)) {
 			get_net_track(net, &sk->ns_tracker, priority);
 			sock_inuse_add(net, 1);
+		} else {
+			__netns_tracker_alloc(net, &sk->ns_tracker,
+					      false, priority);
 		}
 
 		sock_net_set(sk, net);
@@ -2149,6 +2152,9 @@  static void __sk_destruct(struct rcu_head *head)
 
 	if (likely(sk->sk_net_refcnt))
 		put_net_track(sock_net(sk), &sk->ns_tracker);
+	else
+		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
+
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
@@ -2237,6 +2243,14 @@  struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 	if (likely(newsk->sk_net_refcnt)) {
 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
 		sock_inuse_add(sock_net(newsk), 1);
+	} else {
+		/* Kernel sockets are not elevating the struct net refcount.
+		 * Instead, use a tracker to more easily detect if a layer
+		 * is not properly dismantling its kernel sockets at netns
+		 * destroy time.
+		 */
+		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
+				      false, priority);
 	}
 	sk_node_init(&newsk->sk_node);
 	sock_lock_init(newsk);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a662e8a5ff84a658e51d165e3a6da1e182ad6b38..f0c94d394ab194193a6c67b15ae6e700a6ffdf2d 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -812,6 +812,17 @@  static int netlink_release(struct socket *sock)
 	}
 
 	sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
+
+	/* Because struct net might disappear soon, do not keep a pointer. */
+	if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
+		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
+		/* Because of deferred_put_nlk_sk and use of work queue,
+		 * it is possible  netns will be freed before this socket.
+		 */
+		sock_net_set(sk, &init_net);
+		__netns_tracker_alloc(&init_net, &sk->ns_tracker,
+				      false, GFP_KERNEL);
+	}
 	call_rcu(&nlk->rcu, deferred_put_nlk_sk);
 	return 0;
 }
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 4444fd82b66df7d54065893dc3d1aa02c3e2ef7c..c5b86066ff663f4250f2646b1eee80f9c7f30821 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -503,6 +503,9 @@  bool rds_tcp_tune(struct socket *sock)
 			release_sock(sk);
 			return false;
 		}
+		/* Update ns_tracker to current stack trace and refcounted tracker */
+		__netns_tracker_free(net, &sk->ns_tracker, false);
+
 		sk->sk_net_refcnt = 1;
 		netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL);
 		sock_inuse_add(net, 1);