diff mbox series

[net-next] net: introduce per netns packet chains

Message ID b931a4c9b78e282c143ab9455d4c65faa5f6de1c.1742228617.git.pabeni@redhat.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] net: introduce per netns packet chains | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 38 this patch: 38
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 2 maintainers not CCed: kuniyu@amazon.com dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 73 this patch: 73
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4243 this patch: 4243
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 88 this patch: 88
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni March 17, 2025, 4:37 p.m. UTC
Currently network taps unbound to any interface are linked in the
global ptype_all list, affecting the performance in all the network
namespaces.

Add per netns ptypes chains, so that in the mentioned case only
the netns owning the packet socket(s) is affected.

While at that drop the global ptype_all list: no in kernel user
registers a tap on "any" type without specifying either the target
device or the target namespace (and IMHO doing that would not make
any sense).

In dev_queue_xmit_nit() we can remove a redundant RCU read lock: such
function is called only by dev_hard_start_xmit(), which is ensured
to run under RCU protection.

Note that this adds a conditional in the fast path (to check for
per netns ptype_specific list) and increases the dataset size by
a cacheline (owing the per netns lists).

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
rfc -> v1
 - fix procfs dump
 - fix dev->ptype_specific -> dev->ptype_all type in ptype_head()
 - dev_net() -> dev_net_rcu
 - add dev_nit_active_rcu  variant
 - ptype specific netns deliver uses dev_net_rcu(skb->dev)) instead
   of dev_net(orig_dev)
---
 include/linux/netdevice.h   | 12 ++++++++-
 include/net/hotdata.h       |  1 -
 include/net/net_namespace.h |  3 +++
 net/core/dev.c              | 53 ++++++++++++++++++++++++++-----------
 net/core/hotdata.c          |  1 -
 net/core/net-procfs.c       | 28 +++++++++++++++-----
 net/core/net_namespace.c    |  2 ++
 7 files changed, 75 insertions(+), 25 deletions(-)

Comments

Sabrina Dubroca March 18, 2025, 4:17 p.m. UTC | #1
The patch mostly looks ok to me, except:


2025-03-17, 17:37:42 +0100, Paolo Abeni wrote:
>  /**
> - * dev_nit_active - return true if any network interface taps are in use
> + * dev_nit_active_rcu - return true if any network interface taps are in use
> + *
> + * The caller must held the RCU lock

typo: s/held/hold/


>   *
>   * @dev: network device to check for the presence of taps
>   */
> -bool dev_nit_active(struct net_device *dev)
> +bool dev_nit_active_rcu(struct net_device *dev)
>  {
> -	return !list_empty(&net_hotdata.ptype_all) ||
> +	return !list_empty(&dev_net_rcu(dev)->ptype_all) ||
>  	       !list_empty(&dev->ptype_all);
>  }
> -EXPORT_SYMBOL_GPL(dev_nit_active);
> +EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
>  
>  /*
>   *	Support routine. Sends outgoing frames to any network
> @@ -2481,11 +2497,10 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
>  
>  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
>  {
> -	struct list_head *ptype_list = &net_hotdata.ptype_all;
> +	struct list_head *ptype_list = &dev_net_rcu(dev)->ptype_all;
>  	struct packet_type *ptype, *pt_prev = NULL;
>  	struct sk_buff *skb2 = NULL;
>  
> -	rcu_read_lock();

I can't convince myself that all callers of dev_queue_xmit_nit are
under RCU, specifically coming from drivers/net/wan/hdlc_x25.c
(x25_data_transmit). Maybe keep this rcu_read_lock here?
diff mbox series

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0dbfe069a6e38..067bf137db6ab 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4276,7 +4276,17 @@  static __always_inline int ____dev_forward_skb(struct net_device *dev,
 	return 0;
 }
 
-bool dev_nit_active(struct net_device *dev);
+bool dev_nit_active_rcu(struct net_device *dev);
+static inline bool dev_nit_active(struct net_device *dev)
+{
+	bool ret;
+
+	rcu_read_lock();
+	ret = dev_nit_active_rcu(dev);
+	rcu_read_unlock();
+	return ret;
+}
+
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 
 static inline void __dev_put(struct net_device *dev)
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 30e9570beb2af..fda94b2647ffa 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -23,7 +23,6 @@  struct net_hotdata {
 	struct net_offload	udpv6_offload;
 #endif
 	struct list_head	offload_base;
-	struct list_head	ptype_all;
 	struct kmem_cache	*skbuff_cache;
 	struct kmem_cache	*skbuff_fclone_cache;
 	struct kmem_cache	*skb_small_head_cache;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f467a66abc6b1..bd57d8fb54f14 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -83,6 +83,9 @@  struct net {
 	struct llist_node	defer_free_list;
 	struct llist_node	cleanup_list;	/* namespaces on death row */
 
+	struct list_head ptype_all;
+	struct list_head ptype_specific;
+
 #ifdef CONFIG_KEYS
 	struct key_tag		*key_domain;	/* Key domain of operation tag */
 #endif
diff --git a/net/core/dev.c b/net/core/dev.c
index 6fa6ed5b57987..640c5758f8851 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -572,11 +572,19 @@  static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
-	if (pt->type == htons(ETH_P_ALL))
-		return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
-	else
-		return pt->dev ? &pt->dev->ptype_specific :
-				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	if (pt->type == htons(ETH_P_ALL)) {
+		if (!pt->af_packet_net && !pt->dev)
+			return NULL;
+
+		return pt->dev ? &pt->dev->ptype_all :
+				 &pt->af_packet_net->ptype_all;
+	}
+
+	if (pt->dev)
+		return &pt->dev->ptype_specific;
+
+	return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
+				   &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 }
 
 /**
@@ -596,6 +604,9 @@  void dev_add_pack(struct packet_type *pt)
 {
 	struct list_head *head = ptype_head(pt);
 
+	if (WARN_ON_ONCE(!head))
+		return;
+
 	spin_lock(&ptype_lock);
 	list_add_rcu(&pt->list, head);
 	spin_unlock(&ptype_lock);
@@ -620,6 +631,9 @@  void __dev_remove_pack(struct packet_type *pt)
 	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
 
+	if (!head)
+		return;
+
 	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
@@ -2463,16 +2477,18 @@  static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 }
 
 /**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must held the RCU lock
  *
  * @dev: network device to check for the presence of taps
  */
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(struct net_device *dev)
 {
-	return !list_empty(&net_hotdata.ptype_all) ||
+	return !list_empty(&dev_net_rcu(dev)->ptype_all) ||
 	       !list_empty(&dev->ptype_all);
 }
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
 
 /*
  *	Support routine. Sends outgoing frames to any network
@@ -2481,11 +2497,10 @@  EXPORT_SYMBOL_GPL(dev_nit_active);
 
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct list_head *ptype_list = &net_hotdata.ptype_all;
+	struct list_head *ptype_list = &dev_net_rcu(dev)->ptype_all;
 	struct packet_type *ptype, *pt_prev = NULL;
 	struct sk_buff *skb2 = NULL;
 
-	rcu_read_lock();
 again:
 	list_for_each_entry_rcu(ptype, ptype_list, list) {
 		if (READ_ONCE(ptype->ignore_outgoing))
@@ -2529,7 +2544,7 @@  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		pt_prev = ptype;
 	}
 
-	if (ptype_list == &net_hotdata.ptype_all) {
+	if (ptype_list == &dev_net_rcu(dev)->ptype_all) {
 		ptype_list = &dev->ptype_all;
 		goto again;
 	}
@@ -2540,7 +2555,6 @@  void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		else
 			kfree_skb(skb2);
 	}
-	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
 
@@ -3774,7 +3788,7 @@  static int xmit_one(struct sk_buff *skb, struct net_device *dev,
 	unsigned int len;
 	int rc;
 
-	if (dev_nit_active(dev))
+	if (dev_nit_active_rcu(dev))
 		dev_queue_xmit_nit(skb, dev);
 
 	len = skb->len;
@@ -5718,7 +5732,8 @@  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 	if (pfmemalloc)
 		goto skip_taps;
 
-	list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
+	list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+				list) {
 		if (pt_prev)
 			ret = deliver_skb(skb, pt_prev, orig_dev);
 		pt_prev = ptype;
@@ -5830,6 +5845,14 @@  static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
 				       &ptype_base[ntohs(type) &
 						   PTYPE_HASH_MASK]);
+
+		/* The only per net ptype user - packet socket - matches
+		 * the target netns vs dev_net(skb->dev); we need to
+		 * process only such netns even when orig_dev lays in a
+		 * different one.
+		 */
+		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+				       &dev_net_rcu(skb->dev)->ptype_specific);
 	}
 
 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index d0aaaaa556f22..0bc893d5f07b0 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -7,7 +7,6 @@ 
 
 struct net_hotdata net_hotdata __cacheline_aligned = {
 	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
-	.ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
 	.gro_normal_batch = 8,
 
 	.netdev_budget = 300,
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index fa6d3969734a6..3e92bf0f9060b 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -185,7 +185,13 @@  static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
 		}
 	}
 
-	list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
+	list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+		if (i == pos)
+			return pt;
+		++i;
+	}
+
+	list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
 		if (i == pos)
 			return pt;
 		++i;
@@ -210,6 +216,7 @@  static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct net *net = seq_file_net(seq);
 	struct net_device *dev;
 	struct packet_type *pt;
 	struct list_head *nxt;
@@ -232,15 +239,22 @@  static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 				goto found;
 			}
 		}
-
-		nxt = net_hotdata.ptype_all.next;
-		goto ptype_all;
+		nxt = net->ptype_all.next;
+		goto net_ptype_all;
 	}
 
-	if (pt->type == htons(ETH_P_ALL)) {
-ptype_all:
-		if (nxt != &net_hotdata.ptype_all)
+	if (pt->af_packet_net) {
+net_ptype_all:
+		if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
 			goto found;
+
+		if (nxt == &net->ptype_all) {
+			/* continue with ->ptype_specific if it's not empty */
+			nxt = net->ptype_specific.next;
+			if (nxt != &net->ptype_specific)
+				goto found;
+		}
+
 		hash = 0;
 		nxt = ptype_base[0].next;
 	} else
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 4303f2a492624..b0dfdf791ece5 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -340,6 +340,8 @@  static __net_init void preinit_net(struct net *net, struct user_namespace *user_
 	lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
 #endif
 
+	INIT_LIST_HEAD(&net->ptype_all);
+	INIT_LIST_HEAD(&net->ptype_specific);
 	preinit_net_sysctl(net);
 }