diff mbox series

[RFC] net: flush dst_cache on device removal

Message ID 13bccadd7dcc66283898cde11520918670e942db.1716202430.git.pabeni@redhat.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series [RFC] net: flush dst_cache on device removal | expand

Checks

Context Check Description
netdev/series_format warning Single patches do not need cover letters; Target tree name not specified in the subject
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1080 this patch: 1080
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 5 of 5 maintainers
netdev/build_clang success Errors and warnings before: 909 this patch: 909
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1108 this patch: 1108
netdev/checkpatch warning WARNING: line length of 87 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 4 this patch: 4
netdev/source_inline success Was 0 now: 0

Commit Message

Paolo Abeni May 20, 2024, 10:59 a.m. UTC
Eric reported that dst_cache don't cope correctly with device removal,
keeping the cached dst unmodified even when the underlining device is
deleted and the dst itself is not uncached.

The above causes the infamous 'unregistering netdevice' hangup.

Address the issue implementing explicit book-keeping of all the
initialized dst_caches. At network device unregistration time, traverse
them, looking for relevant dst and eventually replace the dst reference
with a blackhole one.

Use an xarray to store the dst_cache references, to avoid blocking the
BH during the traversal for a possibly unbounded time.

Reported-by: Eric Dumazet <edumazet@google.com>
Fixes: 911362c70df5 ("net: add dst_cache support")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
I can't reproduce the issue locally, I hope somebody able to observe it
could step-in and give this patch a shot.
---
 include/net/dst_cache.h |   1 +
 net/core/dst_cache.c    | 100 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 1 deletion(-)

Comments

Eric Dumazet May 20, 2024, 1:54 p.m. UTC | #1
On Mon, May 20, 2024 at 1:00 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> Eric reported that dst_cache don't cope correctly with device removal,
> keeping the cached dst unmodified even when the underlining device is
> deleted and the dst itself is not uncached.
>
> The above causes the infamous 'unregistering netdevice' hangup.
>
> Address the issue implementing explicit book-keeping of all the
> initialized dst_caches. At network device unregistration time, traverse
> them, looking for relevant dst and eventually replace the dst reference
> with a blackhole one.
>
> Use an xarray to store the dst_cache references, to avoid blocking the
> BH during the traversal for a possibly unbounded time.
>
> Reported-by: Eric Dumazet <edumazet@google.com>
> Fixes: 911362c70df5 ("net: add dst_cache support")
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> I can't reproduce the issue locally, I hope somebody able to observe it
> could step-in and give this patch a shot.
> ---

H Paolo, thanks for your patch.

It seems dst_cache_netdev_event() could spend an awful amount of cpu
in complex setups.

I wonder if we could instead reuse the existing uncached_list
mechanism we have already ?

BTW it seems we could get rid of the ul->quarantine lists.

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bbc2a0dd931429e7f8c68df0df48bce6d604fb56..964dd7cc37ca8c53ebbd1306adb47772c62a7d36
100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -130,7 +130,6 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
 struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
-       struct list_head        quarantine;
 };

 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
@@ -188,8 +187,7 @@ static void rt6_uncached_list_flush_dev(struct
net_device *dev)
                                handled = true;
                        }
                        if (handled)
-                               list_move(&rt->dst.rt_uncached,
-                                         &ul->quarantine);
+                               list_del_init(&rt->dst.rt_uncached);
                }
                spin_unlock_bh(&ul->lock);
        }
@@ -6754,7 +6752,6 @@ int __init ip6_route_init(void)
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
-               INIT_LIST_HEAD(&ul->quarantine);
                spin_lock_init(&ul->lock);
        }
Paolo Abeni May 20, 2024, 3:30 p.m. UTC | #2
On Mon, 2024-05-20 at 15:54 +0200, Eric Dumazet wrote:
> On Mon, May 20, 2024 at 1:00 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > 
> > Eric reported that dst_cache don't cope correctly with device removal,
> > keeping the cached dst unmodified even when the underlining device is
> > deleted and the dst itself is not uncached.
> > 
> > The above causes the infamous 'unregistering netdevice' hangup.
> > 
> > Address the issue implementing explicit book-keeping of all the
> > initialized dst_caches. At network device unregistration time, traverse
> > them, looking for relevant dst and eventually replace the dst reference
> > with a blackhole one.
> > 
> > Use an xarray to store the dst_cache references, to avoid blocking the
> > BH during the traversal for a possibly unbounded time.
> > 
> > Reported-by: Eric Dumazet <edumazet@google.com>
> > Fixes: 911362c70df5 ("net: add dst_cache support")
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > I can't reproduce the issue locally, I hope somebody able to observe it
> > could step-in and give this patch a shot.
> > ---
> 
> H Paolo, thanks for your patch.
> 
> It seems dst_cache_netdev_event() could spend an awful amount of cpu
> in complex setups.

I agree.

> I wonder if we could instead reuse the existing uncached_list
> mechanism we have already ?

Then rt_flush_dev()/fib_netdev_event() will use a similar amount of
time, right? Or do you mean something entirely different? 

On the plus side it will make this patch much smaller, dropping the
notifier.

> BTW it seems we could get rid of the ul->quarantine lists.

I think 'quarantine' is used to avoid traversing multiple times the
same/already blackholed entries when processing multiple
NETDEV_UNREGISTER events before the dst entries themself are freed.
Could it make a difference at netns disposal time with many dst and
devices in there?

Cheers,

Paolo
Eric Dumazet May 20, 2024, 3:34 p.m. UTC | #3
On Mon, May 20, 2024 at 5:30 PM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Mon, 2024-05-20 at 15:54 +0200, Eric Dumazet wrote:
> > On Mon, May 20, 2024 at 1:00 PM Paolo Abeni <pabeni@redhat.com> wrote:
> > >
> > > Eric reported that dst_cache don't cope correctly with device removal,
> > > keeping the cached dst unmodified even when the underlining device is
> > > deleted and the dst itself is not uncached.
> > >
> > > The above causes the infamous 'unregistering netdevice' hangup.
> > >
> > > Address the issue implementing explicit book-keeping of all the
> > > initialized dst_caches. At network device unregistration time, traverse
> > > them, looking for relevant dst and eventually replace the dst reference
> > > with a blackhole one.
> > >
> > > Use an xarray to store the dst_cache references, to avoid blocking the
> > > BH during the traversal for a possibly unbounded time.
> > >
> > > Reported-by: Eric Dumazet <edumazet@google.com>
> > > Fixes: 911362c70df5 ("net: add dst_cache support")
> > > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > > ---
> > > I can't reproduce the issue locally, I hope somebody able to observe it
> > > could step-in and give this patch a shot.
> > > ---
> >
> > H Paolo, thanks for your patch.
> >
> > It seems dst_cache_netdev_event() could spend an awful amount of cpu
> > in complex setups.
>
> I agree.
>
> > I wonder if we could instead reuse the existing uncached_list
> > mechanism we have already ?
>
> Then rt_flush_dev()/fib_netdev_event() will use a similar amount of
> time, right? Or do you mean something entirely different?
>
> On the plus side it will make this patch much smaller, dropping the
> notifier.
>
> > BTW it seems we could get rid of the ul->quarantine lists.
>
> I think 'quarantine' is used to avoid traversing multiple times the
> same/already blackholed entries when processing multiple
> NETDEV_UNREGISTER events before the dst entries themself are freed.

But if we list_del_init(&rt->dst.rt_uncached), each handled dst is not anymore
in a list.

ul->quarantine has no purpose (we never traverse it). Even kmemleak
does not need it, I think.

Note this is net-next material, just because I took a look after
seeing your patch.

> Could it make a difference at netns disposal time with many dst and
> devices in there?
>
> Cheers,
>
> Paolo
>
diff mbox series

Patch

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index b4a55d2d5e71..4d0302e143b4 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -11,6 +11,7 @@ 
 struct dst_cache {
 	struct dst_cache_pcpu __percpu *cache;
 	unsigned long reset_ts;
+	u32 id;
 };
 
 /**
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 6a0482e676d3..22921cc9d42c 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -7,6 +7,8 @@ 
 
 #include <linux/kernel.h>
 #include <linux/percpu.h>
+#include <linux/xarray.h>
+#include <linux/rcupdate_wait.h>
 #include <net/dst_cache.h>
 #include <net/route.h>
 #if IS_ENABLED(CONFIG_IPV6)
@@ -14,6 +16,13 @@ 
 #endif
 #include <uapi/linux/in.h>
 
+static DEFINE_XARRAY_FLAGS(dst_caches, XA_FLAGS_ALLOC);
+
+struct dst_cache_entry {
+	struct dst_cache_pcpu __percpu *cache;
+	struct rcu_head rcu;
+};
+
 struct dst_cache_pcpu {
 	unsigned long refresh_ts;
 	struct dst_entry *dst;
@@ -140,27 +149,60 @@  EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
 
 int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
 {
+	struct dst_cache_entry *entry;
+	int last_id, ret = -ENOMEM;
+
 	dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
 					    gfp | __GFP_ZERO);
 	if (!dst_cache->cache)
 		return -ENOMEM;
 
+	entry = kmalloc(sizeof(*entry), gfp | __GFP_ZERO);
+	if (!entry)
+		goto free_cache;
+
+	ret = xa_alloc_cyclic_bh(&dst_caches, &dst_cache->id, entry,
+				 xa_limit_32b, &last_id, gfp);
+	if (ret < 0)
+		goto free_entry;
+
+	entry->cache = dst_cache->cache;
 	dst_cache_reset(dst_cache);
 	return 0;
+
+free_entry:
+	kfree(entry);
+
+free_cache:
+	free_percpu(dst_cache->cache);
+	dst_cache->cache = NULL;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(dst_cache_init);
 
+static void dst_cache_entry_free(struct rcu_head *rcu)
+{
+	struct dst_cache_entry *entry = container_of(rcu, struct dst_cache_entry, rcu);
+
+	free_percpu(entry->cache);
+	kfree(entry);
+}
+
 void dst_cache_destroy(struct dst_cache *dst_cache)
 {
+	struct dst_cache_entry *entry;
 	int i;
 
 	if (!dst_cache->cache)
 		return;
 
+	entry = xa_erase_bh(&dst_caches, dst_cache->id);
+
 	for_each_possible_cpu(i)
 		dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
 
-	free_percpu(dst_cache->cache);
+	if (!WARN_ON_ONCE(!entry))
+		call_rcu(&entry->rcu, dst_cache_entry_free);
 }
 EXPORT_SYMBOL_GPL(dst_cache_destroy);
 
@@ -182,3 +224,59 @@  void dst_cache_reset_now(struct dst_cache *dst_cache)
 	}
 }
 EXPORT_SYMBOL_GPL(dst_cache_reset_now);
+
+static void dst_cache_flush_dev(struct dst_cache_entry *entry,
+				struct net_device *dev)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		struct dst_cache_pcpu *idst = per_cpu_ptr(entry->cache, i);
+		struct dst_entry *dst = READ_ONCE(idst->dst);
+
+		if (!dst || !dst_hold_safe(dst))
+			continue;
+
+		if (!list_empty(&dst->rt_uncached) || dst->dev != dev)
+			goto release;
+
+		dst->dev = blackhole_netdev;
+		netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
+				   GFP_ATOMIC);
+
+release:
+		dst_release(dst);
+	}
+}
+
+static int dst_cache_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct dst_cache_entry *entry;
+	XA_STATE(xas, &dst_caches, 0);
+
+	if (event == NETDEV_UNREGISTER) {
+		rcu_read_lock();
+		xas_for_each(&xas, entry, UINT_MAX) {
+			dst_cache_flush_dev(entry, dev);
+			if (need_resched()) {
+				xas_pause(&xas);
+				cond_resched_rcu();
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dst_cache_notifier = {
+	.notifier_call = dst_cache_netdev_event,
+};
+
+static int __init dst_cache_notifier_init(void)
+{
+	return register_netdevice_notifier(&dst_cache_notifier);
+}
+
+subsys_initcall(dst_cache_notifier_init);