Message ID | 20240514040757.1957761-1-omid.ehtemamhaghighi@menlosecurity.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net/ipv6: Fix kernel soft lockup in fib6_select_path under high next hop churn | expand |
On 5/13/24 10:07 PM, Omid Ehtemam-Haghighi wrote: > diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c > index c1f62352a481..b4f3627dd045 100644 > --- a/net/ipv6/ip6_fib.c > +++ b/net/ipv6/ip6_fib.c > @@ -1037,7 +1037,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, > fib6_drop_pcpu_from(rt, table); > > if (rt->nh && !list_empty(&rt->nh_list)) > - list_del_init(&rt->nh_list); > + list_del_rcu(&rt->nh_list); This path is only for the separate nexthop objects (the rt->nh check), while you seem to be dependent on the legacy IPv6 multipath code. > > if (refcount_read(&rt->fib6_ref) != 1) { > /* This route is used as dummy address holder in some split > @@ -1247,7 +1247,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, > fib6_siblings) > sibling->fib6_nsiblings--; > rt->fib6_nsiblings = 0; > - list_del_init(&rt->fib6_siblings); > + list_del_rcu(&rt->fib6_siblings); If using rcu for fib6_siblings fixes your problem, then all references should be updated to annotate or use the rcu apis. > rt6_multipath_rebalance(next_sibling); > return err; > } > @@ -1965,7 +1965,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, > &rt->fib6_siblings, fib6_siblings) > sibling->fib6_nsiblings--; > rt->fib6_nsiblings = 0; > - list_del_init(&rt->fib6_siblings); > + list_del_rcu(&rt->fib6_siblings); > rt6_multipath_rebalance(next_sibling); > } > > diff --git a/net/ipv6/route.c b/net/ipv6/route.c > index 1f4b935a0e57..485a14098958 100644 > --- a/net/ipv6/route.c > +++ b/net/ipv6/route.c > @@ -414,7 +414,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, > struct flowi6 *fl6, int oif, bool have_oif_match, > const struct sk_buff *skb, int strict) > { > - struct fib6_info *sibling, *next_sibling; > + struct fib6_info *sibling; > struct fib6_info *match = res->f6i; > > if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) > @@ -441,8 +441,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, > if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) > goto out; > > - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, > - fib6_siblings) { > + list_for_each_entry_rcu(sibling, &match->fib6_siblings, > + fib6_siblings) { > const struct fib6_nh *nh = sibling->fib6_nh; > int nh_upper_bound; >
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c1f62352a481..b4f3627dd045 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1037,7 +1037,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + list_del_rcu(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1247,7 +1247,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } @@ -1965,7 +1965,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1f4b935a0e57..485a14098958 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -414,7 +414,7 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *sibling, *next_sibling; + struct fib6_info *sibling; struct fib6_info *match = res->f6i; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) @@ -441,8 +441,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, - fib6_siblings) { + list_for_each_entry_rcu(sibling, &match->fib6_siblings, + fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound;
Kernel soft lockups have been observed in clusters of Linux-based edge routers located in a highly dynamic environment. These routers consistently update BGP-advertised routes due to frequently changing next hop destinations, while managing significant IPv6 traffic. The lockups occur during the traversal of the multipath circular linked-list in the fib6_select_path function, particularly while iterating through siblings in the list. The issue arises when nodes in the linked list are unexpectedly deleted concurrently on another CPU core—indicated by their 'next' and 'previous' elements pointing back to the node itself and their reference count dropping to zero. This causes an infinite loop, leading to a kernel soft lockup that triggers a kernel panic via the watchdog timer (log attached below). To reproduce the issue, we set up a test environment designed to simulate the conditions that lead to soft lockups. This was achieved by periodically modifying the routing table and generating a heavy load of outgoing IPv6 traffic through four iperf3 clients. This setup consistently triggered soft lockups in less than a minute. As a solution, I applied RCU locks in places that are identified as culprits, which has successfully resolved the issue within our testing parameters. Kernel log: 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d -- <IRQ stack> -- 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb [exception RIP: fib6_select_path+299] RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287 RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000 RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618 RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200 R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830 R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c 10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c 11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5 12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47 13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0 14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274 15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474 16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615 17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec 18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3 19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9 20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice] 21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice] 22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice] 23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000 24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581 25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9 26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47 27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30 28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f 29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64 30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com> --- net/ipv6/ip6_fib.c | 6 +++--- net/ipv6/route.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-)