Skip to content

Commit 4dc1ebd

Browse files
author
Hangbin Liu
committed
ipv6: Fix soft lockups in fib6_select_path under high next hop churn
JIRA: https://issues.redhat.com/browse/RHEL-73281 Upstream Status: net.git commit d9ccb18 Conflicts: context conflicts due to missing a lot of selftest updates. commit d9ccb18 Author: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com> Date: Tue Nov 5 17:02:36 2024 -0800 ipv6: Fix soft lockups in fib6_select_path under high next hop churn Soft lockups have been observed on a cluster of Linux-based edge routers located in a highly dynamic environment. Using the `bird` service, these routers continuously update BGP-advertised routes due to frequently changing nexthop destinations, while also managing significant IPv6 traffic. The lockups occur during the traversal of the multipath circular linked-list in the `fib6_select_path` function, particularly while iterating through the siblings in the list. The issue typically arises when the nodes of the linked list are unexpectedly deleted concurrently on a different core—indicated by their 'next' and 'previous' elements pointing back to the node itself and their reference count dropping to zero. This results in an infinite loop, leading to a soft lockup that triggers a system panic via the watchdog timer. Apply RCU primitives in the problematic code sections to resolve the issue. Where necessary, update the references to fib6_siblings to annotate or use the RCU APIs. Include a test script that reproduces the issue. The script periodically updates the routing table while generating a heavy load of outgoing IPv6 traffic through multiple iperf3 clients. It consistently induces infinite soft lockups within a couple of minutes. 
Kernel log: 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d -- <IRQ stack> -- 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb [exception RIP: fib6_select_path+299] RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287 RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000 RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618 RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200 R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830 R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c 10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c 11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5 12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47 13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0 14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274 15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474 16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615 17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec 18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3 19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9 20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice] 21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice] 22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice] 23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000 
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581 25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9 26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47 27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30 28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f 29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64 30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb Fixes: 66f5d6c ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Adrian Oliver <kernel@aoliver.ca> Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi@menlosecurity.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Ido Schimmel <idosch@idosch.org> Cc: Kuniyuki Iwashima <kuniyu@amazon.com> Cc: Simon Horman <horms@kernel.org> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menlosecurity.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Hangbin Liu <haliu@redhat.com>
1 parent d4d8fdf commit 4dc1ebd

File tree

4 files changed

+297
-19
lines changed

4 files changed

+297
-19
lines changed

net/ipv6/ip6_fib.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1183,8 +1183,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
11831183
while (sibling) {
11841184
if (sibling->fib6_metric == rt->fib6_metric &&
11851185
rt6_qualify_for_ecmp(sibling)) {
1186-
list_add_tail(&rt->fib6_siblings,
1187-
&sibling->fib6_siblings);
1186+
list_add_tail_rcu(&rt->fib6_siblings,
1187+
&sibling->fib6_siblings);
11881188
break;
11891189
}
11901190
sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1245,7 +1245,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
12451245
fib6_siblings)
12461246
sibling->fib6_nsiblings--;
12471247
rt->fib6_nsiblings = 0;
1248-
list_del_init(&rt->fib6_siblings);
1248+
list_del_rcu(&rt->fib6_siblings);
12491249
rt6_multipath_rebalance(next_sibling);
12501250
return err;
12511251
}
@@ -1959,7 +1959,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
19591959
&rt->fib6_siblings, fib6_siblings)
19601960
sibling->fib6_nsiblings--;
19611961
rt->fib6_nsiblings = 0;
1962-
list_del_init(&rt->fib6_siblings);
1962+
list_del_rcu(&rt->fib6_siblings);
19631963
rt6_multipath_rebalance(next_sibling);
19641964
}
19651965

net/ipv6/route.c

Lines changed: 30 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -419,8 +419,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
419419
struct flowi6 *fl6, int oif, bool have_oif_match,
420420
const struct sk_buff *skb, int strict)
421421
{
422-
struct fib6_info *sibling, *next_sibling;
423422
struct fib6_info *match = res->f6i;
423+
struct fib6_info *sibling;
424424

425425
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
426426
goto out;
@@ -446,8 +446,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
446446
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
447447
goto out;
448448

449-
list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
450-
fib6_siblings) {
449+
list_for_each_entry_rcu(sibling, &match->fib6_siblings,
450+
fib6_siblings) {
451451
const struct fib6_nh *nh = sibling->fib6_nh;
452452
int nh_upper_bound;
453453

@@ -5216,14 +5216,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
52165216
* nexthop. Since sibling routes are always added at the end of
52175217
* the list, find the first sibling of the last route appended
52185218
*/
5219+
rcu_read_lock();
5220+
52195221
if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
5220-
rt = list_first_entry(&rt_last->fib6_siblings,
5221-
struct fib6_info,
5222-
fib6_siblings);
5222+
rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
5223+
struct fib6_info,
5224+
fib6_siblings);
52235225
}
52245226

52255227
if (rt)
52265228
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
5229+
5230+
rcu_read_unlock();
52275231
}
52285232

52295233
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5568,17 +5572,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
55685572
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
55695573
&nexthop_len);
55705574
} else {
5571-
struct fib6_info *sibling, *next_sibling;
55725575
struct fib6_nh *nh = f6i->fib6_nh;
5576+
struct fib6_info *sibling;
55735577

55745578
nexthop_len = 0;
55755579
if (f6i->fib6_nsiblings) {
55765580
rt6_nh_nlmsg_size(nh, &nexthop_len);
55775581

5578-
list_for_each_entry_safe(sibling, next_sibling,
5579-
&f6i->fib6_siblings, fib6_siblings) {
5582+
rcu_read_lock();
5583+
5584+
list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
5585+
fib6_siblings) {
55805586
rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
55815587
}
5588+
5589+
rcu_read_unlock();
55825590
}
55835591
nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
55845592
}
@@ -5742,7 +5750,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
57425750
lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
57435751
goto nla_put_failure;
57445752
} else if (rt->fib6_nsiblings) {
5745-
struct fib6_info *sibling, *next_sibling;
5753+
struct fib6_info *sibling;
57465754
struct nlattr *mp;
57475755

57485756
mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5754,14 +5762,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
57545762
0) < 0)
57555763
goto nla_put_failure;
57565764

5757-
list_for_each_entry_safe(sibling, next_sibling,
5758-
&rt->fib6_siblings, fib6_siblings) {
5765+
rcu_read_lock();
5766+
5767+
list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
5768+
fib6_siblings) {
57595769
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
57605770
sibling->fib6_nh->fib_nh_weight,
5761-
AF_INET6, 0) < 0)
5771+
AF_INET6, 0) < 0) {
5772+
rcu_read_unlock();
5773+
57625774
goto nla_put_failure;
5775+
}
57635776
}
57645777

5778+
rcu_read_unlock();
5779+
57655780
nla_nest_end(skb, mp);
57665781
} else if (rt->nh) {
57675782
if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6198,7 +6213,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
61986213
err = -ENOBUFS;
61996214
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
62006215

6201-
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
6216+
skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
62026217
if (!skb)
62036218
goto errout;
62046219

@@ -6211,7 +6226,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
62116226
goto errout;
62126227
}
62136228
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
6214-
info->nlh, gfp_any());
6229+
info->nlh, GFP_ATOMIC);
62156230
return;
62166231
errout:
62176232
if (err < 0)

tools/testing/selftests/net/Makefile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,6 +64,7 @@ TEST_PROGS += test_vxlan_mdb.sh
6464
TEST_PROGS += test_bridge_neigh_suppress.sh
6565
TEST_PROGS += test_vxlan_nolocalbypass.sh
6666
TEST_PROGS += test_bridge_backup_port.sh
67+
TEST_PROGS += ipv6_route_update_soft_lockup.sh
6768

6869
TEST_FILES := settings
6970
TEST_FILES += in_netns.sh lib.sh net_helper.sh setup_loopback.sh setup_veth.sh

0 commit comments

Comments
 (0)