diff mbox series

[net-next,v7] net: ipv4: Cache pmtu for all packet paths if multipath enabled

Message ID 20241104072753.77042-1-deliran@verdict.gg (mailing list archive)
State New
Headers show
Series [net-next,v7] net: ipv4: Cache pmtu for all packet paths if multipath enabled | expand

Commit Message

Vladimir Vdovin Nov. 4, 2024, 7:27 a.m. UTC
Check number of paths by fib_info_num_path(),
and update_or_create_fnhe() for every path.
Problem is that pmtu is cached only for the oif
that has received icmp message "need to frag",
other oifs will still try to use "default" iface mtu.

An example topology showing the problem:

                    |  host1
                +---------+
                |  dummy0 | 10.179.20.18/32  mtu9000
                +---------+
        +-----------+----------------+
    +---------+                     +---------+
    | ens17f0 |  10.179.2.141/31    | ens17f1 |  10.179.2.13/31
    +---------+                     +---------+
        |    (all here have mtu 9000)    |
    +------+                         +------+
    | ro1  |  10.179.2.140/31        | ro2  |  10.179.2.12/31
    +------+                         +------+
        |                                |
---------+------------+-------------------+------
                        |
                    +-----+
                    | ro3 | 10.10.10.10  mtu1500
                    +-----+
                        |
    ========================================
                some networks
    ========================================
                        |
                    +-----+
                    | eth0| 10.10.30.30  mtu9000
                    +-----+
                        |  host2

host1 have enabled multipath and
sysctl net.ipv4.fib_multipath_hash_policy = 1:

default proto static src 10.179.20.18
        nexthop via 10.179.2.12 dev ens17f1 weight 1
        nexthop via 10.179.2.140 dev ens17f0 weight 1

When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives at ens17f1 iface an icmp packet from ro3 that ro3 mtu=1500.
And host1 caches it in nexthop exceptions cache.

Problem is that it is cached only for the iface that has received icmp,
and there is no way that ro3 will send icmp msg to host1 via another path.

Host1 now have this routes to host2:

ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
    cache expires 521sec mtu 1500

ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
    cache

So when host1 tries again to reach host2 with mtu>1500,
if packet flow is lucky enough to be hashed with oif=ens17f1 its ok,
if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1,
until lucky day when ro3 will send it through another flow to ens17f0.

Signed-off-by: Vladimir Vdovin <deliran@verdict.gg>
---
V7:
  selftest in pmtu.sh:
    - add setup_multipath() with old and new nh tests
    - add global "dummy_v4" addr variables
    - add documentation
    - remove dummy netdev usage in mp nh test
    - remove useless sysctl opts in mp nh test

V6:
  - make commit message cleaner

V5:
  - make self test cleaner

V4:
  - fix selftest, do route lookup before checking cached exceptions

V3:
  - add selftest
  - fix compile error

V2:
  - fix fib_info_num_path parameter pass
---
 net/ipv4/route.c                    | 13 ++++
 tools/testing/selftests/net/pmtu.sh | 95 ++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 1 deletion(-)


base-commit: 66600fac7a984dea4ae095411f644770b2561ede

Comments

Ido Schimmel Nov. 4, 2024, 5:44 p.m. UTC | #1
+ Guillaume:
please see comment below about route_get_dst_pmtu_from_exception())
Original patch:
https://lore.kernel.org/netdev/20241104072753.77042-1-deliran@verdict.gg/

On Mon, Nov 04, 2024 at 07:27:50AM +0000, Vladimir Vdovin wrote:
> diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> index 569bce8b6383..f24c84184c61 100755
> --- a/tools/testing/selftests/net/pmtu.sh
> +++ b/tools/testing/selftests/net/pmtu.sh
> @@ -197,6 +197,12 @@
>  #
>  # - pmtu_ipv6_route_change
>  #	Same as above but with IPv6
> +#
> +# - pmtu_ipv4_mp_exceptions
> +#	Use the same topology as in pmtu_ipv4, but add routeable "dummy"

No need for "dummy" as these are regular addresses on the loopback
device

> +#	addresses on host A and B on lo0 reachable via both routers.

There is no "lo0" device. Only "lo". Run with "-v" and you will see a
lot of errors

> +#	Host A and B "dummy" addresses have multipath routes to each other.
> +#	Check that PMTU exceptions are created for both paths.
>  
>  source lib.sh
>  source net_helper.sh
> @@ -266,7 +272,8 @@ tests="
>  	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
>  	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
>  	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
> -	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
> +	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
> +	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
>  
>  # Addressing and routing for tests with routers: four network segments, with
>  # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
> @@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
>  tunnel6_b_addr="fd00:2::b"
>  tunnel6_mask="64"
>  
> +dummy4_a_addr="192.168.99.99"
> +dummy4_b_addr="192.168.88.88"

Let's change to "host4_a_addr" and "host4_b_addr" (or similar) as we are
no longer using a dummy device

> +
>  dummy6_0_prefix="fc00:1000::"
>  dummy6_1_prefix="fc00:1001::"
>  dummy6_mask="64"
> @@ -984,6 +994,50 @@ setup_ovs_bridge() {
>  	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
>  }
>  
> +setup_multipath() {
> +	if [ "$USE_NH" = "yes" ]; then
> +		setup_multipath_new
> +	else
> +		setup_multipath_old
> +	fi

Please move setup_multipath_{new,old}() before setup_multipath() like
setup_routing() and related functions

> +
> +	# Set up routers with routes to dummies
> +	run_cmd ${ns_r1} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r1}.1
> +	run_cmd ${ns_r2} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r2}.1
> +	run_cmd ${ns_r1} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r1}.1
> +	run_cmd ${ns_r2} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r2}.1
> +}
> +
> +setup_multipath_new() {
> +	# Set up host A with multipath routes to host B dummy4_b_addr
> +	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0

s/lo0/lo/ same in other places

> +	run_cmd ${ns_a} ip nexthop add id 201 via ${prefix4}.${a_r1}.2 dev veth_A-R1
> +	run_cmd ${ns_a} ip nexthop add id 202 via ${prefix4}.${a_r2}.2 dev veth_A-R2
> +	run_cmd ${ns_a} ip nexthop add id 203 group 201/202
> +	run_cmd ${ns_a} ip route add ${dummy4_b_addr} nhid 203

Maybe number the nexthops 401..403 so that we can later use 601..603 for
IPv6 like $routes_nh is doing

> +
> +	# Set up host B with multipath routes to host A dummy4_a_addr
> +	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
> +	run_cmd ${ns_b} ip nexthop add id 201 via ${prefix4}.${b_r1}.2 dev veth_A-R1

s/veth_A-R1/veth_B-R1/

> +	run_cmd ${ns_b} ip nexthop add id 202 via ${prefix4}.${b_r2}.2 dev veth_A-R2

s/veth_A-R2/veth_B-R2/

> +	run_cmd ${ns_b} ip nexthop add id 203 group 201/202
> +	run_cmd ${ns_b} ip route add ${dummy4_a_addr} nhid 203
> +}
> +
> +setup_multipath_old() {
> +	# Set up host A with multipath routes to host B dummy4_b_addr
> +	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0
> +	run_cmd ${ns_a} ip route add ${dummy4_b_addr} \
> +			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
> +			nexthop via ${prefix4}.${a_r2}.2 weight 1
> +
> +	# Set up host B with multipath routes to host A dummy4_a_addr
> +	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
> +	run_cmd ${ns_a} ip route add ${dummy4_a_addr} \

s/ns_a/ns_b/

> +			nexthop via ${prefix4}.${a_b1}.2 weight 1 \

s/a_b1/b_r1/

> +			nexthop via ${prefix4}.${a_b2}.2 weight 1

s/a_b2/b_r2/

> +}
> +
>  setup() {
>  	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
>  
> @@ -2329,6 +2383,45 @@ test_pmtu_ipv6_route_change() {
>  	test_pmtu_ipvX_route_change 6
>  }
>  
> +test_pmtu_ipv4_mp_exceptions() {
> +	setup namespaces routing multipath || return $ksft_skip
> +
> +	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
> +	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
> +	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
> +	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
> +
> +	# Set up initial MTU values
> +	mtu "${ns_a}"  veth_A-R1 2000
> +	mtu "${ns_r1}" veth_R1-A 2000
> +	mtu "${ns_r1}" veth_R1-B 1400
> +	mtu "${ns_b}"  veth_B-R1 1400
> +
> +	mtu "${ns_a}"  veth_A-R2 2000
> +	mtu "${ns_r2}" veth_R2-A 2000
> +	mtu "${ns_r2}" veth_R2-B 1500
> +	mtu "${ns_b}"  veth_B-R2 1500
> +
> +	fail=0
> +
> +	# Ping and expect two nexthop exceptions for two routes in nh group
> +	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
> +

I looked more into pmtu.sh and this hunk from here ...

> +	# Do route lookup before checking cached exceptions.
> +	# The following commands are needed for dst entries to be cached
> +	# in both paths exceptions and therefore dumped to user space
> +	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
> +	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
> +
> +	# Check cached exceptions
> +	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
> +		err "  there are not enough cached exceptions"
> +		fail=1
> +	fi

... until here can be replaced by route_get_dst_pmtu_from_exception() and
check_pmtu_value() like in other test cases. There are two
prerequisites:

1. We should set the same MTU in both paths as otherwise we don't know
which MTU will be cached and what to pass to check_pmtu_value() as the
expected value. I did see that check_pmtu_value() accepts "any", but I
think it's better to check for a specific value.

2. route_get_dst_pmtu_from_exception() is not very flexible in the
keywords it accepts for "ip route get" and we need to pass "oif". It can
be solved by [1] (please test), but Guillaume might have a better idea.
Then, the above hunk can be replaced by [2].

[1]
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 569bce8b6383..6e790d38e5d9 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -1076,23 +1076,15 @@ link_get_mtu() {
 }
 
 route_get_dst_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
-
-	if [ -z "${dsfield}" ]; then
-		dsfield=0
-	fi
+	ns_cmd="${1}"; shift
 
-	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+	${ns_cmd} ip route get "$@"
 }
 
 route_get_dst_pmtu_from_exception() {
-	ns_cmd="${1}"
-	dst="${2}"
-	dsfield="${3}"
+	ns_cmd="${1}"; shift
 
-	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
 }
 
 check_pmtu_value() {
@@ -1235,10 +1227,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
 	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
 
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }
 
@@ -1285,9 +1277,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
 		UDP:"${dst2}":50000,tos="${dsfield}"
 
 	# Check that exceptions have been created with the correct PMTU
-	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
 	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
-	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
 	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
 }

[2]
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index a3c3f7f99e5b..10b8ac2d7f47 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -2399,19 +2399,11 @@ test_pmtu_ipv4_mp_exceptions() {
 	# Ping and expect two nexthop exceptions for two routes in nh group
 	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
 
-	# Do route lookup before checking cached exceptions.
-	# The following commands are needed for dst entries to be cached
-	# in both paths exceptions and therefore dumped to user space
-	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
-	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
-
-	# Check cached exceptions
-	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
-		err "  there are not enough cached exceptions"
-		fail=1
-	fi
-
-	return ${fail}
+	# Check that exceptions have been created with the correct PMTU
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R1)"
+	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R1)" || return 1
+	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R2)"
+	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R2)" || return 1
 }
 
 usage() {
Vladimir Vdovin Nov. 6, 2024, 1:29 p.m. UTC | #2
Thanks for your detailed review, sending v8 patch to summerize all changes

On Mon Nov 4, 2024 at 8:44 PM MSK, Ido Schimmel wrote:
> + Guillaume:
> please see comment below about route_get_dst_pmtu_from_exception())
> Original patch:
> https://lore.kernel.org/netdev/20241104072753.77042-1-deliran@verdict.gg/
>
> On Mon, Nov 04, 2024 at 07:27:50AM +0000, Vladimir Vdovin wrote:
> > diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> > index 569bce8b6383..f24c84184c61 100755
> > --- a/tools/testing/selftests/net/pmtu.sh
> > +++ b/tools/testing/selftests/net/pmtu.sh
> > @@ -197,6 +197,12 @@
> >  #
> >  # - pmtu_ipv6_route_change
> >  #	Same as above but with IPv6
> > +#
> > +# - pmtu_ipv4_mp_exceptions
> > +#	Use the same topology as in pmtu_ipv4, but add routeable "dummy"
>
> No need for "dummy" as these are regular addresses on the loopback
> device
>
> > +#	addresses on host A and B on lo0 reachable via both routers.
>
> There is no "lo0" device. Only "lo". Run with "-v" and you will see a
> lot of errors
>
> > +#	Host A and B "dummy" addresses have multipath routes to each other.
> > +#	Check that PMTU exceptions are created for both paths.
> >  
> >  source lib.sh
> >  source net_helper.sh
> > @@ -266,7 +272,8 @@ tests="
> >  	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
> >  	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
> >  	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
> > -	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
> > +	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
> > +	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
> >  
> >  # Addressing and routing for tests with routers: four network segments, with
> >  # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
> > @@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
> >  tunnel6_b_addr="fd00:2::b"
> >  tunnel6_mask="64"
> >  
> > +dummy4_a_addr="192.168.99.99"
> > +dummy4_b_addr="192.168.88.88"
>
> Let's change to "host4_a_addr" and "host4_b_addr" (or similar) as we are
> no longer using a dummy device
>
> > +
> >  dummy6_0_prefix="fc00:1000::"
> >  dummy6_1_prefix="fc00:1001::"
> >  dummy6_mask="64"
> > @@ -984,6 +994,50 @@ setup_ovs_bridge() {
> >  	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
> >  }
> >  
> > +setup_multipath() {
> > +	if [ "$USE_NH" = "yes" ]; then
> > +		setup_multipath_new
> > +	else
> > +		setup_multipath_old
> > +	fi
>
> Please move setup_multipath_{new,old}() before setup_multipath() like
> setup_routing() and related functions
>
> > +
> > +	# Set up routers with routes to dummies
> > +	run_cmd ${ns_r1} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r1}.1
> > +	run_cmd ${ns_r2} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r2}.1
> > +	run_cmd ${ns_r1} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r1}.1
> > +	run_cmd ${ns_r2} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r2}.1
> > +}
> > +
> > +setup_multipath_new() {
> > +	# Set up host A with multipath routes to host B dummy4_b_addr
> > +	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0
>
> s/lo0/lo/ same in other places
>
> > +	run_cmd ${ns_a} ip nexthop add id 201 via ${prefix4}.${a_r1}.2 dev veth_A-R1
> > +	run_cmd ${ns_a} ip nexthop add id 202 via ${prefix4}.${a_r2}.2 dev veth_A-R2
> > +	run_cmd ${ns_a} ip nexthop add id 203 group 201/202
> > +	run_cmd ${ns_a} ip route add ${dummy4_b_addr} nhid 203
>
> Maybe number the nexthops 401..403 so that we can later use 601..603 for
> IPv6 like $routes_nh is doing
>
> > +
> > +	# Set up host B with multipath routes to host A dummy4_a_addr
> > +	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
> > +	run_cmd ${ns_b} ip nexthop add id 201 via ${prefix4}.${b_r1}.2 dev veth_A-R1
>
> s/veth_A-R1/veth_B-R1/
>
> > +	run_cmd ${ns_b} ip nexthop add id 202 via ${prefix4}.${b_r2}.2 dev veth_A-R2
>
> s/veth_A-R2/veth_B-R2/
>
> > +	run_cmd ${ns_b} ip nexthop add id 203 group 201/202
> > +	run_cmd ${ns_b} ip route add ${dummy4_a_addr} nhid 203
> > +}
> > +
> > +setup_multipath_old() {
> > +	# Set up host A with multipath routes to host B dummy4_b_addr
> > +	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0
> > +	run_cmd ${ns_a} ip route add ${dummy4_b_addr} \
> > +			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
> > +			nexthop via ${prefix4}.${a_r2}.2 weight 1
> > +
> > +	# Set up host B with multipath routes to host A dummy4_a_addr
> > +	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
> > +	run_cmd ${ns_a} ip route add ${dummy4_a_addr} \
>
> s/ns_a/ns_b/
>
> > +			nexthop via ${prefix4}.${a_b1}.2 weight 1 \
>
> s/a_b1/b_r1/
>
> > +			nexthop via ${prefix4}.${a_b2}.2 weight 1
>
> s/a_b2/b_r2/
>
> > +}
> > +
> >  setup() {
> >  	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
> >  
> > @@ -2329,6 +2383,45 @@ test_pmtu_ipv6_route_change() {
> >  	test_pmtu_ipvX_route_change 6
> >  }
> >  
> > +test_pmtu_ipv4_mp_exceptions() {
> > +	setup namespaces routing multipath || return $ksft_skip
> > +
> > +	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
> > +	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
> > +	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
> > +	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
> > +
> > +	# Set up initial MTU values
> > +	mtu "${ns_a}"  veth_A-R1 2000
> > +	mtu "${ns_r1}" veth_R1-A 2000
> > +	mtu "${ns_r1}" veth_R1-B 1400
> > +	mtu "${ns_b}"  veth_B-R1 1400
> > +
> > +	mtu "${ns_a}"  veth_A-R2 2000
> > +	mtu "${ns_r2}" veth_R2-A 2000
> > +	mtu "${ns_r2}" veth_R2-B 1500
> > +	mtu "${ns_b}"  veth_B-R2 1500
> > +
> > +	fail=0
> > +
> > +	# Ping and expect two nexthop exceptions for two routes in nh group
> > +	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
> > +
>
> I looked more into pmtu.sh and this hunk from here ...
>
> > +	# Do route lookup before checking cached exceptions.
> > +	# The following commands are needed for dst entries to be cached
> > +	# in both paths exceptions and therefore dumped to user space
> > +	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
> > +	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
> > +
> > +	# Check cached exceptions
> > +	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
> > +		err "  there are not enough cached exceptions"
> > +		fail=1
> > +	fi
>
> ... until here can be replaced by route_get_dst_pmtu_from_exception() and
> check_pmtu_value() like in other test cases. There are two
> prerequisites:
>
> 1. We should set the same MTU in both paths as otherwise we don't know
> which MTU will be cached and what to pass to check_pmtu_value() as the
> expected value. I did see that check_pmtu_value() accepts "any", but I
> think it's better to check for a specific value.
>
> 2. route_get_dst_pmtu_from_exception() is not very flexible in the
> keywords it accepts for "ip route get" and we need to pass "oif". It can
> be solved by [1] (please test), but Guillaume might have a better idea.
> Then, the above hunk can be replaced by [2].
>
> [1]
> diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> index 569bce8b6383..6e790d38e5d9 100755
> --- a/tools/testing/selftests/net/pmtu.sh
> +++ b/tools/testing/selftests/net/pmtu.sh
> @@ -1076,23 +1076,15 @@ link_get_mtu() {
>  }
>  
>  route_get_dst_exception() {
> -	ns_cmd="${1}"
> -	dst="${2}"
> -	dsfield="${3}"
> -
> -	if [ -z "${dsfield}" ]; then
> -		dsfield=0
> -	fi
> +	ns_cmd="${1}"; shift
>  
> -	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
> +	${ns_cmd} ip route get "$@"
>  }
>  
>  route_get_dst_pmtu_from_exception() {
> -	ns_cmd="${1}"
> -	dst="${2}"
> -	dsfield="${3}"
> +	ns_cmd="${1}"; shift
>  
> -	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
> +	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
>  }
>  
>  check_pmtu_value() {
> @@ -1235,10 +1227,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
>  	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
>  
>  	# Check that exceptions have been created with the correct PMTU
> -	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
> +	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
>  	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
>  
> -	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
> +	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
>  	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
>  }
>  
> @@ -1285,9 +1277,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
>  		UDP:"${dst2}":50000,tos="${dsfield}"
>  
>  	# Check that exceptions have been created with the correct PMTU
> -	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
> +	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
>  	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
> -	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
> +	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
>  	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
>  }
>
> [2]
> diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> index a3c3f7f99e5b..10b8ac2d7f47 100755
> --- a/tools/testing/selftests/net/pmtu.sh
> +++ b/tools/testing/selftests/net/pmtu.sh
> @@ -2399,19 +2399,11 @@ test_pmtu_ipv4_mp_exceptions() {
>  	# Ping and expect two nexthop exceptions for two routes in nh group
>  	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
>  
> -	# Do route lookup before checking cached exceptions.
> -	# The following commands are needed for dst entries to be cached
> -	# in both paths exceptions and therefore dumped to user space
> -	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
> -	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
> -
> -	# Check cached exceptions
> -	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
> -		err "  there are not enough cached exceptions"
> -		fail=1
> -	fi
> -
> -	return ${fail}
> +	# Check that exceptions have been created with the correct PMTU
> +	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R1)"
> +	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R1)" || return 1
> +	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R2)"
> +	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R2)" || return 1
>  }
>  
>  usage() {
Guillaume Nault Nov. 6, 2024, 3:26 p.m. UTC | #3
On Mon, Nov 04, 2024 at 07:44:29PM +0200, Ido Schimmel wrote:
> 1. We should set the same MTU in both paths as otherwise we don't know
> which MTU will be cached and what to pass to check_pmtu_value() as the
> expected value. I did see that check_pmtu_value() accepts "any", but I
> think it's better to check for a specific value.
> 
> 2. route_get_dst_pmtu_from_exception() is not very flexible in the
> keywords it accepts for "ip route get" and we need to pass "oif". It can
> be solved by [1] (please test), but Guillaume might have a better idea.
> Then, the above hunk can be replaced by [2].

Thanks for bringing this to my attention Ido!
I fully agree with this approach (and I also fully agree with your other
feedbacks).

Nice to see this bug fixed and get a selftest!

> [1]
> diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> index 569bce8b6383..6e790d38e5d9 100755
> --- a/tools/testing/selftests/net/pmtu.sh
> +++ b/tools/testing/selftests/net/pmtu.sh
> @@ -1076,23 +1076,15 @@ link_get_mtu() {
>  }
>  
>  route_get_dst_exception() {
> -	ns_cmd="${1}"
> -	dst="${2}"
> -	dsfield="${3}"
> -
> -	if [ -z "${dsfield}" ]; then
> -		dsfield=0
> -	fi
> +	ns_cmd="${1}"; shift
>  
> -	${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
> +	${ns_cmd} ip route get "$@"
>  }
>  
>  route_get_dst_pmtu_from_exception() {
> -	ns_cmd="${1}"
> -	dst="${2}"
> -	dsfield="${3}"
> +	ns_cmd="${1}"; shift
>  
> -	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
> +	mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
>  }
>  
>  check_pmtu_value() {
> @@ -1235,10 +1227,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
>  	run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
>  
>  	# Check that exceptions have been created with the correct PMTU
> -	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
> +	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
>  	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
>  
> -	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
> +	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
>  	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
>  }
>  
> @@ -1285,9 +1277,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
>  		UDP:"${dst2}":50000,tos="${dsfield}"
>  
>  	# Check that exceptions have been created with the correct PMTU
> -	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
> +	pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1} dsfield ${policy_mark})"
>  	check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
> -	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
> +	pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2} dsfield ${policy_mark})"
>  	check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
>  }
> 
> [2]
> diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
> index a3c3f7f99e5b..10b8ac2d7f47 100755
> --- a/tools/testing/selftests/net/pmtu.sh
> +++ b/tools/testing/selftests/net/pmtu.sh
> @@ -2399,19 +2399,11 @@ test_pmtu_ipv4_mp_exceptions() {
>  	# Ping and expect two nexthop exceptions for two routes in nh group
>  	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
>  
> -	# Do route lookup before checking cached exceptions.
> -	# The following commands are needed for dst entries to be cached
> -	# in both paths exceptions and therefore dumped to user space
> -	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
> -	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
> -
> -	# Check cached exceptions
> -	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
> -		err "  there are not enough cached exceptions"
> -		fail=1
> -	fi
> -
> -	return ${fail}
> +	# Check that exceptions have been created with the correct PMTU
> +	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R1)"
> +	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R1)" || return 1
> +	pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dummy4_b_addr} oif veth_A-R2)"
> +	check_pmtu_value "1500" "${pmtu}" "exceeding MTU (veth_A-R2)" || return 1
>  }
>  
>  usage() {
>
diff mbox series

Patch

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 723ac9181558..652f603d29fe 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1027,6 +1027,19 @@  static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 		struct fib_nh_common *nhc;
 
 		fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		if (fib_info_num_path(res.fi) > 1) {
+			int nhsel;
+
+			for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+				nhc = fib_info_nhc(res.fi, nhsel);
+				update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+						      jiffies + net->ipv4.ip_rt_mtu_expires);
+			}
+			rcu_read_unlock();
+			return;
+		}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
 		nhc = FIB_RES_NHC(res);
 		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
 				      jiffies + net->ipv4.ip_rt_mtu_expires);
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 569bce8b6383..f24c84184c61 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -197,6 +197,12 @@ 
 #
 # - pmtu_ipv6_route_change
 #	Same as above but with IPv6
+#
+# - pmtu_ipv4_mp_exceptions
+#	Use the same topology as in pmtu_ipv4, but add routeable "dummy"
+#	addresses on host A and B on lo0 reachable via both routers.
+#	Host A and B "dummy" addresses have multipath routes to each other.
+#	Check that PMTU exceptions are created for both paths.
 
 source lib.sh
 source net_helper.sh
@@ -266,7 +272,8 @@  tests="
 	list_flush_ipv4_exception	ipv4: list and flush cached exceptions	1
 	list_flush_ipv6_exception	ipv6: list and flush cached exceptions	1
 	pmtu_ipv4_route_change		ipv4: PMTU exception w/route replace	1
-	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1"
+	pmtu_ipv6_route_change		ipv6: PMTU exception w/route replace	1
+	pmtu_ipv4_mp_exceptions		ipv4: PMTU multipath nh exceptions	1"
 
 # Addressing and routing for tests with routers: four network segments, with
 # index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +350,9 @@  tunnel6_a_addr="fd00:2::a"
 tunnel6_b_addr="fd00:2::b"
 tunnel6_mask="64"
 
+dummy4_a_addr="192.168.99.99"
+dummy4_b_addr="192.168.88.88"
+
 dummy6_0_prefix="fc00:1000::"
 dummy6_1_prefix="fc00:1001::"
 dummy6_mask="64"
@@ -984,6 +994,50 @@  setup_ovs_bridge() {
 	run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
 }
 
+setup_multipath() {
+	if [ "$USE_NH" = "yes" ]; then
+		setup_multipath_new
+	else
+		setup_multipath_old
+	fi
+
+	# Set up routers with routes to dummies
+	run_cmd ${ns_r1} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r1}.1
+	run_cmd ${ns_r2} ip route add ${dummy4_a_addr} via ${prefix4}.${a_r2}.1
+	run_cmd ${ns_r1} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r1}.1
+	run_cmd ${ns_r2} ip route add ${dummy4_b_addr} via ${prefix4}.${b_r2}.1
+}
+
+setup_multipath_new() {
+	# Set up host A with multipath routes to host B dummy4_b_addr
+	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0
+	run_cmd ${ns_a} ip nexthop add id 201 via ${prefix4}.${a_r1}.2 dev veth_A-R1
+	run_cmd ${ns_a} ip nexthop add id 202 via ${prefix4}.${a_r2}.2 dev veth_A-R2
+	run_cmd ${ns_a} ip nexthop add id 203 group 201/202
+	run_cmd ${ns_a} ip route add ${dummy4_b_addr} nhid 203
+
+	# Set up host B with multipath routes to host A dummy4_a_addr
+	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
+	run_cmd ${ns_b} ip nexthop add id 201 via ${prefix4}.${b_r1}.2 dev veth_A-R1
+	run_cmd ${ns_b} ip nexthop add id 202 via ${prefix4}.${b_r2}.2 dev veth_A-R2
+	run_cmd ${ns_b} ip nexthop add id 203 group 201/202
+	run_cmd ${ns_b} ip route add ${dummy4_a_addr} nhid 203
+}
+
+setup_multipath_old() {
+	# Set up host A with multipath routes to host B dummy4_b_addr
+	run_cmd ${ns_a} ip addr add ${dummy4_a_addr} dev lo0
+	run_cmd ${ns_a} ip route add ${dummy4_b_addr} \
+			nexthop via ${prefix4}.${a_r1}.2 weight 1 \
+			nexthop via ${prefix4}.${a_r2}.2 weight 1
+
+	# Set up host B with multipath routes to host A dummy4_a_addr
+	run_cmd ${ns_b} ip addr add ${dummy4_b_addr} dev lo0
+	run_cmd ${ns_a} ip route add ${dummy4_a_addr} \
+			nexthop via ${prefix4}.${a_b1}.2 weight 1 \
+			nexthop via ${prefix4}.${a_b2}.2 weight 1
+}
+
 setup() {
 	[ "$(id -u)" -ne 0 ] && echo "  need to run as root" && return $ksft_skip
 
@@ -2329,6 +2383,45 @@  test_pmtu_ipv6_route_change() {
 	test_pmtu_ipvX_route_change 6
 }
 
+test_pmtu_ipv4_mp_exceptions() {
+	setup namespaces routing multipath || return $ksft_skip
+
+	trace "${ns_a}"  veth_A-R1    "${ns_r1}" veth_R1-A \
+	      "${ns_r1}" veth_R1-B    "${ns_b}"  veth_B-R1 \
+	      "${ns_a}"  veth_A-R2    "${ns_r2}" veth_R2-A \
+	      "${ns_r2}" veth_R2-B    "${ns_b}"  veth_B-R2
+
+	# Set up initial MTU values
+	mtu "${ns_a}"  veth_A-R1 2000
+	mtu "${ns_r1}" veth_R1-A 2000
+	mtu "${ns_r1}" veth_R1-B 1400
+	mtu "${ns_b}"  veth_B-R1 1400
+
+	mtu "${ns_a}"  veth_A-R2 2000
+	mtu "${ns_r2}" veth_R2-A 2000
+	mtu "${ns_r2}" veth_R2-B 1500
+	mtu "${ns_b}"  veth_B-R2 1500
+
+	fail=0
+
+	# Ping and expect two nexthop exceptions for two routes in nh group
+	run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${dummy4_b_addr}"
+
+	# Do route lookup before checking cached exceptions.
+	# The following commands are needed for dst entries to be cached
+	# in both paths exceptions and therefore dumped to user space
+	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R1
+	run_cmd ${ns_a} ip route get ${dummy4_b_addr} oif veth_A-R2
+
+	# Check cached exceptions
+	if [ "$(${ns_a} ip -oneline route list cache | grep mtu | wc -l)" -ne 2 ]; then
+		err "  there are not enough cached exceptions"
+		fail=1
+	fi
+
+	return ${fail}
+}
+
 usage() {
 	echo
 	echo "$0 [OPTIONS] [TEST]..."