diff mbox series

udp: gso: fix MTU check for small packets

Message ID Z5cgWh/6bRQm9vVU@debian.debian (mailing list archive)
State New
Headers show
Series udp: gso: fix MTU check for small packets | expand

Commit Message

Yan Zhai Jan. 27, 2025, 5:57 a.m. UTC
Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
for small packets. But the kernel currently dismisses GSO requests only
after checking MTU on gso_size. This means any packets, regardless of
their payload sizes, would be dropped when MTU is smaller than requested
gso_size. Meanwhile, EINVAL would be returned in this case, making it
very misleading to debug.

Ideally, do not check any GSO related constraints when payload size is
smaller than requested gso_size, and return EMSGSIZE on MTU check
failure consistently for all packets to ease debugging.

Fixes: 4094871db1d6 ("udp: only do GSO if # of segs > 1")
Signed-off-by: Yan Zhai <yan@cloudflare.com>
---
 net/ipv4/udp.c                       | 18 ++++++++----------
 net/ipv6/udp.c                       | 18 ++++++++----------
 tools/testing/selftests/net/udpgso.c | 14 ++++++++++++++
 3 files changed, 30 insertions(+), 20 deletions(-)

Comments

Willem de Bruijn Jan. 27, 2025, 2:33 p.m. UTC | #1
Yan Zhai wrote:
> Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> for small packets. But the kernel currently dismisses GSO requests only
> after checking MTU on gso_size. This means any packets, regardless of
> their payload sizes, would be dropped when MTU is smaller than requested
> gso_size.

Is this a realistic concern? How did you encounter this in practice.

It *is* a misconfiguration to configure a gso_size larger than MTU.

> Meanwhile, EINVAL would be returned in this case, making it
> very misleading to debug.

Misleading is subjective. I'm not sure what is misleading here. From
my above comment, I believe this is correctly EINVAL.

That said, if this impacts a real workload we could reconsider
relaxing the check. I.e., allowing through packets even when an
application has clearly misconfigured UDP_SEGMENT.

> 
> Ideally, do not check any GSO related constraints when payload size is
> smaller than requested gso_size, and return EMSGSIZE on MTU check
> failure consistently for all packets to ease debugging.
> 
> Fixes: 4094871db1d6 ("udp: only do GSO if # of segs > 1")
> Signed-off-by: Yan Zhai <yan@cloudflare.com>
> ---
>  net/ipv4/udp.c                       | 18 ++++++++----------
>  net/ipv6/udp.c                       | 18 ++++++++----------
>  tools/testing/selftests/net/udpgso.c | 14 ++++++++++++++
>  3 files changed, 30 insertions(+), 20 deletions(-)
> 
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c472c9a57cf6..9aed1b4a871f 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -1137,13 +1137,13 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
>  	uh->len = htons(len);
>  	uh->check = 0;
>  
> -	if (cork->gso_size) {
> +	if (cork->gso_size && datalen > cork->gso_size) {
>  		const int hlen = skb_network_header_len(skb) +
>  				 sizeof(struct udphdr);
>  
>  		if (hlen + cork->gso_size > cork->fragsize) {
>  			kfree_skb(skb);
> -			return -EINVAL;
> +			return -EMSGSIZE;
>  		}
>  		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
>  			kfree_skb(skb);
> @@ -1158,15 +1158,13 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
>  			return -EIO;
>  		}
>  
> -		if (datalen > cork->gso_size) {
> -			skb_shinfo(skb)->gso_size = cork->gso_size;
> -			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> -			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> -								 cork->gso_size);
> +		skb_shinfo(skb)->gso_size = cork->gso_size;
> +		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> +		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> +							 cork->gso_size);
>  
> -			/* Don't checksum the payload, skb will get segmented */
> -			goto csum_partial;
> -		}
> +		/* Don't checksum the payload, skb will get segmented */
> +		goto csum_partial;
>  	}
>  
>  	if (is_udplite)  				 /*     UDP-Lite      */
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 6671daa67f4f..6cdc8ce4c6f9 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1385,13 +1385,13 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
>  	uh->len = htons(len);
>  	uh->check = 0;
>  
> -	if (cork->gso_size) {
> +	if (cork->gso_size && datalen > cork->gso_size) {
>  		const int hlen = skb_network_header_len(skb) +
>  				 sizeof(struct udphdr);
>  
>  		if (hlen + cork->gso_size > cork->fragsize) {
>  			kfree_skb(skb);
> -			return -EINVAL;
> +			return -EMSGSIZE;
>  		}
>  		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
>  			kfree_skb(skb);
> @@ -1406,15 +1406,13 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
>  			return -EIO;
>  		}
>  
> -		if (datalen > cork->gso_size) {
> -			skb_shinfo(skb)->gso_size = cork->gso_size;
> -			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> -			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> -								 cork->gso_size);
> +		skb_shinfo(skb)->gso_size = cork->gso_size;
> +		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> +		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> +							 cork->gso_size);
>  
> -			/* Don't checksum the payload, skb will get segmented */
> -			goto csum_partial;
> -		}
> +		/* Don't checksum the payload, skb will get segmented */
> +		goto csum_partial;
>  	}
>  
>  	if (is_udplite)
> diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
> index 3f2fca02fec5..fb73f1c331fb 100644
> --- a/tools/testing/selftests/net/udpgso.c
> +++ b/tools/testing/selftests/net/udpgso.c
> @@ -102,6 +102,13 @@ struct testcase testcases_v4[] = {
>  		.gso_len = CONST_MSS_V4,
>  		.r_num_mss = 1,
>  	},
> +	{
> +		/* datalen <= MSS < gso_len: will fall back to no GSO */
> +		.tlen = CONST_MSS_V4,
> +		.gso_len = CONST_MSS_V4 + 1,
> +		.r_num_mss = 0,
> +		.r_len_last = CONST_MSS_V4,
> +	},
>  	{
>  		/* send a single MSS + 1B */
>  		.tlen = CONST_MSS_V4 + 1,
> @@ -205,6 +212,13 @@ struct testcase testcases_v6[] = {
>  		.gso_len = CONST_MSS_V6,
>  		.r_num_mss = 1,
>  	},
> +	{
> +		/* datalen <= MSS < gso_len: will fall back to no GSO */
> +		.tlen = CONST_MSS_V6,
> +		.gso_len = CONST_MSS_V6 + 1,
> +		.r_num_mss = 0,
> +		.r_len_last = CONST_MSS_V6,
> +	},
>  	{
>  		/* send a single MSS + 1B */
>  		.tlen = CONST_MSS_V6 + 1,
> -- 
> 2.30.2
> 
>
Yan Zhai Jan. 27, 2025, 5 p.m. UTC | #2
Hi Willem,

Thanks for getting back to me.

On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > for small packets. But the kernel currently dismisses GSO requests only
> > after checking MTU on gso_size. This means any packets, regardless of
> > their payload sizes, would be dropped when MTU is smaller than requested
> > gso_size.
>
> Is this a realistic concern? How did you encounter this in practice.
>
> It *is* a misconfiguration to configure a gso_size larger than MTU.
>
> > Meanwhile, EINVAL would be returned in this case, making it
> > very misleading to debug.
>
> Misleading is subjective. I'm not sure what is misleading here. From
> my above comment, I believe this is correctly EINVAL.
>
> That said, if this impacts a real workload we could reconsider
> relaxing the check. I.e., allowing through packets even when an
> application has clearly misconfigured UDP_SEGMENT.
>
We did encounter a painful reliability issue in production last month.

To simplify the scenario, we had these symptoms when the issue occurred:
1. QUIC connections to host A started to fail, and cannot establish new ones
2. User space Wireguard to the exact same host worked 100% fine

This happened rarely, like one or twice a day, lasting for a few
minutes usually, but it was quite visible since it is an office
network.

Initially this prompted something wrong at the protocol layer. But
after multiple rounds of digging, we finally figured the root cause
was:
3. Something sometimes pings host B, which shares the same IP with
host A but different ports (thanks to limited IPv4 space), and its
PMTU was reduced to 1280 occasionally. This unexpectedly affected all
traffic to that IP including traffic toward host A. Our QUIC client
set gso_size to 1350, and that's why it got hit.

I agree that configurations do matter a lot here. Given how broken the
PMTU was for the Internet, we might just turn off pmtudisc option on
our end to avoid this failure path. But for those who hasn't yet, this
could still be confusing if it ever happens, because nothing seems to
point to PMTU in the first place:
* small packets also get dropped
* error code was EINVAL from sendmsg

That said, I probably should have used PMTU in my commit message to be
more clear for our problem. But meanwhile I am also concerned about
newly added tunnels to trigger the same issue, even if it has a static
device MTU. My proposal should make the error reason more clear:
EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
packets getting dropped would have a similar effect.

thanks
Yan

> >
> > Ideally, do not check any GSO related constraints when payload size is
> > smaller than requested gso_size, and return EMSGSIZE on MTU check
> > failure consistently for all packets to ease debugging.
> >
> > Fixes: 4094871db1d6 ("udp: only do GSO if # of segs > 1")
> > Signed-off-by: Yan Zhai <yan@cloudflare.com>
> > ---
> >  net/ipv4/udp.c                       | 18 ++++++++----------
> >  net/ipv6/udp.c                       | 18 ++++++++----------
> >  tools/testing/selftests/net/udpgso.c | 14 ++++++++++++++
> >  3 files changed, 30 insertions(+), 20 deletions(-)
> >
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index c472c9a57cf6..9aed1b4a871f 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -1137,13 +1137,13 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
> >       uh->len = htons(len);
> >       uh->check = 0;
> >
> > -     if (cork->gso_size) {
> > +     if (cork->gso_size && datalen > cork->gso_size) {
> >               const int hlen = skb_network_header_len(skb) +
> >                                sizeof(struct udphdr);
> >
> >               if (hlen + cork->gso_size > cork->fragsize) {
> >                       kfree_skb(skb);
> > -                     return -EINVAL;
> > +                     return -EMSGSIZE;
> >               }
> >               if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
> >                       kfree_skb(skb);
> > @@ -1158,15 +1158,13 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
> >                       return -EIO;
> >               }
> >
> > -             if (datalen > cork->gso_size) {
> > -                     skb_shinfo(skb)->gso_size = cork->gso_size;
> > -                     skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> > -                     skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> > -                                                              cork->gso_size);
> > +             skb_shinfo(skb)->gso_size = cork->gso_size;
> > +             skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> > +             skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> > +                                                      cork->gso_size);
> >
> > -                     /* Don't checksum the payload, skb will get segmented */
> > -                     goto csum_partial;
> > -             }
> > +             /* Don't checksum the payload, skb will get segmented */
> > +             goto csum_partial;
> >       }
> >
> >       if (is_udplite)                                  /*     UDP-Lite      */
> > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> > index 6671daa67f4f..6cdc8ce4c6f9 100644
> > --- a/net/ipv6/udp.c
> > +++ b/net/ipv6/udp.c
> > @@ -1385,13 +1385,13 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
> >       uh->len = htons(len);
> >       uh->check = 0;
> >
> > -     if (cork->gso_size) {
> > +     if (cork->gso_size && datalen > cork->gso_size) {
> >               const int hlen = skb_network_header_len(skb) +
> >                                sizeof(struct udphdr);
> >
> >               if (hlen + cork->gso_size > cork->fragsize) {
> >                       kfree_skb(skb);
> > -                     return -EINVAL;
> > +                     return -EMSGSIZE;
> >               }
> >               if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
> >                       kfree_skb(skb);
> > @@ -1406,15 +1406,13 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
> >                       return -EIO;
> >               }
> >
> > -             if (datalen > cork->gso_size) {
> > -                     skb_shinfo(skb)->gso_size = cork->gso_size;
> > -                     skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> > -                     skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> > -                                                              cork->gso_size);
> > +             skb_shinfo(skb)->gso_size = cork->gso_size;
> > +             skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
> > +             skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
> > +                                                      cork->gso_size);
> >
> > -                     /* Don't checksum the payload, skb will get segmented */
> > -                     goto csum_partial;
> > -             }
> > +             /* Don't checksum the payload, skb will get segmented */
> > +             goto csum_partial;
> >       }
> >
> >       if (is_udplite)
> > diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
> > index 3f2fca02fec5..fb73f1c331fb 100644
> > --- a/tools/testing/selftests/net/udpgso.c
> > +++ b/tools/testing/selftests/net/udpgso.c
> > @@ -102,6 +102,13 @@ struct testcase testcases_v4[] = {
> >               .gso_len = CONST_MSS_V4,
> >               .r_num_mss = 1,
> >       },
> > +     {
> > +             /* datalen <= MSS < gso_len: will fall back to no GSO */
> > +             .tlen = CONST_MSS_V4,
> > +             .gso_len = CONST_MSS_V4 + 1,
> > +             .r_num_mss = 0,
> > +             .r_len_last = CONST_MSS_V4,
> > +     },
> >       {
> >               /* send a single MSS + 1B */
> >               .tlen = CONST_MSS_V4 + 1,
> > @@ -205,6 +212,13 @@ struct testcase testcases_v6[] = {
> >               .gso_len = CONST_MSS_V6,
> >               .r_num_mss = 1,
> >       },
> > +     {
> > +             /* datalen <= MSS < gso_len: will fall back to no GSO */
> > +             .tlen = CONST_MSS_V6,
> > +             .gso_len = CONST_MSS_V6 + 1,
> > +             .r_num_mss = 0,
> > +             .r_len_last = CONST_MSS_V6,
> > +     },
> >       {
> >               /* send a single MSS + 1B */
> >               .tlen = CONST_MSS_V6 + 1,
> > --
> > 2.30.2
> >
> >
>
>
Willem de Bruijn Jan. 28, 2025, 2:45 p.m. UTC | #3
Yan Zhai wrote:
> Hi Willem,
> 
> Thanks for getting back to me.
> 
> On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Yan Zhai wrote:
> > > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > > for small packets. But the kernel currently dismisses GSO requests only
> > > after checking MTU on gso_size. This means any packets, regardless of
> > > their payload sizes, would be dropped when MTU is smaller than requested
> > > gso_size.
> >
> > Is this a realistic concern? How did you encounter this in practice.
> >
> > It *is* a misconfiguration to configure a gso_size larger than MTU.
> >
> > > Meanwhile, EINVAL would be returned in this case, making it
> > > very misleading to debug.
> >
> > Misleading is subjective. I'm not sure what is misleading here. From
> > my above comment, I believe this is correctly EINVAL.
> >
> > That said, if this impacts a real workload we could reconsider
> > relaxing the check. I.e., allowing through packets even when an
> > application has clearly misconfigured UDP_SEGMENT.
> >
> We did encounter a painful reliability issue in production last month.
> 
> To simplify the scenario, we had these symptoms when the issue occurred:
> 1. QUIC connections to host A started to fail, and cannot establish new ones
> 2. User space Wireguard to the exact same host worked 100% fine
> 
> This happened rarely, like one or twice a day, lasting for a few
> minutes usually, but it was quite visible since it is an office
> network.
> 
> Initially this prompted something wrong at the protocol layer. But
> after multiple rounds of digging, we finally figured the root cause
> was:
> 3. Something sometimes pings host B, which shares the same IP with
> host A but different ports (thanks to limited IPv4 space), and its
> PMTU was reduced to 1280 occasionally. This unexpectedly affected all
> traffic to that IP including traffic toward host A. Our QUIC client
> set gso_size to 1350, and that's why it got hit.
> 
> I agree that configurations do matter a lot here. Given how broken the
> PMTU was for the Internet, we might just turn off pmtudisc option on
> our end to avoid this failure path. But for those who hasn't yet, this
> could still be confusing if it ever happens, because nothing seems to
> point to PMTU in the first place:
> * small packets also get dropped
> * error code was EINVAL from sendmsg
> 
> That said, I probably should have used PMTU in my commit message to be
> more clear for our problem. But meanwhile I am also concerned about
> newly added tunnels to trigger the same issue, even if it has a static
> device MTU. My proposal should make the error reason more clear:
> EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
> packets getting dropped would have a similar effect.

Thanks for that context. Makes sense that this is a real issue.

One issue is that with segmentation, the initial mtu checks are
skipped, so they have to be enforced later. In __ip_append_data:

    mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;

Also, might this make the debugging actually harder, as the
error condition is now triggered intermittently.
Yan Zhai Jan. 29, 2025, 4:31 a.m. UTC | #4
On Tue, Jan 28, 2025 at 8:45 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > Hi Willem,
> >
> > Thanks for getting back to me.
> >
> > On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Yan Zhai wrote:
> > > > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > > > for small packets. But the kernel currently dismisses GSO requests only
> > > > after checking MTU on gso_size. This means any packets, regardless of
> > > > their payload sizes, would be dropped when MTU is smaller than requested
> > > > gso_size.
> > >
> > > Is this a realistic concern? How did you encounter this in practice.
> > >
> > > It *is* a misconfiguration to configure a gso_size larger than MTU.
> > >
> > > > Meanwhile, EINVAL would be returned in this case, making it
> > > > very misleading to debug.
> > >
> > > Misleading is subjective. I'm not sure what is misleading here. From
> > > my above comment, I believe this is correctly EINVAL.
> > >
> > > That said, if this impacts a real workload we could reconsider
> > > relaxing the check. I.e., allowing through packets even when an
> > > application has clearly misconfigured UDP_SEGMENT.
> > >
> > We did encounter a painful reliability issue in production last month.
> >
> > To simplify the scenario, we had these symptoms when the issue occurred:
> > 1. QUIC connections to host A started to fail, and cannot establish new ones
> > 2. User space Wireguard to the exact same host worked 100% fine
> >
> > This happened rarely, like one or twice a day, lasting for a few
> > minutes usually, but it was quite visible since it is an office
> > network.
> >
> > Initially this prompted something wrong at the protocol layer. But
> > after multiple rounds of digging, we finally figured the root cause
> > was:
> > 3. Something sometimes pings host B, which shares the same IP with
> > host A but different ports (thanks to limited IPv4 space), and its
> > PMTU was reduced to 1280 occasionally. This unexpectedly affected all
> > traffic to that IP including traffic toward host A. Our QUIC client
> > set gso_size to 1350, and that's why it got hit.
> >
> > I agree that configurations do matter a lot here. Given how broken the
> > PMTU was for the Internet, we might just turn off pmtudisc option on
> > our end to avoid this failure path. But for those who hasn't yet, this
> > could still be confusing if it ever happens, because nothing seems to
> > point to PMTU in the first place:
> > * small packets also get dropped
> > * error code was EINVAL from sendmsg
> >
> > That said, I probably should have used PMTU in my commit message to be
> > more clear for our problem. But meanwhile I am also concerned about
> > newly added tunnels to trigger the same issue, even if it has a static
> > device MTU. My proposal should make the error reason more clear:
> > EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
> > packets getting dropped would have a similar effect.
>
> Thanks for that context. Makes sense that this is a real issue.
>
> One issue is that with segmentation, the initial mtu checks are
> skipped, so they have to be enforced later. In __ip_append_data:
>
>     mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
>
You are right, if packet sizes are between (PMTU, gso_size), then they
should still be dropped. But instead of checking explicitly in
udp_send_skb, maybe we can leave them to be dropped in
ip_finish_output? This way there is no need to add an extra branch for
non GSO code paths. PMTU shrinking should be rare, so the overhead
should be minimal.

> Also, might this make the debugging actually harder, as the
> error condition is now triggered intermittently.
Yes sendmsg may only return errors for a portion of packets now under
the same situation. But IMHO it's not trading debugging for
reliability. Consistent error is good news for engineers to reproduce
locally, but in production I find people (SREs, solution and
escalation engineers) rely on pcaps and errno a lot. The pattern in
pcaps (lack of large packets of certain sizes, since they are dropped
before dev_queue_xmit), and exact error reasons like EMSGSIZE are both
good indicators for root causes. EINVAL is more generic on the other
hand. For example, I remembered we had another issue on UDP sendmsg,
which also returned a bunch of EINVAL. But that was due to some
attacker tricking us to reply with source port 0.

thanks
Yan
Willem de Bruijn Jan. 29, 2025, 2:08 p.m. UTC | #5
Yan Zhai wrote:
> On Tue, Jan 28, 2025 at 8:45 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Yan Zhai wrote:
> > > Hi Willem,
> > >
> > > Thanks for getting back to me.
> > >
> > > On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Yan Zhai wrote:
> > > > > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > > > > for small packets. But the kernel currently dismisses GSO requests only
> > > > > after checking MTU on gso_size. This means any packets, regardless of
> > > > > their payload sizes, would be dropped when MTU is smaller than requested
> > > > > gso_size.
> > > >
> > > > Is this a realistic concern? How did you encounter this in practice.
> > > >
> > > > It *is* a misconfiguration to configure a gso_size larger than MTU.
> > > >
> > > > > Meanwhile, EINVAL would be returned in this case, making it
> > > > > very misleading to debug.
> > > >
> > > > Misleading is subjective. I'm not sure what is misleading here. From
> > > > my above comment, I believe this is correctly EINVAL.
> > > >
> > > > That said, if this impacts a real workload we could reconsider
> > > > relaxing the check. I.e., allowing through packets even when an
> > > > application has clearly misconfigured UDP_SEGMENT.
> > > >
> > > We did encounter a painful reliability issue in production last month.
> > >
> > > To simplify the scenario, we had these symptoms when the issue occurred:
> > > 1. QUIC connections to host A started to fail, and cannot establish new ones
> > > 2. User space Wireguard to the exact same host worked 100% fine
> > >
> > > This happened rarely, like one or twice a day, lasting for a few
> > > minutes usually, but it was quite visible since it is an office
> > > network.
> > >
> > > Initially this prompted something wrong at the protocol layer. But
> > > after multiple rounds of digging, we finally figured the root cause
> > > was:
> > > 3. Something sometimes pings host B, which shares the same IP with
> > > host A but different ports (thanks to limited IPv4 space), and its
> > > PMTU was reduced to 1280 occasionally. This unexpectedly affected all
> > > traffic to that IP including traffic toward host A. Our QUIC client
> > > set gso_size to 1350, and that's why it got hit.
> > >
> > > I agree that configurations do matter a lot here. Given how broken the
> > > PMTU was for the Internet, we might just turn off pmtudisc option on
> > > our end to avoid this failure path. But for those who hasn't yet, this
> > > could still be confusing if it ever happens, because nothing seems to
> > > point to PMTU in the first place:
> > > * small packets also get dropped
> > > * error code was EINVAL from sendmsg
> > >
> > > That said, I probably should have used PMTU in my commit message to be
> > > more clear for our problem. But meanwhile I am also concerned about
> > > newly added tunnels to trigger the same issue, even if it has a static
> > > device MTU. My proposal should make the error reason more clear:
> > > EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
> > > packets getting dropped would have a similar effect.
> >
> > Thanks for that context. Makes sense that this is a real issue.
> >
> > One issue is that with segmentation, the initial mtu checks are
> > skipped, so they have to be enforced later. In __ip_append_data:
> >
> >     mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
> >
> You are right, if packet sizes are between (PMTU, gso_size), then they
> should still be dropped. But instead of checking explicitly in
> udp_send_skb, maybe we can leave them to be dropped in
> ip_finish_output?

Not sure how to do this, or whether it will be simpler than having all
the UDP GSO checks in udp_send_skb.

For a "don't add cost to the hot path" point of view, it's actually
best to keep all these checks in one place only when UDP_SEGMENT is
negotiated (where the hot path is the common case without GSO).

> This way there is no need to add an extra branch for
> non GSO code paths. PMTU shrinking should be rare, so the overhead
> should be minimal.
> 
> > Also, might this make the debugging actually harder, as the
> > error condition is now triggered intermittently.
> Yes sendmsg may only return errors for a portion of packets now under
> the same situation. But IMHO it's not trading debugging for
> reliability. Consistent error is good news for engineers to reproduce
> locally, but in production I find people (SREs, solution and
> escalation engineers) rely on pcaps and errno a lot. The pattern in
> pcaps (lack of large packets of certain sizes, since they are dropped
> before dev_queue_xmit), and exact error reasons like EMSGSIZE are both
> good indicators for root causes. EINVAL is more generic on the other
> hand. For example, I remembered we had another issue on UDP sendmsg,
> which also returned a bunch of EINVAL. But that was due to some
> attacker tricking us to reply with source port 0.

Relying on error code is fraught anyway. For online analysis (which
I think can be assumed when pcap is mentioned), function tracing and
bpf trace are much more powerful.

That said, no objections to returning EMSGSIZE instead of EINVAL. That
is the same UDP returns when sending a single datagram that exceeds
MTU, after all.
Yan Zhai Jan. 29, 2025, 4:48 p.m. UTC | #6
On Wed, Jan 29, 2025 at 8:08 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> Yan Zhai wrote:
> > On Tue, Jan 28, 2025 at 8:45 AM Willem de Bruijn
> > <willemdebruijn.kernel@gmail.com> wrote:
> > >
> > > Yan Zhai wrote:
> > > > Hi Willem,
> > > >
> > > > Thanks for getting back to me.
> > > >
> > > > On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
> > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > >
> > > > > Yan Zhai wrote:
> > > > > > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > > > > > for small packets. But the kernel currently dismisses GSO requests only
> > > > > > after checking MTU on gso_size. This means any packets, regardless of
> > > > > > their payload sizes, would be dropped when MTU is smaller than requested
> > > > > > gso_size.
> > > > >
> > > > > Is this a realistic concern? How did you encounter this in practice.
> > > > >
> > > > > It *is* a misconfiguration to configure a gso_size larger than MTU.
> > > > >
> > > > > > Meanwhile, EINVAL would be returned in this case, making it
> > > > > > very misleading to debug.
> > > > >
> > > > > Misleading is subjective. I'm not sure what is misleading here. From
> > > > > my above comment, I believe this is correctly EINVAL.
> > > > >
> > > > > That said, if this impacts a real workload we could reconsider
> > > > > relaxing the check. I.e., allowing through packets even when an
> > > > > application has clearly misconfigured UDP_SEGMENT.
> > > > >
> > > > We did encounter a painful reliability issue in production last month.
> > > >
> > > > To simplify the scenario, we had these symptoms when the issue occurred:
> > > > 1. QUIC connections to host A started to fail, and cannot establish new ones
> > > > 2. User space Wireguard to the exact same host worked 100% fine
> > > >
> > > > This happened rarely, like one or twice a day, lasting for a few
> > > > minutes usually, but it was quite visible since it is an office
> > > > network.
> > > >
> > > > Initially this prompted something wrong at the protocol layer. But
> > > > after multiple rounds of digging, we finally figured the root cause
> > > > was:
> > > > 3. Something sometimes pings host B, which shares the same IP with
> > > > host A but different ports (thanks to limited IPv4 space), and its
> > > > PMTU was reduced to 1280 occasionally. This unexpectedly affected all
> > > > traffic to that IP including traffic toward host A. Our QUIC client
> > > > set gso_size to 1350, and that's why it got hit.
> > > >
> > > > I agree that configurations do matter a lot here. Given how broken the
> > > > PMTU was for the Internet, we might just turn off pmtudisc option on
> > > > our end to avoid this failure path. But for those who hasn't yet, this
> > > > could still be confusing if it ever happens, because nothing seems to
> > > > point to PMTU in the first place:
> > > > * small packets also get dropped
> > > > * error code was EINVAL from sendmsg
> > > >
> > > > That said, I probably should have used PMTU in my commit message to be
> > > > more clear for our problem. But meanwhile I am also concerned about
> > > > newly added tunnels to trigger the same issue, even if it has a static
> > > > device MTU. My proposal should make the error reason more clear:
> > > > EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
> > > > packets getting dropped would have a similar effect.
> > >
> > > Thanks for that context. Makes sense that this is a real issue.
> > >
> > > One issue is that with segmentation, the initial mtu checks are
> > > skipped, so they have to be enforced later. In __ip_append_data:
> > >
> > >     mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
> > >
> > You are right, if packet sizes are between (PMTU, gso_size), then they
> > should still be dropped. But instead of checking explicitly in
> > udp_send_skb, maybe we can leave them to be dropped in
> > ip_finish_output?
>
> Not sure how to do this, or whether it will be simpler than having all
> the UDP GSO checks in udp_send_skb.
>
> For a "don't add cost to the hot path" point of view, it's actually
> best to keep all these checks in one place only when UDP_SEGMENT is
> negotiated (where the hot path is the common case without GSO).
>
I mean ip_finish_output is already dropping packets with length larger
than dst MTU. But I guess it doesn't hurt to check it also in GSO
branch. Let me send a V2 later to address it.

> > This way there is no need to add an extra branch for
> > non GSO code paths. PMTU shrinking should be rare, so the overhead
> > should be minimal.
> >
> > > Also, might this make the debugging actually harder, as the
> > > error condition is now triggered intermittently.
> > Yes sendmsg may only return errors for a portion of packets now under
> > the same situation. But IMHO it's not trading debugging for
> > reliability. Consistent error is good news for engineers to reproduce
> > locally, but in production I find people (SREs, solution and
> > escalation engineers) rely on pcaps and errno a lot. The pattern in
> > pcaps (lack of large packets of certain sizes, since they are dropped
> > before dev_queue_xmit), and exact error reasons like EMSGSIZE are both
> > good indicators for root causes. EINVAL is more generic on the other
> > hand. For example, I remembered we had another issue on UDP sendmsg,
> > which also returned a bunch of EINVAL. But that was due to some
> > attacker tricking us to reply with source port 0.
>
> Relying on error code is fraught anyway. For online analysis (which
> I think can be assumed when pcap is mentioned), function tracing and
> bpf trace are much more powerful.
>
Totally agree tracing is more powerful. Time by time we see issues
lingering for a few months get addressed in a few days or even hours
when tracing is plugged in. Unfortunately at least for us, the number
of people who can trace properly is far behind the volume of problems.
I can only hope in the future more people will recognize this as a
golden skill, in addition to current standard skills like pcap
analysis.

Yan

> That said, no objections to returning EMSGSIZE instead of EINVAL. That
> is the same UDP returns when sending a single datagram that exceeds
> MTU, after all.
>
Yan Zhai Jan. 30, 2025, 7:58 a.m. UTC | #7
v2 link here: https://lore.kernel.org/netdev/Z5swit7ykNRbJFMS@debian.debian/T/#u

On Wed, Jan 29, 2025 at 10:48 AM Yan Zhai <yan@cloudflare.com> wrote:
>
> On Wed, Jan 29, 2025 at 8:08 AM Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
> >
> > Yan Zhai wrote:
> > > On Tue, Jan 28, 2025 at 8:45 AM Willem de Bruijn
> > > <willemdebruijn.kernel@gmail.com> wrote:
> > > >
> > > > Yan Zhai wrote:
> > > > > Hi Willem,
> > > > >
> > > > > Thanks for getting back to me.
> > > > >
> > > > > On Mon, Jan 27, 2025 at 8:33 AM Willem de Bruijn
> > > > > <willemdebruijn.kernel@gmail.com> wrote:
> > > > > >
> > > > > > Yan Zhai wrote:
> > > > > > > Commit 4094871db1d6 ("udp: only do GSO if # of segs > 1") avoided GSO
> > > > > > > for small packets. But the kernel currently dismisses GSO requests only
> > > > > > > after checking MTU on gso_size. This means any packets, regardless of
> > > > > > > their payload sizes, would be dropped when MTU is smaller than requested
> > > > > > > gso_size.
> > > > > >
> > > > > > Is this a realistic concern? How did you encounter this in practice.
> > > > > >
> > > > > > It *is* a misconfiguration to configure a gso_size larger than MTU.
> > > > > >
> > > > > > > Meanwhile, EINVAL would be returned in this case, making it
> > > > > > > very misleading to debug.
> > > > > >
> > > > > > Misleading is subjective. I'm not sure what is misleading here. From
> > > > > > my above comment, I believe this is correctly EINVAL.
> > > > > >
> > > > > > That said, if this impacts a real workload we could reconsider
> > > > > > relaxing the check. I.e., allowing through packets even when an
> > > > > > application has clearly misconfigured UDP_SEGMENT.
> > > > > >
> > > > > We did encounter a painful reliability issue in production last month.
> > > > >
> > > > > To simplify the scenario, we had these symptoms when the issue occurred:
> > > > > 1. QUIC connections to host A started to fail, and cannot establish new ones
> > > > > 2. User space Wireguard to the exact same host worked 100% fine
> > > > >
> > > > > This happened rarely, like one or twice a day, lasting for a few
> > > > > minutes usually, but it was quite visible since it is an office
> > > > > network.
> > > > >
> > > > > Initially this prompted something wrong at the protocol layer. But
> > > > > after multiple rounds of digging, we finally figured the root cause
> > > > > was:
> > > > > 3. Something sometimes pings host B, which shares the same IP with
> > > > > host A but different ports (thanks to limited IPv4 space), and its
> > > > > PMTU was reduced to 1280 occasionally. This unexpectedly affected all
> > > > > traffic to that IP including traffic toward host A. Our QUIC client
> > > > > set gso_size to 1350, and that's why it got hit.
> > > > >
> > > > > I agree that configurations do matter a lot here. Given how broken the
> > > > > PMTU was for the Internet, we might just turn off pmtudisc option on
> > > > > our end to avoid this failure path. But for those who hasn't yet, this
> > > > > could still be confusing if it ever happens, because nothing seems to
> > > > > point to PMTU in the first place:
> > > > > * small packets also get dropped
> > > > > * error code was EINVAL from sendmsg
> > > > >
> > > > > That said, I probably should have used PMTU in my commit message to be
> > > > > more clear for our problem. But meanwhile I am also concerned about
> > > > > newly added tunnels to trigger the same issue, even if it has a static
> > > > > device MTU. My proposal should make the error reason more clear:
> > > > > EMSGSIZE itself is a direct signal pointing to MTU/PMTU. Larger
> > > > > packets getting dropped would have a similar effect.
> > > >
> > > > Thanks for that context. Makes sense that this is a real issue.
> > > >
> > > > One issue is that with segmentation, the initial mtu checks are
> > > > skipped, so they have to be enforced later. In __ip_append_data:
> > > >
> > > >     mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
> > > >
> > > You are right, if packet sizes are between (PMTU, gso_size), then they
> > > should still be dropped. But instead of checking explicitly in
> > > udp_send_skb, maybe we can leave them to be dropped in
> > > ip_finish_output?
> >
> > Not sure how to do this, or whether it will be simpler than having all
> > the UDP GSO checks in udp_send_skb.
> >
> > For a "don't add cost to the hot path" point of view, it's actually
> > best to keep all these checks in one place only when UDP_SEGMENT is
> > negotiated (where the hot path is the common case without GSO).
> >
> I mean ip_finish_output is already dropping packets with length larger
> than dst MTU. But I guess it doesn't hurt to check it also in GSO
> branch. Let me send a V2 later to address it.
>
> > > This way there is no need to add an extra branch for
> > > non GSO code paths. PMTU shrinking should be rare, so the overhead
> > > should be minimal.
> > >
> > > > Also, might this make the debugging actually harder, as the
> > > > error condition is now triggered intermittently.
> > > Yes sendmsg may only return errors for a portion of packets now under
> > > the same situation. But IMHO it's not trading debugging for
> > > reliability. Consistent error is good news for engineers to reproduce
> > > locally, but in production I find people (SREs, solution and
> > > escalation engineers) rely on pcaps and errno a lot. The pattern in
> > > pcaps (lack of large packets of certain sizes, since they are dropped
> > > before dev_queue_xmit), and exact error reasons like EMSGSIZE are both
> > > good indicators for root causes. EINVAL is more generic on the other
> > > hand. For example, I remembered we had another issue on UDP sendmsg,
> > > which also returned a bunch of EINVAL. But that was due to some
> > > attacker tricking us to reply with source port 0.
> >
> > Relying on error code is fraught anyway. For online analysis (which
> > I think can be assumed when pcap is mentioned), function tracing and
> > bpf trace are much more powerful.
> >
> Totally agree tracing is more powerful. Time by time we see issues
> lingering for a few months get addressed in a few days or even hours
> when tracing is plugged in. Unfortunately at least for us, the number
> of people who can trace properly is far behind the volume of problems.
> I can only hope in the future more people will recognize this as a
> golden skill, in addition to current standard skills like pcap
> analysis.
>
> Yan
>
> > That said, no objections to returning EMSGSIZE instead of EINVAL. That
> > is the same UDP returns when sending a single datagram that exceeds
> > MTU, after all.
> >
diff mbox series

Patch

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c472c9a57cf6..9aed1b4a871f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1137,13 +1137,13 @@  static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 	uh->len = htons(len);
 	uh->check = 0;
 
-	if (cork->gso_size) {
+	if (cork->gso_size && datalen > cork->gso_size) {
 		const int hlen = skb_network_header_len(skb) +
 				 sizeof(struct udphdr);
 
 		if (hlen + cork->gso_size > cork->fragsize) {
 			kfree_skb(skb);
-			return -EINVAL;
+			return -EMSGSIZE;
 		}
 		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
 			kfree_skb(skb);
@@ -1158,15 +1158,13 @@  static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
 			return -EIO;
 		}
 
-		if (datalen > cork->gso_size) {
-			skb_shinfo(skb)->gso_size = cork->gso_size;
-			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
-			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
-								 cork->gso_size);
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+							 cork->gso_size);
 
-			/* Don't checksum the payload, skb will get segmented */
-			goto csum_partial;
-		}
+		/* Don't checksum the payload, skb will get segmented */
+		goto csum_partial;
 	}
 
 	if (is_udplite)  				 /*     UDP-Lite      */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6671daa67f4f..6cdc8ce4c6f9 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1385,13 +1385,13 @@  static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 	uh->len = htons(len);
 	uh->check = 0;
 
-	if (cork->gso_size) {
+	if (cork->gso_size && datalen > cork->gso_size) {
 		const int hlen = skb_network_header_len(skb) +
 				 sizeof(struct udphdr);
 
 		if (hlen + cork->gso_size > cork->fragsize) {
 			kfree_skb(skb);
-			return -EINVAL;
+			return -EMSGSIZE;
 		}
 		if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
 			kfree_skb(skb);
@@ -1406,15 +1406,13 @@  static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
 			return -EIO;
 		}
 
-		if (datalen > cork->gso_size) {
-			skb_shinfo(skb)->gso_size = cork->gso_size;
-			skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
-			skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
-								 cork->gso_size);
+		skb_shinfo(skb)->gso_size = cork->gso_size;
+		skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+							 cork->gso_size);
 
-			/* Don't checksum the payload, skb will get segmented */
-			goto csum_partial;
-		}
+		/* Don't checksum the payload, skb will get segmented */
+		goto csum_partial;
 	}
 
 	if (is_udplite)
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 3f2fca02fec5..fb73f1c331fb 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -102,6 +102,13 @@  struct testcase testcases_v4[] = {
 		.gso_len = CONST_MSS_V4,
 		.r_num_mss = 1,
 	},
+	{
+		/* datalen <= MSS < gso_len: will fall back to no GSO */
+		.tlen = CONST_MSS_V4,
+		.gso_len = CONST_MSS_V4 + 1,
+		.r_num_mss = 0,
+		.r_len_last = CONST_MSS_V4,
+	},
 	{
 		/* send a single MSS + 1B */
 		.tlen = CONST_MSS_V4 + 1,
@@ -205,6 +212,13 @@  struct testcase testcases_v6[] = {
 		.gso_len = CONST_MSS_V6,
 		.r_num_mss = 1,
 	},
+	{
+		/* datalen <= MSS < gso_len: will fall back to no GSO */
+		.tlen = CONST_MSS_V6,
+		.gso_len = CONST_MSS_V6 + 1,
+		.r_num_mss = 0,
+		.r_len_last = CONST_MSS_V6,
+	},
 	{
 		/* send a single MSS + 1B */
 		.tlen = CONST_MSS_V6 + 1,