diff mbox series

[net-next,08/15] ipv6: Add hop-by-hop header to jumbograms in ip6_output

Message ID 20220203015140.3022854-9-eric.dumazet@gmail.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series tcp: BIG TCP implementation | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2123 this patch: 2123
netdev/cc_maintainers warning 2 maintainers not CCed: dsahern@kernel.org yoshfuji@linux-ipv6.org
netdev/build_clang success Errors and warnings before: 313 this patch: 313
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 2249 this patch: 2249
netdev/checkpatch warning WARNING: 'wont' may be misspelled - perhaps 'won't'?
netdev/kdoc success Errors and warnings before: 37 this patch: 37
netdev/source_inline success Was 0 now: 0

Commit Message

Eric Dumazet Feb. 3, 2022, 1:51 a.m. UTC
From: Coco Li <lixiaoyan@google.com>

Instead of simply forcing a 0 payload_len in IPv6 header,
implement RFC 2675 and insert a custom extension header.

Note that only TCP stack is currently potentially generating
jumbograms, and that this extension header is purely local,
it wont be sent on a physical link.

This is needed so that packet capture (tcpdump and friends)
can properly dissect these large packets.

Signed-off-by: Coco Li <lixiaoyan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/linux/ipv6.h  |  1 +
 net/ipv6/ip6_output.c | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

Comments

Paolo Abeni Feb. 3, 2022, 9:07 a.m. UTC | #1
On Wed, 2022-02-02 at 17:51 -0800, Eric Dumazet wrote:
> From: Coco Li <lixiaoyan@google.com>
> 
> Instead of simply forcing a 0 payload_len in IPv6 header,
> implement RFC 2675 and insert a custom extension header.
> 
> Note that only TCP stack is currently potentially generating
> jumbograms, and that this extension header is purely local,
> it wont be sent on a physical link.
> 
> This is needed so that packet capture (tcpdump and friends)
> can properly dissect these large packets.
> 
> Signed-off-by: Coco Li <lixiaoyan@google.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
>  include/linux/ipv6.h  |  1 +
>  net/ipv6/ip6_output.c | 22 ++++++++++++++++++++--
>  2 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
> index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644
> --- a/include/linux/ipv6.h
> +++ b/include/linux/ipv6.h
> @@ -144,6 +144,7 @@ struct inet6_skb_parm {
>  #define IP6SKB_L3SLAVE         64
>  #define IP6SKB_JUMBOGRAM      128
>  #define IP6SKB_SEG6	      256
> +#define IP6SKB_FAKEJUMBO      512
>  };
>  
>  #if defined(CONFIG_NET_L3_MASTER_DEV)
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -180,7 +180,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
>  #endif
>  
>  	mtu = ip6_skb_dst_mtu(skb);
> -	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
> +	if (skb_is_gso(skb) &&
> +	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
> +	    !skb_gso_validate_network_len(skb, mtu))
>  		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

If I read correctly jumbogram with gso len not fitting the egress
device MTU will not be fragmented, as opposed to plain old GSO packets.
Am I correct? why fragmentation is not needed for jumbogram?

Thanks!

Paolo
Eric Dumazet Feb. 3, 2022, 4:31 p.m. UTC | #2
On Thu, Feb 3, 2022 at 1:07 AM Paolo Abeni <pabeni@redhat.com> wrote:
>
> On Wed, 2022-02-02 at 17:51 -0800, Eric Dumazet wrote:
> > From: Coco Li <lixiaoyan@google.com>
> >
> > Instead of simply forcing a 0 payload_len in IPv6 header,
> > implement RFC 2675 and insert a custom extension header.
> >
> > Note that only TCP stack is currently potentially generating
> > jumbograms, and that this extension header is purely local,
> > it wont be sent on a physical link.
> >
> > This is needed so that packet capture (tcpdump and friends)
> > can properly dissect these large packets.
> >
> > Signed-off-by: Coco Li <lixiaoyan@google.com>
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> >  include/linux/ipv6.h  |  1 +
> >  net/ipv6/ip6_output.c | 22 ++++++++++++++++++++--
> >  2 files changed, 21 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
> > index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644
> > --- a/include/linux/ipv6.h
> > +++ b/include/linux/ipv6.h
> > @@ -144,6 +144,7 @@ struct inet6_skb_parm {
> >  #define IP6SKB_L3SLAVE         64
> >  #define IP6SKB_JUMBOGRAM      128
> >  #define IP6SKB_SEG6        256
> > +#define IP6SKB_FAKEJUMBO      512
> >  };
> >
> >  #if defined(CONFIG_NET_L3_MASTER_DEV)
> > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> > index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644
> > --- a/net/ipv6/ip6_output.c
> > +++ b/net/ipv6/ip6_output.c
> > @@ -180,7 +180,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
> >  #endif
> >
> >       mtu = ip6_skb_dst_mtu(skb);
> > -     if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
> > +     if (skb_is_gso(skb) &&
> > +         !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
> > +         !skb_gso_validate_network_len(skb, mtu))
> >               return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
>
> If I read correctly jumbogram with gso len not fitting the egress
> device MTU will not be fragmented, as opposed to plain old GSO packets.
> Am I correct? why fragmentation is not needed for jumbogram?

I guess we could add this validation in place.

Honestly, we do not expect BIG TCP being deployed in hostile
environments (host having devices with different MTU)

Fragmentation is evil and should be avoided at all costs.
diff mbox series

Patch

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -144,6 +144,7 @@  struct inet6_skb_parm {
 #define IP6SKB_L3SLAVE         64
 #define IP6SKB_JUMBOGRAM      128
 #define IP6SKB_SEG6	      256
+#define IP6SKB_FAKEJUMBO      512
 };
 
 #if defined(CONFIG_NET_L3_MASTER_DEV)
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -180,7 +180,9 @@  static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
 #endif
 
 	mtu = ip6_skb_dst_mtu(skb);
-	if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))
+	if (skb_is_gso(skb) &&
+	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
+	    !skb_gso_validate_network_len(skb, mtu))
 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
 
 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
@@ -251,6 +253,8 @@  int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
 	struct inet6_dev *idev = ip6_dst_idev(dst);
+	struct hop_jumbo_hdr *hop_jumbo;
+	int hoplen = sizeof(*hop_jumbo);
 	unsigned int head_room;
 	struct ipv6hdr *hdr;
 	u8  proto = fl6->flowi6_proto;
@@ -258,7 +262,7 @@  int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	int hlimit = -1;
 	u32 mtu;
 
-	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
+	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
 	if (opt)
 		head_room += opt->opt_nflen + opt->opt_flen;
 
@@ -281,6 +285,20 @@  int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 					     &fl6->saddr);
 	}
 
+	if (unlikely(seg_len > IPV6_MAXPLEN)) {
+		hop_jumbo = skb_push(skb, hoplen);
+
+		hop_jumbo->nexthdr = proto;
+		hop_jumbo->hdrlen = 0;
+		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+		hop_jumbo->tlv_len = 4;
+		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
+
+		proto = IPPROTO_HOPOPTS;
+		seg_len = 0;
+		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
+	}
+
 	skb_push(skb, sizeof(struct ipv6hdr));
 	skb_reset_network_header(skb);
 	hdr = ipv6_hdr(skb);