Message ID | 20220203015140.3022854-9-eric.dumazet@gmail.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | tcp: BIG TCP implementation | expand |
On Wed, 2022-02-02 at 17:51 -0800, Eric Dumazet wrote: > From: Coco Li <lixiaoyan@google.com> > > Instead of simply forcing a 0 payload_len in IPv6 header, > implement RFC 2675 and insert a custom extension header. > > Note that only TCP stack is currently potentially generating > jumbograms, and that this extension header is purely local, > it wont be sent on a physical link. > > This is needed so that packet capture (tcpdump and friends) > can properly dissect these large packets. > > Signed-off-by: Coco Li <lixiaoyan@google.com> > Signed-off-by: Eric Dumazet <edumazet@google.com> > --- > include/linux/ipv6.h | 1 + > net/ipv6/ip6_output.c | 22 ++++++++++++++++++++-- > 2 files changed, 21 insertions(+), 2 deletions(-) > > diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h > index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644 > --- a/include/linux/ipv6.h > +++ b/include/linux/ipv6.h > @@ -144,6 +144,7 @@ struct inet6_skb_parm { > #define IP6SKB_L3SLAVE 64 > #define IP6SKB_JUMBOGRAM 128 > #define IP6SKB_SEG6 256 > +#define IP6SKB_FAKEJUMBO 512 > }; > > #if defined(CONFIG_NET_L3_MASTER_DEV) > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c > index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644 > --- a/net/ipv6/ip6_output.c > +++ b/net/ipv6/ip6_output.c > @@ -180,7 +180,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff > #endif > > mtu = ip6_skb_dst_mtu(skb); > - if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) > + if (skb_is_gso(skb) && > + !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && > + !skb_gso_validate_network_len(skb, mtu)) > return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); If I read correctly jumbogram with gso len not fitting the egress device MTU will not be fragmented, as opposed to plain old GSO packets. Am I correct? why fragmentation is not needed for jumbogram? Thanks! Paolo
On Thu, Feb 3, 2022 at 1:07 AM Paolo Abeni <pabeni@redhat.com> wrote: > > On Wed, 2022-02-02 at 17:51 -0800, Eric Dumazet wrote: > > From: Coco Li <lixiaoyan@google.com> > > > > Instead of simply forcing a 0 payload_len in IPv6 header, > > implement RFC 2675 and insert a custom extension header. > > > > Note that only TCP stack is currently potentially generating > > jumbograms, and that this extension header is purely local, > > it wont be sent on a physical link. > > > > This is needed so that packet capture (tcpdump and friends) > > can properly dissect these large packets. > > > > Signed-off-by: Coco Li <lixiaoyan@google.com> > > Signed-off-by: Eric Dumazet <edumazet@google.com> > > --- > > include/linux/ipv6.h | 1 + > > net/ipv6/ip6_output.c | 22 ++++++++++++++++++++-- > > 2 files changed, 21 insertions(+), 2 deletions(-) > > > > diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h > > index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644 > > --- a/include/linux/ipv6.h > > +++ b/include/linux/ipv6.h > > @@ -144,6 +144,7 @@ struct inet6_skb_parm { > > #define IP6SKB_L3SLAVE 64 > > #define IP6SKB_JUMBOGRAM 128 > > #define IP6SKB_SEG6 256 > > +#define IP6SKB_FAKEJUMBO 512 > > }; > > > > #if defined(CONFIG_NET_L3_MASTER_DEV) > > diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c > > index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644 > > --- a/net/ipv6/ip6_output.c > > +++ b/net/ipv6/ip6_output.c > > @@ -180,7 +180,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff > > #endif > > > > mtu = ip6_skb_dst_mtu(skb); > > - if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) > > + if (skb_is_gso(skb) && > > + !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && > > + !skb_gso_validate_network_len(skb, mtu)) > > return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); > > If I read correctly jumbogram with gso len not fitting the 
egress > device MTU will not be fragmented, as opposed to plain old GSO packets. > Am I correct? why fragmentation is not needed for jumbogram? I guess we could add this validation in place. Honestly, we do not expect BIG TCP to be deployed in hostile environments (hosts having devices with different MTUs). Fragmentation is evil and should be avoided at all costs.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1e0f8a31f3de175659dca9ecee9f97d8b01e2b68..d3fb87e1589997570cde9cb5d92b2222008a229d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -144,6 +144,7 @@ struct inet6_skb_parm { #define IP6SKB_L3SLAVE 64 #define IP6SKB_JUMBOGRAM 128 #define IP6SKB_SEG6 256 +#define IP6SKB_FAKEJUMBO 512 }; #if defined(CONFIG_NET_L3_MASTER_DEV) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0c6c971ce0a58b50f8a9349b8507dffac9c7818c..f78ba145620560e5d7cb25aaf16fec61ddd9ed40 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -180,7 +180,9 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff #endif mtu = ip6_skb_dst_mtu(skb); - if (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu)) + if (skb_is_gso(skb) && + !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && + !skb_gso_validate_network_len(skb, mtu)) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); if ((skb->len > mtu && !skb_is_gso(skb)) || @@ -251,6 +253,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; struct inet6_dev *idev = ip6_dst_idev(dst); + struct hop_jumbo_hdr *hop_jumbo; + int hoplen = sizeof(*hop_jumbo); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; @@ -258,7 +262,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int hlimit = -1; u32 mtu; - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); + head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; @@ -281,6 +285,20 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, &fl6->saddr); } + if (unlikely(seg_len > IPV6_MAXPLEN)) { + hop_jumbo = skb_push(skb, hoplen); + + hop_jumbo->nexthdr = proto; + hop_jumbo->hdrlen = 0; + hop_jumbo->tlv_type = IPV6_TLV_JUMBO; + 
hop_jumbo->tlv_len = 4; + hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); + + proto = IPPROTO_HOPOPTS; + seg_len = 0; + IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; + } + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb);