Message ID | 798ca80553e73028eeec4be08ba1549d08b2e5fc.1674835106.git.lucien.xin@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: support ipv4 big tcp | expand |
On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote: > > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP. > > Firstly, allow sk->sk_gso_max_size to be set to a value greater than > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size() > for IPv4 TCP sockets. > > Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU > in __ip_local_out() to allow to send BIG TCP packets, and this implies > that skb->len is the length of a IPv4 packet; On RX path, use skb->len > as the length of the IPv4 packet when the IP header tot_len is 0 and > skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only > need to update these APIs. > > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In > GRO complete, set IP header tot_len to 0 when the merged packet size > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed > on RX path. > > Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes > this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP > packets. 
> > Signed-off-by: Xin Long <lucien.xin@gmail.com> > --- > net/core/gro.c | 12 +++++++----- > net/core/sock.c | 8 ++++++-- > net/ipv4/af_inet.c | 7 ++++--- > net/ipv4/ip_input.c | 2 +- > net/ipv4/ip_output.c | 2 +- > 5 files changed, 19 insertions(+), 12 deletions(-) > > diff --git a/net/core/gro.c b/net/core/gro.c > index 506f83d715f8..b15f85546bdd 100644 > --- a/net/core/gro.c > +++ b/net/core/gro.c > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > struct sk_buff *lp; > int segs; > > - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > - gro_max_size = READ_ONCE(p->dev->gro_max_size); > + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ > + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? > + READ_ONCE(p->dev->gro_max_size) : > + READ_ONCE(p->dev->gro_ipv4_max_size); > > if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) > return -E2BIG; > > if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { > - if (p->protocol != htons(ETH_P_IPV6) || > - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || > - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || > + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || > + (p->protocol == htons(ETH_P_IPV6) && > + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || > p->encapsulation) > return -E2BIG; > } > diff --git a/net/core/sock.c b/net/core/sock.c > index 7ba4891460ad..c98f9a4eeff9 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk) > !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) > return; > #endif > + if (sk->sk_family == AF_INET && sk_is_tcp(sk)) > + return; Or simply diff --git a/net/core/sock.c b/net/core/sock.c index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone); static void sk_trim_gso_size(struct sock *sk) { - if (sk->sk_gso_max_size <= 
GSO_LEGACY_MAX_SIZE) + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE || + sk_is_tcp(sk)) return; -#if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6 && - sk_is_tcp(sk) && - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) - return; -#endif sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; } > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; > } > > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) > sk->sk_route_caps &= ~NETIF_F_GSO_MASK; > } else { > sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; > - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ > - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); > + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ > + sk->sk_gso_max_size = sk->sk_family == AF_INET6 ? > + READ_ONCE(dst->dev->gso_max_size) : > + READ_ONCE(dst->dev->gso_ipv4_max_size); > sk_trim_gso_size(sk); > sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); > /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > index 6c0ec2789943..2f992a323b95 100644 > --- a/net/ipv4/af_inet.c > +++ b/net/ipv4/af_inet.c > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) > if (unlikely(ip_fast_csum((u8 *)iph, 5))) > goto out; > > + NAPI_GRO_CB(skb)->proto = proto; > id = ntohl(*(__be32 *)&iph->id); > flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); > id >>= 16; > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) > > int inet_gro_complete(struct sk_buff *skb, int nhoff) > { > - __be16 newlen = htons(skb->len - nhoff); > struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); > const struct net_offload *ops; > + __be16 totlen = iph->tot_len; > int proto = iph->protocol; > int err = -ENOSYS; > > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) > skb_set_inner_network_header(skb, nhoff); > } > > - 
csum_replace2(&iph->check, iph->tot_len, newlen); > - iph->tot_len = newlen; > + iph_set_totlen(iph, skb->len - nhoff); > + csum_replace2(&iph->check, totlen, iph->tot_len); > > ops = rcu_dereference(inet_offloads[proto]); > if (WARN_ON(!ops || !ops->callbacks.gro_complete)) > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c > index e880ce77322a..0aa8c49b4e1b 100644 > --- a/net/ipv4/ip_input.c > +++ b/net/ipv4/ip_input.c > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) > if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) > goto csum_error; > > - len = ntohs(iph->tot_len); > + len = skb_ip_totlen(skb); len = iph_totlen(skb, iph); > if (skb->len < len) { > drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; > __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index 922c87ef1ab5..4e4e308c3230 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) > { > struct iphdr *iph = ip_hdr(skb); > > - iph->tot_len = htons(skb->len); > + iph_set_totlen(iph, skb->len); > ip_send_check(iph); > > /* if egress device is enslaved to an L3 master device pass the > -- > 2.31.1 >
On Fri, Jan 27, 2023 at 12:41 PM Eric Dumazet <edumazet@google.com> wrote: > > On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote: > > > > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP. > > > > Firstly, allow sk->sk_gso_max_size to be set to a value greater than > > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size() > > for IPv4 TCP sockets. > > > > Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU > > in __ip_local_out() to allow to send BIG TCP packets, and this implies > > that skb->len is the length of a IPv4 packet; On RX path, use skb->len > > as the length of the IPv4 packet when the IP header tot_len is 0 and > > skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and > > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only > > need to update these APIs. > > > > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows > > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In > > GRO complete, set IP header tot_len to 0 when the merged packet size > > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed > > on RX path. > > > > Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes > > this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP > > packets. 
> > > > Signed-off-by: Xin Long <lucien.xin@gmail.com> > > --- > > net/core/gro.c | 12 +++++++----- > > net/core/sock.c | 8 ++++++-- > > net/ipv4/af_inet.c | 7 ++++--- > > net/ipv4/ip_input.c | 2 +- > > net/ipv4/ip_output.c | 2 +- > > 5 files changed, 19 insertions(+), 12 deletions(-) > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > index 506f83d715f8..b15f85546bdd 100644 > > --- a/net/core/gro.c > > +++ b/net/core/gro.c > > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > struct sk_buff *lp; > > int segs; > > > > - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > - gro_max_size = READ_ONCE(p->dev->gro_max_size); > > + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ > > + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? > > + READ_ONCE(p->dev->gro_max_size) : > > + READ_ONCE(p->dev->gro_ipv4_max_size); > > > > if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) > > return -E2BIG; > > > > if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { > > - if (p->protocol != htons(ETH_P_IPV6) || > > - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || > > - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || > > + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || > > + (p->protocol == htons(ETH_P_IPV6) && > > + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || > > p->encapsulation) > > return -E2BIG; > > } > > diff --git a/net/core/sock.c b/net/core/sock.c > > index 7ba4891460ad..c98f9a4eeff9 100644 > > --- a/net/core/sock.c > > +++ b/net/core/sock.c > > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk) > > !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) > > return; > > #endif > > + if (sk->sk_family == AF_INET && sk_is_tcp(sk)) > > + return; > > Or simply > > diff --git a/net/core/sock.c b/net/core/sock.c > index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d > 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -2375,14 +2375,9 @@ 
EXPORT_SYMBOL_GPL(sk_free_unlock_clone); > > static void sk_trim_gso_size(struct sock *sk) > { > - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) > + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE || > + sk_is_tcp(sk)) > return; > -#if IS_ENABLED(CONFIG_IPV6) > - if (sk->sk_family == AF_INET6 && > - sk_is_tcp(sk) && > - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) > - return; > -#endif > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; > } There's a difference, AF_INET6 TCP socket may send ipv4 packets with ipv6_addr_v4mapped, if we don't check ipv6_addr_v4mapped(), IPV4 GSO packets might go with the "gso_max_size" for IPV6. I think we could use the change you wrote above, but we also need to use dst->ops->family instead of sk->sk_family in sk_setup_caps(): + sk->sk_gso_max_size = dst->ops->family == AF_INET6 ? + READ_ONCE(dst->dev->gso_max_size) : + READ_ONCE(dst->dev->gso_ipv4_max_size); > > > > > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; > > } > > > > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) > > sk->sk_route_caps &= ~NETIF_F_GSO_MASK; > > } else { > > sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; > > - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ > > - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); > > + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ > > + sk->sk_gso_max_size = sk->sk_family == AF_INET6 ? 
> > + READ_ONCE(dst->dev->gso_max_size) : > > + READ_ONCE(dst->dev->gso_ipv4_max_size); > > sk_trim_gso_size(sk); > > sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); > > /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > > index 6c0ec2789943..2f992a323b95 100644 > > --- a/net/ipv4/af_inet.c > > +++ b/net/ipv4/af_inet.c > > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) > > if (unlikely(ip_fast_csum((u8 *)iph, 5))) > > goto out; > > > > + NAPI_GRO_CB(skb)->proto = proto; > > id = ntohl(*(__be32 *)&iph->id); > > flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); > > id >>= 16; > > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) > > > > int inet_gro_complete(struct sk_buff *skb, int nhoff) > > { > > - __be16 newlen = htons(skb->len - nhoff); > > struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); > > const struct net_offload *ops; > > + __be16 totlen = iph->tot_len; > > int proto = iph->protocol; > > int err = -ENOSYS; > > > > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) > > skb_set_inner_network_header(skb, nhoff); > > } > > > > - csum_replace2(&iph->check, iph->tot_len, newlen); > > - iph->tot_len = newlen; > > + iph_set_totlen(iph, skb->len - nhoff); > > + csum_replace2(&iph->check, totlen, iph->tot_len); > > > > ops = rcu_dereference(inet_offloads[proto]); > > if (WARN_ON(!ops || !ops->callbacks.gro_complete)) > > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c > > index e880ce77322a..0aa8c49b4e1b 100644 > > --- a/net/ipv4/ip_input.c > > +++ b/net/ipv4/ip_input.c > > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) > > if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) > > goto csum_error; > > > > - len = ntohs(iph->tot_len); > > + len = skb_ip_totlen(skb); > > len = iph_totlen(skb, 
iph); OK, thanks.
On Fri, Jan 27, 2023 at 7:37 PM Xin Long <lucien.xin@gmail.com> wrote: > > On Fri, Jan 27, 2023 at 12:41 PM Eric Dumazet <edumazet@google.com> wrote: > > > > On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@gmail.com> wrote: > > > > > > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP. > > > > > > Firstly, allow sk->sk_gso_max_size to be set to a value greater than > > > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size() > > > for IPv4 TCP sockets. > > > > > > Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU > > > in __ip_local_out() to allow to send BIG TCP packets, and this implies > > > that skb->len is the length of a IPv4 packet; On RX path, use skb->len > > > as the length of the IPv4 packet when the IP header tot_len is 0 and > > > skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and > > > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only > > > need to update these APIs. > > > > > > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows > > > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In > > > GRO complete, set IP header tot_len to 0 when the merged packet size > > > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed > > > on RX path. > > > > > > Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes > > > this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP > > > packets. 
> > > > > > Signed-off-by: Xin Long <lucien.xin@gmail.com> > > > --- > > > net/core/gro.c | 12 +++++++----- > > > net/core/sock.c | 8 ++++++-- > > > net/ipv4/af_inet.c | 7 ++++--- > > > net/ipv4/ip_input.c | 2 +- > > > net/ipv4/ip_output.c | 2 +- > > > 5 files changed, 19 insertions(+), 12 deletions(-) > > > > > > diff --git a/net/core/gro.c b/net/core/gro.c > > > index 506f83d715f8..b15f85546bdd 100644 > > > --- a/net/core/gro.c > > > +++ b/net/core/gro.c > > > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) > > > struct sk_buff *lp; > > > int segs; > > > > > > - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ > > > - gro_max_size = READ_ONCE(p->dev->gro_max_size); > > > + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ > > > + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? > > > + READ_ONCE(p->dev->gro_max_size) : > > > + READ_ONCE(p->dev->gro_ipv4_max_size); > > > > > > if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) > > > return -E2BIG; > > > > > > if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { > > > - if (p->protocol != htons(ETH_P_IPV6) || > > > - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || > > > - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || > > > + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || > > > + (p->protocol == htons(ETH_P_IPV6) && > > > + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || > > > p->encapsulation) > > > return -E2BIG; > > > } > > > diff --git a/net/core/sock.c b/net/core/sock.c > > > index 7ba4891460ad..c98f9a4eeff9 100644 > > > --- a/net/core/sock.c > > > +++ b/net/core/sock.c > > > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk) > > > !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) > > > return; > > > #endif > > > + if (sk->sk_family == AF_INET && sk_is_tcp(sk)) > > > + return; > > > > Or simply > > > > diff --git a/net/core/sock.c b/net/core/sock.c > > index 
7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d > > 100644 > > --- a/net/core/sock.c > > +++ b/net/core/sock.c > > @@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone); > > > > static void sk_trim_gso_size(struct sock *sk) > > { > > - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE) > > + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE || > > + sk_is_tcp(sk)) > > return; > > -#if IS_ENABLED(CONFIG_IPV6) > > - if (sk->sk_family == AF_INET6 && > > - sk_is_tcp(sk) && > > - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) > > - return; > > -#endif > > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; > > } > There's a difference, AF_INET6 TCP socket may send ipv4 packets with > ipv6_addr_v4mapped, if we don't check ipv6_addr_v4mapped(), IPV4 > GSO packets might go with the "gso_max_size" for IPV6. > But the change you wrote in sk_setup_caps() only checked sk_family. > I think we could use the change you wrote above, but we also need to > use dst->ops->family instead of sk->sk_family in sk_setup_caps(): > > + sk->sk_gso_max_size = dst->ops->family == AF_INET6 ? > + READ_ONCE(dst->dev->gso_max_size) : > + > READ_ONCE(dst->dev->gso_ipv4_max_size); > > > > > > > > > > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; > > > } > > > > > > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) > > > sk->sk_route_caps &= ~NETIF_F_GSO_MASK; > > > } else { > > > sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; > > > - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ > > > - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); > > > + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ > > > + sk->sk_gso_max_size = sk->sk_family == AF_INET6 ? > > > + READ_ONCE(dst->dev->gso_max_size) : > > > + READ_ONCE(dst->dev->gso_ipv4_max_size); Here... So if you need ipv6_addr_v4mapped() this should be done here anyway. 
> > > sk_trim_gso_size(sk); > > > sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); > > > /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ > > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c > > > index 6c0ec2789943..2f992a323b95 100644 > > > --- a/net/ipv4/af_inet.c > > > +++ b/net/ipv4/af_inet.c > > > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) > > > if (unlikely(ip_fast_csum((u8 *)iph, 5))) > > > goto out; > > > > > > + NAPI_GRO_CB(skb)->proto = proto; > > > id = ntohl(*(__be32 *)&iph->id); > > > flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); > > > id >>= 16; > > > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) > > > > > > int inet_gro_complete(struct sk_buff *skb, int nhoff) > > > { > > > - __be16 newlen = htons(skb->len - nhoff); > > > struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); > > > const struct net_offload *ops; > > > + __be16 totlen = iph->tot_len; > > > int proto = iph->protocol; > > > int err = -ENOSYS; > > > > > > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) > > > skb_set_inner_network_header(skb, nhoff); > > > } > > > > > > - csum_replace2(&iph->check, iph->tot_len, newlen); > > > - iph->tot_len = newlen; > > > + iph_set_totlen(iph, skb->len - nhoff); > > > + csum_replace2(&iph->check, totlen, iph->tot_len); > > > > > > ops = rcu_dereference(inet_offloads[proto]); > > > if (WARN_ON(!ops || !ops->callbacks.gro_complete)) > > > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c > > > index e880ce77322a..0aa8c49b4e1b 100644 > > > --- a/net/ipv4/ip_input.c > > > +++ b/net/ipv4/ip_input.c > > > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) > > > if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) > > > goto csum_error; > > > > > > - len = ntohs(iph->tot_len); > > > + len = skb_ip_totlen(skb); > > > > len = 
iph_totlen(skb, iph); > OK, thanks.
diff --git a/net/core/gro.c b/net/core/gro.c index 506f83d715f8..b15f85546bdd 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) struct sk_buff *lp; int segs; - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ - gro_max_size = READ_ONCE(p->dev->gro_max_size); + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */ + gro_max_size = p->protocol == htons(ETH_P_IPV6) ? + READ_ONCE(p->dev->gro_max_size) : + READ_ONCE(p->dev->gro_ipv4_max_size); if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush)) return -E2BIG; if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { - if (p->protocol != htons(ETH_P_IPV6) || - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) || - ipv6_hdr(p)->nexthdr != IPPROTO_TCP || + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || + (p->protocol == htons(ETH_P_IPV6) && + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } diff --git a/net/core/sock.c b/net/core/sock.c index 7ba4891460ad..c98f9a4eeff9 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return; #endif + if (sk->sk_family == AF_INET && sk_is_tcp(sk)) + return; sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE; } @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */ - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size); + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ + sk->sk_gso_max_size = sk->sk_family == AF_INET6 ? 
+ READ_ONCE(dst->dev->gso_max_size) : + READ_ONCE(dst->dev->gso_ipv4_max_size); sk_trim_gso_size(sk); sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6c0ec2789943..2f992a323b95 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; + NAPI_GRO_CB(skb)->proto = proto; id = ntohl(*(__be32 *)&iph->id); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) int inet_gro_complete(struct sk_buff *skb, int nhoff) { - __be16 newlen = htons(skb->len - nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; + __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff) skb_set_inner_network_header(skb, nhoff); } - csum_replace2(&iph->check, iph->tot_len, newlen); - iph->tot_len = newlen; + iph_set_totlen(iph, skb->len - nhoff); + csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e880ce77322a..0aa8c49b4e1b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = skb_ip_totlen(skb); if (skb->len < len) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 922c87ef1ab5..4e4e308c3230 100644 --- 
a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); - iph->tot_len = htons(skb->len); + iph_set_totlen(iph, skb->len); ip_send_check(iph); /* if egress device is enslaved to an L3 master device pass the
Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP. Firstly, allow sk->sk_gso_max_size to be set to a value greater than GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size() for IPv4 TCP sockets. Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU in __ip_local_out() to allow sending BIG TCP packets, and this implies that skb->len is the length of an IPv4 packet; On RX path, use skb->len as the length of the IPv4 packet when the IP header tot_len is 0 and skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only need to update these APIs. Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allow the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In GRO complete, set IP header tot_len to 0 when the merged packet size is greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed on RX path. Note that by checking skb_is_gso_tcp() in API iph_totlen(), it is safe for this implementation to use iph->tot_len == 0 to indicate IPv4 BIG TCP packets. Signed-off-by: Xin Long <lucien.xin@gmail.com> --- net/core/gro.c | 12 +++++++----- net/core/sock.c | 8 ++++++-- net/ipv4/af_inet.c | 7 ++++--- net/ipv4/ip_input.c | 2 +- net/ipv4/ip_output.c | 2 +- 5 files changed, 19 insertions(+), 12 deletions(-)