diff mbox series

[net,1/2] tcp: Allow full IP tos/IPv6 tclass to be reflected in L3 header

Message ID 160582103106.66684.9841738004971200231.stgit@localhost.localdomain (mailing list archive)
State Accepted
Delegated to: Netdev Maintainers
Headers show
Series tcp: Address issues with ECT0 not being set in DCTCP packets | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 15 this patch: 15
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 32 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 15 this patch: 15
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Alexander Duyck Nov. 19, 2020, 9:23 p.m. UTC
From: Alexander Duyck <alexanderduyck@fb.com>

An issue was recently found where DCTCP SYN/ACK packets did not have the
ECT bit set in the L3 header. A bit of code review found that the recent
change referenced below had gone though and added a mask that prevented the
ECN bits from being populated in the L3 header.

This patch addresses that by rolling back the mask so that it is only
applied to the flags coming from the incoming TCP request instead of
applying it to the socket tos/tclass field. Doing this the ECT bits were
restored in the SYN/ACK packets in my testing.

One thing that is not addressed by this patch set is the fact that
tcp_reflect_tos appears to be incompatible with ECN based congestion
avoidance algorithms. At a minimum the feature should likely be documented
which it currently isn't.

Fixes: ac8f1710c12b ("tcp: reflect tos value received in SYN to the socket")
Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
---
 net/ipv4/tcp_ipv4.c |    5 +++--
 net/ipv6/tcp_ipv6.c |    6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

Comments

Wei Wang Nov. 19, 2020, 11:09 p.m. UTC | #1
On Thu, Nov 19, 2020 at 1:23 PM Alexander Duyck
<alexander.duyck@gmail.com> wrote:
>
> From: Alexander Duyck <alexanderduyck@fb.com>
>
> An issue was recently found where DCTCP SYN/ACK packets did not have the
> ECT bit set in the L3 header. A bit of code review found that the recent
> change referenced below had gone though and added a mask that prevented the
> ECN bits from being populated in the L3 header.
>
> This patch addresses that by rolling back the mask so that it is only
> applied to the flags coming from the incoming TCP request instead of
> applying it to the socket tos/tclass field. Doing this the ECT bits were
> restored in the SYN/ACK packets in my testing.
>
> One thing that is not addressed by this patch set is the fact that
> tcp_reflect_tos appears to be incompatible with ECN based congestion
> avoidance algorithms. At a minimum the feature should likely be documented
> which it currently isn't.
>
> Fixes: ac8f1710c12b ("tcp: reflect tos value received in SYN to the socket")

Acked-by: Wei Wang <weiwan@google.com>

Thanks for catching this. I was under the wrong impression that the
ECT bits were marked in tos after the tcp layer. Upon a closer look,
it seems right now, it only gets marked in inet_sock(sk)->tos from
tcp_init_congestion_control() once.
I will submit a follow-up fix to make sure we include the lower 2 bits
in the reflection case as well.

> Signed-off-by: Alexander Duyck <alexanderduyck@fb.com>
> ---
>  net/ipv4/tcp_ipv4.c |    5 +++--
>  net/ipv6/tcp_ipv6.c |    6 +++---
>  2 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index c2d5132c523c..c5f8b686aa82 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -981,7 +981,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
>         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
>
>         tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
> -                       tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
> +                       tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
> +                       inet_sk(sk)->tos;
>
>         if (skb) {
>                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
> @@ -990,7 +991,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
>                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
>                                             ireq->ir_rmt_addr,
>                                             rcu_dereference(ireq->ireq_opt),
> -                                           tos & ~INET_ECN_MASK);
> +                                           tos);
>                 rcu_read_unlock();
>                 err = net_xmit_eval(err);
>         }
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 8db59f4e5f13..3d49e8d0afee 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -530,12 +530,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
>                 rcu_read_lock();
>                 opt = ireq->ipv6_opt;
>                 tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
> -                               tcp_rsk(req)->syn_tos : np->tclass;
> +                               tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
> +                               np->tclass;
>                 if (!opt)
>                         opt = rcu_dereference(np->opt);
>                 err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
> -                              tclass & ~INET_ECN_MASK,
> -                              sk->sk_priority);
> +                              tclass, sk->sk_priority);
>                 rcu_read_unlock();
>                 err = net_xmit_eval(err);
>         }
>
>
diff mbox series

Patch

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c2d5132c523c..c5f8b686aa82 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -981,7 +981,8 @@  static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 	skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 
 	tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
-			tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
+			tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
+			inet_sk(sk)->tos;
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -990,7 +991,7 @@  static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 		err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
 					    ireq->ir_rmt_addr,
 					    rcu_dereference(ireq->ireq_opt),
-					    tos & ~INET_ECN_MASK);
+					    tos);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 8db59f4e5f13..3d49e8d0afee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -530,12 +530,12 @@  static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 		rcu_read_lock();
 		opt = ireq->ipv6_opt;
 		tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
-				tcp_rsk(req)->syn_tos : np->tclass;
+				tcp_rsk(req)->syn_tos & ~INET_ECN_MASK :
+				np->tclass;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
 		err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
-			       tclass & ~INET_ECN_MASK,
-			       sk->sk_priority);
+			       tclass, sk->sk_priority);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}