diff mbox series

[RFCv2,3/3] tcp: Wait for sufficient data in tcp_mtu_probe

Message ID e6dac4f513fa2ca96ccb4dcc5b11f96b3f9ddc40.1622025457.git.cdleonard@gmail.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series tcp: Improve mtu probe preconditions | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Guessed tree name to be net-next
netdev/subject_prefix warning Target tree name not specified in the subject
netdev/cc_maintainers warning 6 maintainers not CCed: idosch@OSS.NVIDIA.COM andreas.a.roeseler@gmail.com linux-doc@vger.kernel.org corbet@lwn.net fw@strlen.de amcohen@nvidia.com
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 5513 this patch: 5513
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 5578 this patch: 5578
netdev/header_inline success Link

Commit Message

Leonard Crestez May 26, 2021, 10:38 a.m. UTC
According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
in order to accumulate enough data" but linux almost never does that.

Implement this by returning 0 from tcp_mtu_probe if not enough data is
queued locally but some packets are still in flight. This makes mtu
probing more likely to happen for applications that do small writes.

Only doing this if packets are in flight should ensure that writing will
be attempted again later. This is similar to how tcp_mtu_probe already
returns zero if the probe doesn't fit inside the receiver window or the
congestion window.

Control this with a sysctl because this implies a latency tradeoff but
only up to one RTT.

Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
---
 Documentation/networking/ip-sysctl.rst |  5 +++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_output.c                  | 18 ++++++++++++++----
 5 files changed, 28 insertions(+), 4 deletions(-)

Comments

Eric Dumazet May 26, 2021, 2:35 p.m. UTC | #1
On Wed, May 26, 2021 at 12:38 PM Leonard Crestez <cdleonard@gmail.com> wrote:
>
> According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
> in order to accumulate enough data" but linux almost never does that.
>
> Implement this by returning 0 from tcp_mtu_probe if not enough data is
> queued locally but some packets are still in flight. This makes mtu
> probing more likely to happen for applications that do small writes.
>
> Only doing this if packets are in flight should ensure that writing will
> be attempted again later. This is similar to how tcp_mtu_probe already
> returns zero if the probe doesn't fit inside the receiver window or the
> congestion window.
>
> Control this with a sysctl because this implies a latency tradeoff but
> only up to one RTT.
>
> Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
> ---
>  Documentation/networking/ip-sysctl.rst |  5 +++++
>  include/net/netns/ipv4.h               |  1 +
>  net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
>  net/ipv4/tcp_ipv4.c                    |  1 +
>  net/ipv4/tcp_output.c                  | 18 ++++++++++++++----
>  5 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 7ab52a105a5d..967b7fac35b1 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
>         If MTU probing is enabled this caps the minimum MSS used for search_low
>         for the connection.
>
>         Default : 48
>
> +tcp_mtu_probe_waitdata - BOOLEAN
> +       Wait for enough data for an mtu probe to accumulate on the sender.
> +
> +       Default: 1
> +
>  tcp_mtu_probe_rack - BOOLEAN
>         Try to use shorter probes if RACK is also enabled
>
>         Default: 1
>
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index b4ff12f25a7f..366e7b325778 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -112,10 +112,11 @@ struct netns_ipv4 {
>  #ifdef CONFIG_NET_L3_MASTER_DEV
>         u8 sysctl_tcp_l3mdev_accept;
>  #endif
>         u8 sysctl_tcp_mtu_probing;
>         int sysctl_tcp_mtu_probe_floor;
> +       int sysctl_tcp_mtu_probe_waitdata;

If this is a boolean, you should use u8, and place this field to avoid
adding a hole.

>         int sysctl_tcp_mtu_probe_rack;
>         int sysctl_tcp_base_mss;
>         int sysctl_tcp_min_snd_mss;
>         int sysctl_tcp_probe_threshold;
>         u32 sysctl_tcp_probe_interval;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 275c91fb9cf8..53868b812958 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = proc_dointvec_minmax,
>                 .extra1         = &tcp_min_snd_mss_min,
>                 .extra2         = &tcp_min_snd_mss_max,
>         },
> +       {
> +               .procname       = "tcp_mtu_probe_waitdata",
> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_waitdata,
> +               .maxlen         = sizeof(int),
> +               .mode           = 0644,
> +               .proc_handler   = proc_dointvec,

If this is a boolean, please use proc_dou8vec_minmax, and SYSCTL_ZERO/SYSCTL_ONE

> +       },
>         {
>                 .procname       = "tcp_mtu_probe_rack",
>                 .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
>                 .maxlen         = sizeof(int),
>                 .mode           = 0644,
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ed8af4a7325b..940df2ae4636 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
>         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
>         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> +       net->ipv4.sysctl_tcp_mtu_probe_waitdata = 1;
>         net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
>
>         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 362f97cfb09e..268e1bac001f 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -2394,14 +2394,10 @@ static int tcp_mtu_probe(struct sock *sk)
>                  */
>                 tcp_mtu_check_reprobe(sk);
>                 return -1;
>         }
>
> -       /* Have enough data in the send queue to probe? */
> -       if (tp->write_seq - tp->snd_nxt < size_needed)
> -               return -1;
> -
>         /* Can probe fit inside congestion window? */
>         if (packets_needed > tp->snd_cwnd)
>                 return -1;
>
>         /* Can probe fit inside receiver window? If not then skip probing.
> @@ -2411,10 +2407,24 @@ static int tcp_mtu_probe(struct sock *sk)
>          * clear below.
>          */
>         if (tp->snd_wnd < size_needed)
>                 return -1;
>
> +       /* Have enough data in the send queue to probe? */
> +       if (tp->write_seq - tp->snd_nxt < size_needed) {
> +               /* If packets are already in flight it's safe to wait for more data to
> +                * accumulate on the sender because writing will be triggered as ACKs
> +                * arrive.
> +                * If no packets are in flight returning zero can stall.
> +                */
> +               if (net->ipv4.sysctl_tcp_mtu_probe_waitdata &&

I have serious doubts about RPC traffic.
Adding one RTT latency is going to make this unlikely to be used.

> +                   tcp_packets_in_flight(tp))
> +                       return 0;
> +               else
> +                       return -1;
> +       }
> +
>         /* Do we need for more acks to clear the receive window? */
>         if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
>                 return 0;
>
>         /* Do we need the congestion window to clear? */
> --
> 2.25.1
>
Leonard Crestez June 3, 2021, 1:35 p.m. UTC | #2
On 5/26/21 5:35 PM, Eric Dumazet wrote:
> On Wed, May 26, 2021 at 12:38 PM Leonard Crestez <cdleonard@gmail.com> wrote:
>>
>> According to RFC4821 Section 7.4 "Protocols MAY delay sending non-probes
>> in order to accumulate enough data" but linux almost never does that.
>>
>> Implement this by returning 0 from tcp_mtu_probe if not enough data is
>> queued locally but some packets are still in flight. This makes mtu
>> probing more likely to happen for applications that do small writes.
>>
>> Only doing this if packets are in flight should ensure that writing will
>> be attempted again later. This is similar to how tcp_mtu_probe already
>> returns zero if the probe doesn't fit inside the receiver window or the
>> congestion window.
>>
>> Control this with a sysctl because this implies a latency tradeoff but
>> only up to one RTT.
>>
>> Signed-off-by: Leonard Crestez <cdleonard@gmail.com>
>> ---
>>   Documentation/networking/ip-sysctl.rst |  5 +++++
>>   include/net/netns/ipv4.h               |  1 +
>>   net/ipv4/sysctl_net_ipv4.c             |  7 +++++++
>>   net/ipv4/tcp_ipv4.c                    |  1 +
>>   net/ipv4/tcp_output.c                  | 18 ++++++++++++++----
>>   5 files changed, 28 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
>> index 7ab52a105a5d..967b7fac35b1 100644
>> --- a/Documentation/networking/ip-sysctl.rst
>> +++ b/Documentation/networking/ip-sysctl.rst
>> @@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
>>          If MTU probing is enabled this caps the minimum MSS used for search_low
>>          for the connection.
>>
>>          Default : 48
>>
>> +tcp_mtu_probe_waitdata - BOOLEAN
>> +       Wait for enough data for an mtu probe to accumulate on the sender.
>> +
>> +       Default: 1
>> +
>>   tcp_mtu_probe_rack - BOOLEAN
>>          Try to use shorter probes if RACK is also enabled
>>
>>          Default: 1
>>
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index b4ff12f25a7f..366e7b325778 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -112,10 +112,11 @@ struct netns_ipv4 {
>>   #ifdef CONFIG_NET_L3_MASTER_DEV
>>          u8 sysctl_tcp_l3mdev_accept;
>>   #endif
>>          u8 sysctl_tcp_mtu_probing;
>>          int sysctl_tcp_mtu_probe_floor;
>> +       int sysctl_tcp_mtu_probe_waitdata;
> 
> If this is a boolean, you should use u8, and place this field to avoid
> adding a hole.
> 
>>          int sysctl_tcp_mtu_probe_rack;
>>          int sysctl_tcp_base_mss;
>>          int sysctl_tcp_min_snd_mss;
>>          int sysctl_tcp_probe_threshold;
>>          u32 sysctl_tcp_probe_interval;
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 275c91fb9cf8..53868b812958 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
>>                  .mode           = 0644,
>>                  .proc_handler   = proc_dointvec_minmax,
>>                  .extra1         = &tcp_min_snd_mss_min,
>>                  .extra2         = &tcp_min_snd_mss_max,
>>          },
>> +       {
>> +               .procname       = "tcp_mtu_probe_waitdata",
>> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_waitdata,
>> +               .maxlen         = sizeof(int),
>> +               .mode           = 0644,
>> +               .proc_handler   = proc_dointvec,
> 
> If this is a boolean, please use proc_dou8vec_minmax, and SYSCTL_ZERO/SYSCTL_ONE
> 
>> +       },
>>          {
>>                  .procname       = "tcp_mtu_probe_rack",
>>                  .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
>>                  .maxlen         = sizeof(int),
>>                  .mode           = 0644,
>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>> index ed8af4a7325b..940df2ae4636 100644
>> --- a/net/ipv4/tcp_ipv4.c
>> +++ b/net/ipv4/tcp_ipv4.c
>> @@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
>>          net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
>>          net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
>>          net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
>>          net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
>>          net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
>> +       net->ipv4.sysctl_tcp_mtu_probe_waitdata = 1;
>>          net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
>>
>>          net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
>>          net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
>>          net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
>> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
>> index 362f97cfb09e..268e1bac001f 100644
>> --- a/net/ipv4/tcp_output.c
>> +++ b/net/ipv4/tcp_output.c
>> @@ -2394,14 +2394,10 @@ static int tcp_mtu_probe(struct sock *sk)
>>                   */
>>                  tcp_mtu_check_reprobe(sk);
>>                  return -1;
>>          }
>>
>> -       /* Have enough data in the send queue to probe? */
>> -       if (tp->write_seq - tp->snd_nxt < size_needed)
>> -               return -1;
>> -
>>          /* Can probe fit inside congestion window? */
>>          if (packets_needed > tp->snd_cwnd)
>>                  return -1;
>>
>>          /* Can probe fit inside receiver window? If not then skip probing.
>> @@ -2411,10 +2407,24 @@ static int tcp_mtu_probe(struct sock *sk)
>>           * clear below.
>>           */
>>          if (tp->snd_wnd < size_needed)
>>                  return -1;
>>
>> +       /* Have enough data in the send queue to probe? */
>> +       if (tp->write_seq - tp->snd_nxt < size_needed) {
>> +               /* If packets are already in flight it's safe to wait for more data to
>> +                * accumulate on the sender because writing will be triggered as ACKs
>> +                * arrive.
>> +                * If no packets are in flight returning zero can stall.
>> +                */
>> +               if (net->ipv4.sysctl_tcp_mtu_probe_waitdata &&
> 
> I have serious doubts about RPC traffic.
> Adding one RTT latency is going to make this unlikely to be used.
> 
>> +                   tcp_packets_in_flight(tp))
>> +                       return 0;
>> +               else
>> +                       return -1;
>> +       }
>> +

This could be measured with tcp_rr mode in netperf, right? I've been 
using iperf which is more focused on bandwidth. My expectation would be 
that the "maximum" latency would increase but not the average since MTU 
probing is a rare occurrence.

Another way to implement waiting would be to check sk_wmem_alloc > 
skb->size like autocork does and this would promise zero latency. But 
I'm not sure how to do that correctly. If it's up to tcp_mtu_probe to 
ensure that traffic does not stall then could it set the TSQ_THROTTLED flag?

--
Regards,
Leonard
diff mbox series

Patch

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 7ab52a105a5d..967b7fac35b1 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -349,10 +349,15 @@  tcp_mtu_probe_floor - INTEGER
 	If MTU probing is enabled this caps the minimum MSS used for search_low
 	for the connection.
 
 	Default : 48
 
+tcp_mtu_probe_waitdata - BOOLEAN
+	Wait for enough data for an mtu probe to accumulate on the sender.
+
+	Default: 1
+
 tcp_mtu_probe_rack - BOOLEAN
 	Try to use shorter probes if RACK is also enabled
 
 	Default: 1
 
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index b4ff12f25a7f..366e7b325778 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -112,10 +112,11 @@  struct netns_ipv4 {
 #ifdef CONFIG_NET_L3_MASTER_DEV
 	u8 sysctl_tcp_l3mdev_accept;
 #endif
 	u8 sysctl_tcp_mtu_probing;
 	int sysctl_tcp_mtu_probe_floor;
+	int sysctl_tcp_mtu_probe_waitdata;
 	int sysctl_tcp_mtu_probe_rack;
 	int sysctl_tcp_base_mss;
 	int sysctl_tcp_min_snd_mss;
 	int sysctl_tcp_probe_threshold;
 	u32 sysctl_tcp_probe_interval;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 275c91fb9cf8..53868b812958 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -847,10 +847,17 @@  static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &tcp_min_snd_mss_min,
 		.extra2		= &tcp_min_snd_mss_max,
 	},
+	{
+		.procname	= "tcp_mtu_probe_waitdata",
+		.data		= &init_net.ipv4.sysctl_tcp_mtu_probe_waitdata,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{
 		.procname	= "tcp_mtu_probe_rack",
 		.data		= &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ed8af4a7325b..940df2ae4636 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2892,10 +2892,11 @@  static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
 	net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
 	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
 	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
 	net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
+	net->ipv4.sysctl_tcp_mtu_probe_waitdata = 1;
 	net->ipv4.sysctl_tcp_mtu_probe_rack = 1;
 
 	net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
 	net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
 	net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 362f97cfb09e..268e1bac001f 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2394,14 +2394,10 @@  static int tcp_mtu_probe(struct sock *sk)
 		 */
 		tcp_mtu_check_reprobe(sk);
 		return -1;
 	}
 
-	/* Have enough data in the send queue to probe? */
-	if (tp->write_seq - tp->snd_nxt < size_needed)
-		return -1;
-
 	/* Can probe fit inside congestion window? */
 	if (packets_needed > tp->snd_cwnd)
 		return -1;
 
 	/* Can probe fit inside receiver window? If not then skip probing.
@@ -2411,10 +2407,24 @@  static int tcp_mtu_probe(struct sock *sk)
 	 * clear below.
 	 */
 	if (tp->snd_wnd < size_needed)
 		return -1;
 
+	/* Have enough data in the send queue to probe? */
+	if (tp->write_seq - tp->snd_nxt < size_needed) {
+		/* If packets are already in flight it's safe to wait for more data to
+		 * accumulate on the sender because writing will be triggered as ACKs
+		 * arrive.
+		 * If no packets are in flight returning zero can stall.
+		 */
+		if (net->ipv4.sysctl_tcp_mtu_probe_waitdata &&
+		    tcp_packets_in_flight(tp))
+			return 0;
+		else
+			return -1;
+	}
+
 	/* Do we need for more acks to clear the receive window? */
 	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
 		return 0;
 
 	/* Do we need the congestion window to clear? */