
[RFC,v2,net-next,1/5] net: Introduce Qdisc backpressure infrastructure

Message ID 7e5bd29f232d42d6aa94ff818a778de707203406.1661158173.git.peilin.ye@bytedance.com
State RFC
Delegated to: Netdev Maintainers
Series: net: Qdisc backpressure infrastructure


Commit Message

Peilin Ye Aug. 22, 2022, 9:11 a.m. UTC
From: Peilin Ye <peilin.ye@bytedance.com>

Currently, sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate-limited by shaper Qdiscs like HTB.  Improve this by
introducing a Qdisc backpressure infrastructure:

  a. A new 'struct sock' field, @sk_overlimits, which keeps track of the
     number of bytes in socket send buffer that are currently
     unavailable due to TC egress congestion.  The size of an overlimit
     socket's "effective" send buffer is represented by @sk_sndbuf minus
     @sk_overlimits, with a lower limit of SOCK_MIN_SNDBUF:

     max(@sk_sndbuf - @sk_overlimits, SOCK_MIN_SNDBUF)

  b. A new (*backpressure) 'struct proto' callback, which is the
     protocol's private algorithm for Qdisc backpressure.

Working together:

  1. When a shaper Qdisc (TBF, HTB, CBQ, etc.) drops a packet that
     belongs to a local socket, it calls qdisc_backpressure().

  2. qdisc_backpressure() eventually invokes the socket protocol's
     (*backpressure) callback, which should increase @sk_overlimits.

  3. The transport layer then sees a smaller "effective" send buffer and
     sends more slowly.

  4. It is the per-protocol (*backpressure) implementation's
     responsibility to decrease @sk_overlimits when TC egress becomes
     idle again, potentially by using a timer (see the sketch after
     this list).
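
For illustration, a protocol's (*backpressure) implementation might
look roughly like the sketch below.  Everything in it is hypothetical
(the function name, the quarter-of-sndbuf step, and the timer-based
decay are assumptions, not part of this patch); it only shows the
shape of steps 2 and 4:

  /* Hypothetical (*backpressure) callback: on each Qdisc drop, make
   * another quarter of the send buffer unavailable (step 2), capped
   * at @sk_sndbuf itself.
   */
  static void example_backpressure(struct sock *sk)
  {
  	int overlimits = READ_ONCE(sk->sk_overlimits);
  	int sndbuf = READ_ONCE(sk->sk_sndbuf);

  	WRITE_ONCE(sk->sk_overlimits,
  		   min(overlimits + sndbuf / 4, sndbuf));

  	/* Step 4 would go here: (re)arm a protocol-private timer
  	 * whose handler writes 0 back to @sk_overlimits once TC
  	 * egress has been idle for long enough.
  	 */
  }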

Suggested-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
---
 include/net/sch_generic.h | 11 +++++++++++
 include/net/sock.h        | 21 +++++++++++++++++++++
 net/core/sock.c           |  1 +
 3 files changed, 33 insertions(+)

Patch

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ec693fe7c553..afdf4bf64936 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ 
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
+#include <net/sock.h>
 
 struct Qdisc_ops;
 struct qdisc_walker;
@@ -1188,6 +1189,16 @@  static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_DROP;
 }
 
+static inline void qdisc_backpressure(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk || !sk_fullsock(sk))
+		return;
+
+	sk_backpressure(sk);
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 05a1bbdf5805..ef10ca66cf26 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -277,6 +277,7 @@  struct sk_filter;
   *	@sk_pacing_status: Pacing status (requested, handled by sch_fq)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
+  *	@sk_overlimits: size of temporarily unavailable send buffer in bytes
   *	@__sk_flags_offset: empty field used to determine location of bitfield
   *	@sk_padding: unused element for alignment
   *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
@@ -439,6 +440,7 @@  struct sock {
 	struct dst_entry __rcu	*sk_dst_cache;
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
+	int			sk_overlimits;
 
 	/* ===== cache line for TX ===== */
 	int			sk_wmem_queued;
@@ -1264,6 +1266,7 @@  struct proto {
 
 	bool			(*stream_memory_free)(const struct sock *sk, int wake);
 	bool			(*sock_is_readable)(struct sock *sk);
+	void			(*backpressure)(struct sock *sk);
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
@@ -2499,6 +2502,24 @@  static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 	WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
 }
 
+static inline int sk_sndbuf_avail(struct sock *sk)
+{
+	int overlimits, sndbuf = READ_ONCE(sk->sk_sndbuf);
+
+	if (!sk->sk_prot->backpressure)
+		return sndbuf;
+
+	overlimits = READ_ONCE(sk->sk_overlimits);
+
+	return max_t(int, sndbuf - overlimits, SOCK_MIN_SNDBUF);
+}
+
+static inline void sk_backpressure(struct sock *sk)
+{
+	if (sk->sk_prot->backpressure)
+		sk->sk_prot->backpressure(sk);
+}
+
 /**
  * sk_page_frag - return an appropriate page_frag
  * @sk: socket
diff --git a/net/core/sock.c b/net/core/sock.c
index 4cb957d934a2..167d471b176f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2194,6 +2194,7 @@  struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
 	refcount_set(&newsk->sk_wmem_alloc, 1);
+	newsk->sk_overlimits	= 0;
 
 	atomic_set(&newsk->sk_omem_alloc, 0);
 	sk_init_common(newsk);
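
For context, the qdisc_backpressure() call of step 1 would sit in a
shaper's drop path.  The concrete call sites are presumably added by
the later patches in this series; the sketch below, with a placeholder
limit check, only shows the intended shape:

  /* Hypothetical shaper enqueue path: notify the socket before
   * dropping, so its (*backpressure) callback can shrink the
   * effective send buffer.
   */
  static int example_shaper_enqueue(struct sk_buff *skb, struct Qdisc *sch,
  				    struct sk_buff **to_free)
  {
  	if (sch->q.qlen >= sch->limit) {	/* placeholder limit check */
  		qdisc_backpressure(skb);	/* step 1 */
  		return qdisc_drop(skb, sch, to_free);
  	}

  	/* ... normal enqueue path ... */
  	return NET_XMIT_SUCCESS;
  }

On the transport side (step 3), write-space checks would then compare
against sk_sndbuf_avail() rather than @sk_sndbuf directly, e.g. a
hypothetical variant of sock_writeable():

  static bool example_writeable(struct sock *sk)
  {
  	return refcount_read(&sk->sk_wmem_alloc) <
  	       (sk_sndbuf_avail(sk) >> 1);
  }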