
[RFC,v1,net-next,1/4] net: Introduce Qdisc backpressure infrastructure

Message ID f4090d129b685df72070f708294550fbc513f888.1651800598.git.peilin.ye@bytedance.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series net: Qdisc backpressure infrastructure

Checks

Context Check Description
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3148 this patch: 3148
netdev/cc_maintainers warning 2 maintainers not CCed: petrm@nvidia.com edumazet@google.com
netdev/build_clang success Errors and warnings before: 623 this patch: 623
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 3274 this patch: 3274
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns WARNING: memory barrier without comment
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Peilin Ye May 6, 2022, 7:44 p.m. UTC
From: Peilin Ye <peilin.ye@bytedance.com>

Currently, sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate limited by shaper Qdiscs like HTB.  Improve this by
implementing the following state machine for sockets (currently only UDP
and TCP sockets are supported):

  ┌────────────────┐  [a]  ┌────────────────┐  [b]  ┌────────────────┐
  │ SK_UNTHROTTLED │─ ─ ─ >│  SK_OVERLIMIT  │─ ─ ─ >│  SK_THROTTLED  │
  └────────────────┘       └────────────────┘       └────────────────┘
           └─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ < ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─┘
                                   [c]

Take TBF as an example (hook placement is sketched in code right after
steps [a]-[c]):

  [a] When TBF's inner Qdisc (e.g. bfifo) becomes full, TBF fails to
      enqueue an skb belonging to UNTHROTTLED socket A.  Socket A is
      marked as OVERLIMIT and added to TBF's "backpressure_list";

  [b] When TBF runs out of tokens, it marks all OVERLIMIT sockets
      (including A) on its backpressure_list as THROTTLED, and schedules
      a Qdisc watchdog timer to wait for more tokens;

  [c] After the timer expires, all THROTTLED sockets (including A) are
      removed from TBF's backpressure_list, and marked as UNTHROTTLED.
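
A minimal sketch of the hook placement described above (my reading of
[a]-[c]; the actual TBF wiring presumably lands in a later patch of
this series, and tbf_tokens_available() / tbf_next_token_ns() below are
hypothetical helpers standing in for TBF's token arithmetic):

  static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                         struct sk_buff **to_free)
  {
          struct tbf_sched_data *q = qdisc_priv(sch);
          int ret = qdisc_enqueue(skb, q->qdisc, to_free);

          if (ret != NET_XMIT_SUCCESS) {
                  /* [a]: inner Qdisc full, mark the socket OVERLIMIT */
                  qdisc_backpressure_overlimit(sch, skb);
                  if (net_xmit_drop_count(ret))
                          qdisc_qstats_drop(sch);
                  return ret;
          }
          sch->q.qlen++;
          return NET_XMIT_SUCCESS;
  }

  static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
  {
          struct tbf_sched_data *q = qdisc_priv(sch);
          struct sk_buff *skb = q->qdisc->ops->peek(q->qdisc);

          if (!skb)
                  return NULL;

          if (!tbf_tokens_available(q, skb)) {
                  /* [b]: out of tokens, throttle every OVERLIMIT socket */
                  qdisc_backpressure_throttle(sch);
                  qdisc_watchdog_schedule_ns(&q->watchdog,
                                             tbf_next_token_ns(q));
                  return NULL;
          }
          return qdisc_dequeue_peeked(q->qdisc);
  }

Step [c] needs no Qdisc-specific code: the watchdog reschedules the
Qdisc, and net_tx_action() calls qdisc_backpressure_unthrottle() under
root_lock (see the net/core/dev.c hunk below).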

UDP and TCP sleep on THROTTLED sockets in sock_wait_for_wmem() and
sk_stream_wait_memory() respectively.  epoll() and friends should not
report EPOLL{OUT,WRNORM} for THROTTLED sockets.  When unthrottling in [c],
call ->sk_write_space() to wake up UDP and/or TCP waiters, and notify
SOCKWQ_ASYNC_NOSPACE subscribers.
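
For illustration only (a userspace sketch, not part of this series;
the function name is mine), this is how a sender observes the new
behavior:

  /* While a socket is SK_THROTTLED, sock_writeable() returns false, so
   * epoll_wait() stops reporting EPOLLOUT; the ->sk_write_space() call
   * in [c] is what eventually wakes this up.
   */
  #include <sys/epoll.h>
  #include <unistd.h>

  static int wait_until_writable(int udp_fd)
  {
          struct epoll_event ev = { .events = EPOLLOUT, .data.fd = udp_fd };
          int efd, ret;

          efd = epoll_create1(0);
          if (efd < 0)
                  return -1;
          if (epoll_ctl(efd, EPOLL_CTL_ADD, udp_fd, &ev) < 0)
                  ret = -1;
          else
                  ret = epoll_wait(efd, &ev, 1, -1); /* blocks while throttled */
          close(efd);
          return ret;
  }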

For each device, backpressure_list operations are always serialized by
Qdisc root_lock.

When marking a socket as OVERLIMIT in [a], use a cmpxchg() to make sure
that multiple CPUs do not try to add this socket to different
backpressure_lists (on different devices) concurrently.

After removing a THROTTLED socket from backpressure_list in [c], use a
smp_store_release() to make sure changes have been committed to memory
before marking the socket as UNTHROTTLED.
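
For reference, both sides annotated (code excerpted from the patch
below; comments are mine):

  /* [a] enqueue path, potentially many CPUs and devices at once: the
   * cmpxchg() guarantees only one CPU wins and links the socket.
   */
  if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED,
              SK_OVERLIMIT) == SK_UNTHROTTLED) {
          sock_hold(sk);
          list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
  }

  /* [c] unthrottle path, under Qdisc root_lock: the release guarantees
   * list_del_init() is visible before SK_UNTHROTTLED is, so the next
   * [a]-style cmpxchg() winner cannot observe the socket as UNTHROTTLED
   * while it is still linked on this list.  (Send-path readers use a
   * plain READ_ONCE() in sk_is_throttled().)
   */
  list_del_init(pos);
  smp_store_release(&sk->sk_backpressure_status, SK_UNTHROTTLED);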

Suggested-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Peilin Ye <peilin.ye@bytedance.com>
---
 include/net/sch_generic.h | 43 +++++++++++++++++++++++++++++++++++++++
 include/net/sock.h        | 18 +++++++++++++++-
 net/core/dev.c            |  1 +
 net/core/sock.c           |  6 ++++--
 net/ipv4/tcp_ipv4.c       | 11 +++++++---
 net/sched/sch_generic.c   |  4 ++++
 6 files changed, 77 insertions(+), 6 deletions(-)

Comments

Stephen Hemminger May 6, 2022, 8:31 p.m. UTC | #1
On Fri,  6 May 2022 12:44:22 -0700
Peilin Ye <yepeilin.cs@gmail.com> wrote:

> +static inline void qdisc_backpressure_overlimit(struct Qdisc *sch, struct sk_buff *skb)
> +{
> +	struct sock *sk = skb->sk;
> +
> +	if (!sk || !sk_fullsock(sk))
> +		return;
> +
> +	if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED, SK_OVERLIMIT) == SK_UNTHROTTLED) {
> +		sock_hold(sk);
> +		list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
> +	}
> +}

What if the socket is closed? You are holding a reference, but the application may be gone.

Or if output is stalled indefinitely?
Peilin Ye May 6, 2022, 11:34 p.m. UTC | #2
Hi Stephen,

On Fri, May 06, 2022 at 01:31:11PM -0700, Stephen Hemminger wrote:
> On Fri,  6 May 2022 12:44:22 -0700, Peilin Ye <yepeilin.cs@gmail.com> wrote:
> > +static inline void qdisc_backpressure_overlimit(struct Qdisc *sch, struct sk_buff *skb)
> > +{
> > +	struct sock *sk = skb->sk;
> > +
> > +	if (!sk || !sk_fullsock(sk))
> > +		return;
> > +
> > +	if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED, SK_OVERLIMIT) == SK_UNTHROTTLED) {
> > +		sock_hold(sk);
> > +		list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
> > +	}
> > +}
> 
> What if the socket is closed? You are holding a reference, but the application may be gone.

Thanks for pointing this out!  I just understood how sk_refcnt works
together with sk_wmem_alloc.

By the time we process this in-flight skb, sk_refcnt may have already
reached 0, which means sk_free() may have already dropped that "extra" 1
from sk_wmem_alloc, so skb->destructor() may call __sk_free() while I
"hold" the sock here.  Seems like a use-after-free.
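
One direction might be (untested sketch): since in-flight skbs pin the
socket via sk_wmem_alloc rather than sk_refcnt, only link the socket if
a reference can still legitimately be taken:

  if (!sk || !sk_fullsock(sk) || !refcount_inc_not_zero(&sk->sk_refcnt))
          return;

  if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED,
              SK_OVERLIMIT) == SK_UNTHROTTLED)
          list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
  else
          sock_put(sk);   /* lost the race; drop the extra reference */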

> Or if output is stalled indefinitely?

It would be better to do the cleanup in the sock-destroying code, but I
am trying to avoid acquiring the Qdisc root_lock there.  I will try to
come up with a better solution.

Thanks,
Peilin Ye
Dave Taht May 9, 2022, 7:53 a.m. UTC | #3
I am very pleased to see this work.

However, my "vision", such as it was and as misguided as it might be,
was to implement a facility similar to tcp_notsent_lowat for UDP
packets: tracking the progress of a UDP packet through the kernel,
supplying backpressure, and reporting better information back to the
application about where, when, and why the packet was dropped in the
stack.

I've been really impressed by the DROP_REASON work; prior to seeing all
that instrumentation, I had no clue where else packets might be dropped
in the kernel.

I'd be interested to see what happens with sch_cake.
Peilin Ye May 10, 2022, 2:23 a.m. UTC | #4
Hi Dave,

On Mon, May 09, 2022 at 12:53:28AM -0700, Dave Taht wrote:
> I am very pleased to see this work.

Thanks!

> However, my "vision", such as it was and as misguided as it might be,
> was to implement a facility similar to tcp_notsent_lowat for UDP
> packets: tracking the progress of a UDP packet through the kernel,
> supplying backpressure, and reporting better information back to the
> application about where, when, and why the packet was dropped in the
> stack.

By "a facility similar to tcp_notsent_lowat", do you mean a smaller
sk_sndbuf, or "UDP Small Queues"?

I don't fully understand the implications of using a smaller sk_sndbuf
yet, but I think it can work together with this RFC.

sk_sndbuf is a per-socket attribute, while this RFC tries to improve
things from the Qdisc's perspective.  Using a smaller sk_sndbuf alone
does not prevent the "when UDP sends faster, TBF simply drops faster"
issue (described in [I] of the cover letter) from happening.  There is
always a point at which there are too many sockets for TBF's queue to
hold "sk_sndbuf times the number of sockets" (roughly speaking) bytes of
skbs.  After that point, TBF suddenly starts to drop a lot.
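
Rough arithmetic (assuming the shaper's queue limit in the cover-letter
setup [V] is well below these totals): with 21299-byte send buffers, 15
clients can have about 15 * 21299 B ~= 312 KiB of skbs in flight toward
the Qdisc, and 20 clients about 416 KiB, so lowering sk_sndbuf only
moves the cliff; it does not remove it.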

For example, I used the default 212992 sk_sndbuf
(/proc/sys/net/core/wmem_default) in the test setup ([V] in the cover
letter).  Let's make it one tenth as large, 21299.  It works well for
the 2-client setup; zero packets dropped.  However, if we test it with
15 iperf2 clients:

[  3]  0.0-30.0 sec  46.4 MBytes  13.0 Mbits/sec   1.251 ms 89991/123091 (73%)
[  3]  0.0-30.0 sec  46.6 MBytes  13.0 Mbits/sec   2.033 ms 91204/124464 (73%)
[  3]  0.0-30.0 sec  46.5 MBytes  13.0 Mbits/sec   0.504 ms 89879/123054 (73%)
<...>                                                       ^^^^^^^^^^^^ ^^^^^

73% drop rate again.  Now apply this RFC:

[  3]  0.0-30.0 sec  46.3 MBytes  12.9 Mbits/sec   1.206 ms  807/33839 (2.4%)
[  3]  0.0-30.0 sec  45.5 MBytes  12.7 Mbits/sec   1.919 ms  839/33283 (2.5%)
[  3]  0.0-30.0 sec  45.8 MBytes  12.8 Mbits/sec   2.521 ms  837/33508 (2.5%)
<...>                                                        ^^^^^^^^^ ^^^^^^

Down to roughly 2.5% again.

Next, same 21299 sk_sndbuf, 20 iperf2 clients, without RFC:

[  3]  0.0-30.0 sec  34.5 MBytes  9.66 Mbits/sec   1.054 ms 258703/283342 (91%)
[  3]  0.0-30.0 sec  34.5 MBytes  9.66 Mbits/sec   1.033 ms 257324/281964 (91%)
[  3]  0.0-30.0 sec  34.5 MBytes  9.66 Mbits/sec   1.116 ms 257858/282500 (91%)
<...>                                                       ^^^^^^^^^^^^^ ^^^^^

91% drop rate.  Finally, apply RFC:

[  3]  0.0-30.0 sec  34.4 MBytes  9.61 Mbits/sec   0.974 ms 7982/32503 (25%)
[  3]  0.0-30.0 sec  34.1 MBytes  9.54 Mbits/sec   1.381 ms 7394/31732 (23%)
[  3]  0.0-30.0 sec  34.3 MBytes  9.58 Mbits/sec   2.431 ms 8149/32583 (25%)
<...>                                                       ^^^^^^^^^^ ^^^^^

The thundering herd problem ([III] in the cover letter) surfaces, but it
is still an improvement.

In conclusion, assuming we are going to use a smaller sk_sndbuf or "UDP
Small Queues", I think it does not replace this RFC, nor does this RFC
replace it.

> I've been really impressed by the DROP_REASON work; prior to seeing
> all that instrumentation, I had no clue where else packets might be
> dropped in the kernel.
> 
> I'd be interested to see what happens with sch_cake.

Sure, I will cover sch_cake in v2.

Thanks,
Peilin Ye

Patch

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 9bab396c1f3b..5ddbe0b65cb6 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ 
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 #include <net/flow_offload.h>
+#include <net/sock.h>
 
 struct Qdisc_ops;
 struct qdisc_walker;
@@ -108,6 +109,7 @@  struct Qdisc {
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 	int			pad;
 	refcount_t		refcnt;
+	struct list_head	backpressure_list;
 
 	/*
 	 * For performance sake on SMP, we put highly modified fields at the end
@@ -1221,6 +1223,47 @@  static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
 	return NET_XMIT_DROP;
 }
 
+static inline void qdisc_backpressure_overlimit(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	if (!sk || !sk_fullsock(sk))
+		return;
+
+	if (cmpxchg(&sk->sk_backpressure_status, SK_UNTHROTTLED, SK_OVERLIMIT) == SK_UNTHROTTLED) {
+		sock_hold(sk);
+		list_add_tail(&sk->sk_backpressure_node, &sch->backpressure_list);
+	}
+}
+
+static inline void qdisc_backpressure_throttle(struct Qdisc *sch)
+{
+	struct list_head *pos;
+	struct sock *sk;
+
+	list_for_each(pos, &sch->backpressure_list) {
+		sk = list_entry(pos, struct sock, sk_backpressure_node);
+
+		WRITE_ONCE(sk->sk_backpressure_status, SK_THROTTLED);
+	}
+}
+
+static inline void qdisc_backpressure_unthrottle(struct Qdisc *sch)
+{
+	struct list_head *pos, *next;
+	struct sock *sk;
+
+	list_for_each_safe(pos, next, &sch->backpressure_list) {
+		sk = list_entry(pos, struct sock, sk_backpressure_node);
+
+		list_del_init(pos);
+		smp_store_release(&sk->sk_backpressure_status, SK_UNTHROTTLED);
+		sk->sk_write_space(sk);
+
+		sock_put(sk);
+	}
+}
+
 /* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
    long it will take to send a packet given its size.
  */
diff --git a/include/net/sock.h b/include/net/sock.h
index 73063c88a249..6ed2de43dc98 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -315,6 +315,8 @@  struct sk_filter;
   *	@sk_rcvtimeo: %SO_RCVTIMEO setting
   *	@sk_sndtimeo: %SO_SNDTIMEO setting
   *	@sk_txhash: computed flow hash for use on transmit
+  *	@sk_backpressure_status: Qdisc backpressure status
+  *	@sk_backpressure_node: linkage for Qdisc backpressure list
   *	@sk_txrehash: enable TX hash rethink
   *	@sk_filter: socket filtering instructions
   *	@sk_timer: sock cleanup timer
@@ -468,6 +470,8 @@  struct sock {
 	unsigned int		sk_gso_max_size;
 	gfp_t			sk_allocation;
 	__u32			sk_txhash;
+	u32			sk_backpressure_status; /* see enum sk_backpressure */
+	struct list_head	sk_backpressure_node;
 
 	/*
 	 * Because of non atomicity rules, all
@@ -548,6 +552,12 @@  enum sk_pacing {
 	SK_PACING_FQ		= 2,
 };
 
+enum sk_backpressure {
+	SK_UNTHROTTLED	= 0,
+	SK_OVERLIMIT	= 1,
+	SK_THROTTLED	= 2,
+};
+
 /* Pointer stored in sk_user_data might not be suitable for copying
  * when cloning the socket. For instance, it can point to a reference
  * counted object. sk_user_data bottom bit is set if pointer must not
@@ -2522,12 +2532,18 @@  static inline struct page_frag *sk_page_frag(struct sock *sk)
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
 
+static inline bool sk_is_throttled(const struct sock *sk)
+{
+	return READ_ONCE(sk->sk_backpressure_status) == SK_THROTTLED;
+}
+
 /*
  *	Default write policy as shown to user space via poll/select/SIGIO
  */
 static inline bool sock_writeable(const struct sock *sk)
 {
-	return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
+	return !sk_is_throttled(sk) &&
+	       refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
 }
 
 static inline gfp_t gfp_any(void)
diff --git a/net/core/dev.c b/net/core/dev.c
index c2d73595a7c3..7c3d136725b9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5013,6 +5013,7 @@  static __latent_entropy void net_tx_action(struct softirq_action *h)
 			if (!(q->flags & TCQ_F_NOLOCK)) {
 				root_lock = qdisc_lock(q);
 				spin_lock(root_lock);
+				qdisc_backpressure_unthrottle(q);
 			} else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
 						     &q->state))) {
 				/* There is a synchronize_net() between
diff --git a/net/core/sock.c b/net/core/sock.c
index be20a1af20e5..7ed9d2bd991f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2034,6 +2034,7 @@  struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 
 		sock_net_set(sk, net);
 		refcount_set(&sk->sk_wmem_alloc, 1);
+		INIT_LIST_HEAD(&sk->sk_backpressure_node);
 
 		mem_cgroup_sk_alloc(sk);
 		cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -2589,7 +2590,8 @@  static long sock_wait_for_wmem(struct sock *sk, long timeo)
 			break;
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
+		if (!sk_is_throttled(sk) &&
+		    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
 			break;
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			break;
@@ -2624,7 +2626,7 @@  struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			goto failure;
 
-		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
+		if (!sk_is_throttled(sk) && sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
 			break;
 
 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 918816ec5dd4..6e905995bfa2 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3006,9 +3006,14 @@  void tcp4_proc_exit(void)
  */
 bool tcp_stream_memory_free(const struct sock *sk, int wake)
 {
-	const struct tcp_sock *tp = tcp_sk(sk);
-	u32 notsent_bytes = READ_ONCE(tp->write_seq) -
-			    READ_ONCE(tp->snd_nxt);
+	const struct tcp_sock *tp;
+	u32 notsent_bytes;
+
+	if (sk_is_throttled(sk))
+		return false;
+
+	tp = tcp_sk(sk);
+	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
 
 	return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
 }
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index dba0b3e24af5..9ab314b874a7 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -674,6 +674,7 @@  struct Qdisc noop_qdisc = {
 		.qlen = 0,
 		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
 	},
+	.backpressure_list =	LIST_HEAD_INIT(noop_qdisc.backpressure_list),
 };
 EXPORT_SYMBOL(noop_qdisc);
 
@@ -947,6 +948,7 @@  struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	qdisc_skb_head_init(&sch->q);
 	gnet_stats_basic_sync_init(&sch->bstats);
 	spin_lock_init(&sch->q.lock);
+	INIT_LIST_HEAD(&sch->backpressure_list);
 
 	if (ops->static_flags & TCQ_F_CPUSTATS) {
 		sch->cpu_bstats =
@@ -1025,6 +1027,8 @@  void qdisc_reset(struct Qdisc *qdisc)
 	if (ops->reset)
 		ops->reset(qdisc);
 
+	qdisc_backpressure_unthrottle(qdisc);
+
 	__skb_queue_purge(&qdisc->gso_skb);
 	__skb_queue_purge(&qdisc->skb_bad_txq);