Message ID | 20210310053222.41371-3-xiyou.wangcong@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | sockmap: introduce BPF_SK_SKB_VERDICT and support UDP | expand |
Context | Check | Description |
---|---|---|
netdev/cover_letter | success | Link |
netdev/fixes_present | success | Link |
netdev/patch_count | success | Link |
netdev/tree_selection | success | Clearly marked for bpf-next |
netdev/subject_prefix | success | Link |
netdev/cc_maintainers | warning | 11 maintainers not CCed: dsahern@kernel.org yhs@fb.com kpsingh@kernel.org yoshfuji@linux-ipv6.org andrii@kernel.org kafai@fb.com ast@kernel.org kuba@kernel.org songliubraving@fb.com davem@davemloft.net edumazet@google.com |
netdev/source_inline | success | Was 0 now: 0 |
netdev/verify_signedoff | success | Link |
netdev/module_param | success | Was 0 now: 0 |
netdev/build_32bit | success | Errors and warnings before: 110 this patch: 110 |
netdev/kdoc | success | Errors and warnings before: 0 this patch: 0 |
netdev/verify_fixes | success | Link |
netdev/checkpatch | warning | CHECK: spinlock_t definition without comment WARNING: line length of 81 exceeds 80 columns |
netdev/build_allmodconfig_warn | success | Errors and warnings before: 110 this patch: 110 |
netdev/header_inline | success | Link |
On Wed, Mar 10, 2021 at 06:32 AM CET, Cong Wang wrote: > From: Cong Wang <cong.wang@bytedance.com> > > Currently we rely on lock_sock to protect ingress_msg, > it is too big for this, we can actually just use a spinlock > to protect this list like protecting other skb queues. > > __tcp_bpf_recvmsg() is still special because of peeking, > it still has to use lock_sock. > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Cc: Lorenz Bauer <lmb@cloudflare.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > --- One nit below. Acked-by: Jakub Sitnicki <jakub@cloudflare.com> > include/linux/skmsg.h | 46 +++++++++++++++++++++++++++++++++++++++++++ > net/core/skmsg.c | 3 +++ > net/ipv4/tcp_bpf.c | 18 ++++++----------- > 3 files changed, 55 insertions(+), 12 deletions(-) > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > index 6c09d94be2e9..7333bf881b81 100644 > --- a/include/linux/skmsg.h > +++ b/include/linux/skmsg.h > @@ -89,6 +89,7 @@ struct sk_psock { > #endif > struct sk_buff_head ingress_skb; > struct list_head ingress_msg; > + spinlock_t ingress_lock; > unsigned long state; > struct list_head link; > spinlock_t link_lock; > @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) > static inline void sk_psock_queue_msg(struct sk_psock *psock, > struct sk_msg *msg) > { > + spin_lock_bh(&psock->ingress_lock); > list_add_tail(&msg->list, &psock->ingress_msg); > + spin_unlock_bh(&psock->ingress_lock); > +} > + > +static inline struct sk_msg *sk_psock_deque_msg(struct sk_psock *psock) Should be sk_psock_deque*ue*_msg()? [...]
On Thu, Mar 11, 2021 at 3:28 AM Jakub Sitnicki <jakub@cloudflare.com> wrote: > > + > > +static inline struct sk_msg *sk_psock_deque_msg(struct sk_psock *psock) > > Should be sk_psock_deque*ue*_msg()? Right, it is better and less confusing to use "dequeue". Thanks.
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 6c09d94be2e9..7333bf881b81 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -89,6 +89,7 @@ struct sk_psock { #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; + spinlock_t ingress_lock; unsigned long state; struct list_head link; spinlock_t link_lock; @@ -284,7 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + spin_lock_bh(&psock->ingress_lock); list_add_tail(&msg->list, &psock->ingress_msg); + spin_unlock_bh(&psock->ingress_lock); +} + +static inline struct sk_msg *sk_psock_deque_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + if (msg) + list_del(&msg->list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock) +{ + struct sk_msg *msg; + + spin_lock_bh(&psock->ingress_lock); + msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list); + spin_unlock_bh(&psock->ingress_lock); + return msg; +} + +static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock, + struct sk_msg *msg) +{ + struct sk_msg *ret; + + spin_lock_bh(&psock->ingress_lock); + if (list_is_last(&msg->list, &psock->ingress_msg)) + ret = NULL; + else + ret = list_next_entry(msg, list); + spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline bool sk_psock_queue_empty(const struct sk_psock *psock) @@ -292,6 +331,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock) return psock ? list_empty(&psock->ingress_msg) : true; } +static inline void kfree_sk_msg(struct sk_msg *msg) +{ + if (msg->skb) + consume_skb(msg->skb); + kfree(msg); +} + static inline void sk_psock_report_error(struct sk_psock *psock, int err) { struct sock *sk = psock->sk; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bebf84ed4e30..41a5f82c53e6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_WORK(&psock->work, sk_psock_backlog); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -623,11 +624,13 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; + spin_lock_bh(&psock->ingress_lock); list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); sk_msg_free(psock->sk, msg); kfree(msg); } + spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_zap_ingress(struct sk_psock *psock) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 17c322b875fd..ad1261cdcdde 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg_rx; int i, copied = 0; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); - + msg_rx = sk_psock_peek_msg(psock); while (copied != len) { struct scatterlist *sge; @@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } while (i != msg_rx->sg.end); if (unlikely(peek)) { - if (msg_rx == list_last_entry(&psock->ingress_msg, - struct sk_msg, list)) + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) break; - msg_rx = list_next_entry(msg_rx, list); continue; } msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { - list_del(&msg_rx->list); - if (msg_rx->skb) - consume_skb(msg_rx->skb); - kfree(msg_rx); + msg_rx = sk_psock_deque_msg(psock); + kfree_sk_msg(msg_rx); } - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); + msg_rx = sk_psock_peek_msg(psock); } return copied;