Message ID | 20210220052924.106599-5-xiyou.wangcong@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | sock_map: clean up and refactor code for BPF_SK_SKB_VERDICT | expand |
On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote: > From: Cong Wang <cong.wang@bytedance.com> > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > does not work for any other non-TCP protocols. We can move them to > skb ext, but it introduces a memory allocation on fast path. > > Fortunately, we only need to a word-size to store all the information, > because the flags actually only contains 1 bit so can be just packed > into the lowest bit of the "pointer", which is stored as unsigned > long. > > Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is > no longer needed after ->sk_data_ready() so we can just drop it. > > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Cc: Lorenz Bauer <lmb@cloudflare.com> > Acked-by: John Fastabend <john.fastabend@gmail.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > --- LGTM. I have some questions (below) that would help me confirm if I understand the changes, and what could be improved, if anything. Acked-by: Jakub Sitnicki <jakub@cloudflare.com> > include/linux/skbuff.h | 3 +++ > include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++ > include/net/tcp.h | 19 ------------------- > net/core/skmsg.c | 32 ++++++++++++++++++++------------ > net/core/sock_map.c | 8 ++------ > 5 files changed, 60 insertions(+), 37 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index 6d0a33d1c0db..bd84f799c952 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -755,6 +755,9 @@ struct sk_buff { > void (*destructor)(struct sk_buff *skb); > }; > struct list_head tcp_tsorted_anchor; > +#ifdef CONFIG_NET_SOCK_MSG > + unsigned long _sk_redir; > +#endif > }; > > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > index e3bb712af257..fc234d507fd7 100644 > --- a/include/linux/skmsg.h > +++ b/include/linux/skmsg.h > @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) > return false; > return !!psock->saved_data_ready; > } > + > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) > +static inline bool skb_bpf_ingress(const struct sk_buff *skb) > +{ > + unsigned long sk_redir = skb->_sk_redir; > + > + return sk_redir & BPF_F_INGRESS; > +} > + > +static inline void skb_bpf_set_ingress(struct sk_buff *skb) > +{ > + skb->_sk_redir |= BPF_F_INGRESS; > +} > + > +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, > + bool ingress) > +{ > + skb->_sk_redir = (unsigned long)sk_redir; > + if (ingress) > + skb->_sk_redir |= BPF_F_INGRESS; > +} > + > +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) > +{ > + unsigned long sk_redir = skb->_sk_redir; > + > + sk_redir &= ~0x1UL; We're using the enum when setting the bit flag, but a hardcoded constant when masking it. ~BPF_F_INGRESS would be more consistent here. > + return (struct sock *)sk_redir; > +} > + > +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) > +{ > + skb->_sk_redir = 0; > +} > +#endif /* CONFIG_NET_SOCK_MSG */ > #endif /* _LINUX_SKMSG_H */ > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 947ef5da6867..075de26f449d 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -883,30 +883,11 @@ struct tcp_skb_cb { > struct inet6_skb_parm h6; > #endif > } header; /* For incoming skbs */ > - struct { > - __u32 flags; > - struct sock *sk_redir; > - } bpf; > }; > }; > > #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) > > -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) > -{ > - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; > -} > - > -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) > -{ > - return TCP_SKB_CB(skb)->bpf.sk_redir; > -} > - > -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) > -{ > - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; > -} > - > extern const struct inet_connection_sock_af_ops ipv4_specific; > > #if IS_ENABLED(CONFIG_IPV6) > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > index 2d8bbb3fd87c..05b5af09ff42 100644 > --- a/net/core/skmsg.c > +++ b/net/core/skmsg.c > @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb > static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, > u32 off, u32 len, bool ingress) > { > + skb_bpf_redirect_clear(skb); This is called to avoid leaking state in skb->_skb_refdst. Correct? I'm wondering why we're doing it every time sk_psock_handle_skb() gets invoked from the do/while loop in sk_psock_backlog(), instead of doing it once after reading ingress flag with skb_bpf_ingress()? > + > if (!ingress) { > if (!sock_writeable(psock->sk)) > return -EAGAIN; > @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work) > len = skb->len; > off = 0; > start: > - ingress = tcp_skb_bpf_ingress(skb); > + ingress = skb_bpf_ingress(skb); > do { > ret = -EIO; > if (likely(psock->sk->sk_socket)) > @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) > > static void sk_psock_zap_ingress(struct sk_psock *psock) > { > - __skb_queue_purge(&psock->ingress_skb); > + struct sk_buff *skb; > + > + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { > + skb_bpf_redirect_clear(skb); I believe we clone the skb before enqueuing it psock->ingress_skb. Clone happens either in sk_psock_verdict_recv() or in __strp_recv(). There are not other users holding a ref, so clearing the redirect seems unneeded. Unless I'm missing something? > + kfree_skb(skb); > + } > __sk_psock_purge_ingress_msg(psock); > } > > @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) > struct sk_psock *psock_other; > struct sock *sk_other; > > - sk_other = tcp_skb_bpf_redirect_fetch(skb); > + sk_other = skb_bpf_redirect_fetch(skb); > /* This error is a buggy BPF program, it returned a redirect > * return code, but then didn't set a redirect interface. > */ > @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) > * TLS context. > */ > skb->sk = psock->sk; > - tcp_skb_bpf_redirect_clear(skb); > + skb_dst_drop(skb); > + skb_bpf_redirect_clear(skb); After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the redirect_clear() is not needed. But I'm guessing it is being invoked to communicate the intention? > ret = sk_psock_bpf_run(psock, prog, skb); > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); > skb->sk = NULL; > } > sk_psock_tls_verdict_apply(skb, psock->sk, ret); > @@ -816,7 +824,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); > static void sk_psock_verdict_apply(struct sk_psock *psock, > struct sk_buff *skb, int verdict) > { > - struct tcp_skb_cb *tcp; > struct sock *sk_other; > int err = -EIO; > > @@ -828,8 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, > goto out_free; > } > > - tcp = TCP_SKB_CB(skb); > - tcp->bpf.flags |= BPF_F_INGRESS; > + skb_bpf_set_ingress(skb); > > /* If the queue is empty then we can submit directly > * into the msg queue. If its not empty we have to > @@ -890,9 +896,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) > skb_set_owner_r(skb, sk); > prog = READ_ONCE(psock->progs.skb_verdict); > if (likely(prog)) { > - tcp_skb_bpf_redirect_clear(skb); > + skb_dst_drop(skb); > + skb_bpf_redirect_clear(skb); > ret = sk_psock_bpf_run(psock, prog, skb); > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); > } > sk_psock_verdict_apply(psock, skb, ret); > out: > @@ -1005,9 +1012,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > skb_set_owner_r(skb, sk); > prog = READ_ONCE(psock->progs.skb_verdict); > if (likely(prog)) { > - tcp_skb_bpf_redirect_clear(skb); > + skb_dst_drop(skb); > + skb_bpf_redirect_clear(skb); > ret = sk_psock_bpf_run(psock, prog, skb); > - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); > + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); > } > sk_psock_verdict_apply(psock, skb, ret); > out: > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index 1a28a5c2c61e..dbfcd7006338 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { > BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, > struct bpf_map *, map, u32, key, u64, flags) > { > - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); > struct sock *sk; > > if (unlikely(flags & ~(BPF_F_INGRESS))) > @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, > if (unlikely(!sk || !sock_map_redirect_allowed(sk))) > return SK_DROP; > > - tcb->bpf.flags = flags; > - tcb->bpf.sk_redir = sk; > + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); > return SK_PASS; > } > > @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { > BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, > struct bpf_map *, map, void *, key, u64, flags) > { > - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); > struct sock *sk; > > if (unlikely(flags & ~(BPF_F_INGRESS))) > @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, > if (unlikely(!sk || !sock_map_redirect_allowed(sk))) > return SK_DROP; > > - tcb->bpf.flags = flags; > - tcb->bpf.sk_redir = sk; > + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); > return SK_PASS; > }
On Mon, Feb 22, 2021 at 4:20 AM Jakub Sitnicki <jakub@cloudflare.com> wrote: > > On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > > > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly > > does not work for any other non-TCP protocols. We can move them to > > skb ext, but it introduces a memory allocation on fast path. > > > > Fortunately, we only need to a word-size to store all the information, > > because the flags actually only contains 1 bit so can be just packed > > into the lowest bit of the "pointer", which is stored as unsigned > > long. > > > > Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is > > no longer needed after ->sk_data_ready() so we can just drop it. > > > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > Acked-by: John Fastabend <john.fastabend@gmail.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > --- > > LGTM. I have some questions (below) that would help me confirm if I > understand the changes, and what could be improved, if anything. > > Acked-by: Jakub Sitnicki <jakub@cloudflare.com> > > > include/linux/skbuff.h | 3 +++ > > include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++ > > include/net/tcp.h | 19 ------------------- > > net/core/skmsg.c | 32 ++++++++++++++++++++------------ > > net/core/sock_map.c | 8 ++------ > > 5 files changed, 60 insertions(+), 37 deletions(-) > > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index 6d0a33d1c0db..bd84f799c952 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -755,6 +755,9 @@ struct sk_buff { > > void (*destructor)(struct sk_buff *skb); > > }; > > struct list_head tcp_tsorted_anchor; > > +#ifdef CONFIG_NET_SOCK_MSG > > + unsigned long _sk_redir; > > +#endif > > }; > > > > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h > > index e3bb712af257..fc234d507fd7 100644 > > --- a/include/linux/skmsg.h > > +++ b/include/linux/skmsg.h > > @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) > > return false; > > return !!psock->saved_data_ready; > > } > > + > > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) > > +static inline bool skb_bpf_ingress(const struct sk_buff *skb) > > +{ > > + unsigned long sk_redir = skb->_sk_redir; > > + > > + return sk_redir & BPF_F_INGRESS; > > +} > > + > > +static inline void skb_bpf_set_ingress(struct sk_buff *skb) > > +{ > > + skb->_sk_redir |= BPF_F_INGRESS; > > +} > > + > > +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, > > + bool ingress) > > +{ > > + skb->_sk_redir = (unsigned long)sk_redir; > > + if (ingress) > > + skb->_sk_redir |= BPF_F_INGRESS; > > +} > > + > > +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) > > +{ > > + unsigned long sk_redir = skb->_sk_redir; > > + > > + sk_redir &= ~0x1UL; > > We're using the enum when setting the bit flag, but a hardcoded constant > when masking it. ~BPF_F_INGRESS would be more consistent here. Well, here we need a mask, not a bit, but we don't have a mask yet, hence I just use hard-coded 0x1. Does #define BPF_F_MASK 0x1UL look any better? > > > + return (struct sock *)sk_redir; > > +} > > + > > +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) > > +{ > > + skb->_sk_redir = 0; > > +} > > +#endif /* CONFIG_NET_SOCK_MSG */ > > #endif /* _LINUX_SKMSG_H */ > > diff --git a/include/net/tcp.h b/include/net/tcp.h > > index 947ef5da6867..075de26f449d 100644 > > --- a/include/net/tcp.h > > +++ b/include/net/tcp.h > > @@ -883,30 +883,11 @@ struct tcp_skb_cb { > > struct inet6_skb_parm h6; > > #endif > > } header; /* For incoming skbs */ > > - struct { > > - __u32 flags; > > - struct sock *sk_redir; > > - } bpf; > > }; > > }; > > > > #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) > > > > -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) > > -{ > > - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; > > -} > > - > > -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) > > -{ > > - return TCP_SKB_CB(skb)->bpf.sk_redir; > > -} > > - > > -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) > > -{ > > - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; > > -} > > - > > extern const struct inet_connection_sock_af_ops ipv4_specific; > > > > #if IS_ENABLED(CONFIG_IPV6) > > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > > index 2d8bbb3fd87c..05b5af09ff42 100644 > > --- a/net/core/skmsg.c > > +++ b/net/core/skmsg.c > > @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb > > static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, > > u32 off, u32 len, bool ingress) > > { > > + skb_bpf_redirect_clear(skb); > > This is called to avoid leaking state in skb->_skb_refdst. Correct? This is to teach kfree_skb() not to consider it as a valid _skb_refdst. > > I'm wondering why we're doing it every time sk_psock_handle_skb() gets > invoked from the do/while loop in sk_psock_backlog(), instead of doing > it once after reading ingress flag with skb_bpf_ingress()? It should also work, I don't see much difference here, as we almost always process a full skb, that is, ret == skb->len. > > > + > > if (!ingress) { > > if (!sock_writeable(psock->sk)) > > return -EAGAIN; > > @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work) > > len = skb->len; > > off = 0; > > start: > > - ingress = tcp_skb_bpf_ingress(skb); > > + ingress = skb_bpf_ingress(skb); > > do { > > ret = -EIO; > > if (likely(psock->sk->sk_socket)) > > @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) > > > > static void sk_psock_zap_ingress(struct sk_psock *psock) > > { > > - __skb_queue_purge(&psock->ingress_skb); > > + struct sk_buff *skb; > > + > > + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { > > + skb_bpf_redirect_clear(skb); > > I believe we clone the skb before enqueuing it psock->ingress_skb. > Clone happens either in sk_psock_verdict_recv() or in __strp_recv(). > There are not other users holding a ref, so clearing the redirect seems > unneeded. Unless I'm missing something? Yes, skb dst is also cloned: 980 static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) 981 { 982 new->tstamp = old->tstamp; 983 /* We do not copy old->sk */ 984 new->dev = old->dev; 985 memcpy(new->cb, old->cb, sizeof(old->cb)); 986 skb_dst_copy(new, old); Also, if without this, dst_release() would complain again. I was not smart enough to add it in the beginning, dst_release() taught me this lesson. ;) > > > + kfree_skb(skb); > > + } > > __sk_psock_purge_ingress_msg(psock); > > } > > > > @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) > > struct sk_psock *psock_other; > > struct sock *sk_other; > > > > - sk_other = tcp_skb_bpf_redirect_fetch(skb); > > + sk_other = skb_bpf_redirect_fetch(skb); > > /* This error is a buggy BPF program, it returned a redirect > > * return code, but then didn't set a redirect interface. > > */ > > @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) > > * TLS context. > > */ > > skb->sk = psock->sk; > > - tcp_skb_bpf_redirect_clear(skb); > > + skb_dst_drop(skb); > > + skb_bpf_redirect_clear(skb); > > After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the > redirect_clear() is not needed. But I'm guessing it is being invoked > to communicate the intention? Technically true, but I prefer to call them explicitly, not to rely on the fact skb->_skb_refdst shares the same storage with skb->_sk_redir, which would also require some comments to explain. Thanks.
On Mon, Feb 22, 2021 at 08:27 PM CET, Cong Wang wrote: > On Mon, Feb 22, 2021 at 4:20 AM Jakub Sitnicki <jakub@cloudflare.com> wrote: >> >> On Sat, Feb 20, 2021 at 06:29 AM CET, Cong Wang wrote: >> > From: Cong Wang <cong.wang@bytedance.com> >> > >> > Currently TCP_SKB_CB() is hard-coded in skmsg code, it certainly >> > does not work for any other non-TCP protocols. We can move them to >> > skb ext, but it introduces a memory allocation on fast path. >> > >> > Fortunately, we only need to a word-size to store all the information, >> > because the flags actually only contains 1 bit so can be just packed >> > into the lowest bit of the "pointer", which is stored as unsigned >> > long. >> > >> > Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is >> > no longer needed after ->sk_data_ready() so we can just drop it. >> > >> > Cc: Daniel Borkmann <daniel@iogearbox.net> >> > Cc: Jakub Sitnicki <jakub@cloudflare.com> >> > Cc: Lorenz Bauer <lmb@cloudflare.com> >> > Acked-by: John Fastabend <john.fastabend@gmail.com> >> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> >> > --- >> >> LGTM. I have some questions (below) that would help me confirm if I >> understand the changes, and what could be improved, if anything. >> >> Acked-by: Jakub Sitnicki <jakub@cloudflare.com> >> >> > include/linux/skbuff.h | 3 +++ >> > include/linux/skmsg.h | 35 +++++++++++++++++++++++++++++++++++ >> > include/net/tcp.h | 19 ------------------- >> > net/core/skmsg.c | 32 ++++++++++++++++++++------------ >> > net/core/sock_map.c | 8 ++------ >> > 5 files changed, 60 insertions(+), 37 deletions(-) >> > >> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h >> > index 6d0a33d1c0db..bd84f799c952 100644 >> > --- a/include/linux/skbuff.h >> > +++ b/include/linux/skbuff.h >> > @@ -755,6 +755,9 @@ struct sk_buff { >> > void (*destructor)(struct sk_buff *skb); >> > }; >> > struct list_head tcp_tsorted_anchor; >> > +#ifdef CONFIG_NET_SOCK_MSG >> > + unsigned long _sk_redir; >> > +#endif >> > }; >> > >> > #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) >> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h >> > index e3bb712af257..fc234d507fd7 100644 >> > --- a/include/linux/skmsg.h >> > +++ b/include/linux/skmsg.h >> > @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) >> > return false; >> > return !!psock->saved_data_ready; >> > } >> > + >> > +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) >> > +static inline bool skb_bpf_ingress(const struct sk_buff *skb) >> > +{ >> > + unsigned long sk_redir = skb->_sk_redir; >> > + >> > + return sk_redir & BPF_F_INGRESS; >> > +} >> > + >> > +static inline void skb_bpf_set_ingress(struct sk_buff *skb) >> > +{ >> > + skb->_sk_redir |= BPF_F_INGRESS; >> > +} >> > + >> > +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, >> > + bool ingress) >> > +{ >> > + skb->_sk_redir = (unsigned long)sk_redir; >> > + if (ingress) >> > + skb->_sk_redir |= BPF_F_INGRESS; >> > +} >> > + >> > +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) >> > +{ >> > + unsigned long sk_redir = skb->_sk_redir; >> > + >> > + sk_redir &= ~0x1UL; >> >> We're using the enum when setting the bit flag, but a hardcoded constant >> when masking it. ~BPF_F_INGRESS would be more consistent here. > > Well, here we need a mask, not a bit, but we don't have a mask yet, > hence I just use hard-coded 0x1. Does #define BPF_F_MASK 0x1UL > look any better? Based on what I've seen around, mask for sanitizing tagged pointers is usually derived from the flag(s). For instance: #define SKB_DST_NOREF 1UL #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL /* Managed by BPF */ #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) Using ~(BPF_F_INGRESS) expression would be like substituting mask definition. [..] >> > diff --git a/include/net/tcp.h b/include/net/tcp.h >> > index 947ef5da6867..075de26f449d 100644 >> > --- a/include/net/tcp.h >> > +++ b/include/net/tcp.h >> > @@ -883,30 +883,11 @@ struct tcp_skb_cb { >> > struct inet6_skb_parm h6; >> > #endif >> > } header; /* For incoming skbs */ >> > - struct { >> > - __u32 flags; >> > - struct sock *sk_redir; >> > - } bpf; >> > }; >> > }; >> > >> > #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) >> > >> > -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) >> > -{ >> > - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; >> > -} >> > - >> > -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) >> > -{ >> > - return TCP_SKB_CB(skb)->bpf.sk_redir; >> > -} >> > - >> > -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) >> > -{ >> > - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; >> > -} >> > - >> > extern const struct inet_connection_sock_af_ops ipv4_specific; >> > >> > #if IS_ENABLED(CONFIG_IPV6) >> > diff --git a/net/core/skmsg.c b/net/core/skmsg.c >> > index 2d8bbb3fd87c..05b5af09ff42 100644 >> > --- a/net/core/skmsg.c >> > +++ b/net/core/skmsg.c >> > @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb >> > static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, >> > u32 off, u32 len, bool ingress) >> > { >> > + skb_bpf_redirect_clear(skb); >> >> This is called to avoid leaking state in skb->_skb_refdst. Correct? > > This is to teach kfree_skb() not to consider it as a valid _skb_refdst. OK > >> >> I'm wondering why we're doing it every time sk_psock_handle_skb() gets >> invoked from the do/while loop in sk_psock_backlog(), instead of doing >> it once after reading ingress flag with skb_bpf_ingress()? > > It should also work, I don't see much difference here, as we almost > always process a full skb, that is, ret == skb->len. OK > > >> >> > + >> > if (!ingress) { >> > if (!sock_writeable(psock->sk)) >> > return -EAGAIN; >> > @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work) >> > len = skb->len; >> > off = 0; >> > start: >> > - ingress = tcp_skb_bpf_ingress(skb); >> > + ingress = skb_bpf_ingress(skb); >> > do { >> > ret = -EIO; >> > if (likely(psock->sk->sk_socket)) >> > @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) >> > >> > static void sk_psock_zap_ingress(struct sk_psock *psock) >> > { >> > - __skb_queue_purge(&psock->ingress_skb); >> > + struct sk_buff *skb; >> > + >> > + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { >> > + skb_bpf_redirect_clear(skb); >> >> I believe we clone the skb before enqueuing it psock->ingress_skb. >> Clone happens either in sk_psock_verdict_recv() or in __strp_recv(). >> There are not other users holding a ref, so clearing the redirect seems >> unneeded. Unless I'm missing something? > > Yes, skb dst is also cloned: > > 980 static void __copy_skb_header(struct sk_buff *new, const struct > sk_buff *old) > 981 { > 982 new->tstamp = old->tstamp; > 983 /* We do not copy old->sk */ > 984 new->dev = old->dev; > 985 memcpy(new->cb, old->cb, sizeof(old->cb)); > 986 skb_dst_copy(new, old); > > Also, if without this, dst_release() would complain again. I was not smart > enough to add it in the beginning, dst_release() taught me this lesson. ;) OK, I think I follow you now. Alternatively we could clear _skb_refdest after clone, but before enqueuing the skb in ingress_skb. And only for when we're redirecting. I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail. > >> >> > + kfree_skb(skb); >> > + } >> > __sk_psock_purge_ingress_msg(psock); >> > } >> > >> > @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) >> > struct sk_psock *psock_other; >> > struct sock *sk_other; >> > >> > - sk_other = tcp_skb_bpf_redirect_fetch(skb); >> > + sk_other = skb_bpf_redirect_fetch(skb); >> > /* This error is a buggy BPF program, it returned a redirect >> > * return code, but then didn't set a redirect interface. >> > */ >> > @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) >> > * TLS context. >> > */ >> > skb->sk = psock->sk; >> > - tcp_skb_bpf_redirect_clear(skb); >> > + skb_dst_drop(skb); >> > + skb_bpf_redirect_clear(skb); >> >> After skb_dst_drop(), skb->_skb_refdst is clear. So it seems the >> redirect_clear() is not needed. But I'm guessing it is being invoked >> to communicate the intention? > > Technically true, but I prefer to call them explicitly, not to rely on the > fact skb->_skb_refdst shares the same storage with skb->_sk_redir, > which would also require some comments to explain. > OK
On Tue, Feb 23, 2021 at 9:53 AM Jakub Sitnicki <jakub@cloudflare.com> wrote: > Based on what I've seen around, mask for sanitizing tagged pointers is > usually derived from the flag(s). For instance: > > #define SKB_DST_NOREF 1UL > #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) > > #define SK_USER_DATA_NOCOPY 1UL > #define SK_USER_DATA_BPF 2UL /* Managed by BPF */ > #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) > > Using ~(BPF_F_INGRESS) expression would be like substituting mask > definition. Yes, that is why I said we need a mask. > > Alternatively we could clear _skb_refdest after clone, but before > enqueuing the skb in ingress_skb. And only for when we're redirecting. > > I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail. Hmm? We definitely cannot clear skb->_sk_redir there, as it is used after enqueued in ingress_skb, that is in sk_psock_backlog(). Thanks.
On Tue, Feb 23, 2021 at 07:04 PM CET, Cong Wang wrote: > On Tue, Feb 23, 2021 at 9:53 AM Jakub Sitnicki <jakub@cloudflare.com> wrote: >> Based on what I've seen around, mask for sanitizing tagged pointers is >> usually derived from the flag(s). For instance: >> >> #define SKB_DST_NOREF 1UL >> #define SKB_DST_PTRMASK ~(SKB_DST_NOREF) >> >> #define SK_USER_DATA_NOCOPY 1UL >> #define SK_USER_DATA_BPF 2UL /* Managed by BPF */ >> #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) >> >> Using ~(BPF_F_INGRESS) expression would be like substituting mask >> definition. > > Yes, that is why I said we need a mask. OK > >> >> Alternatively we could clear _skb_refdest after clone, but before >> enqueuing the skb in ingress_skb. And only for when we're redirecting. >> >> I believe that would be in sk_psock_skb_redirect, right before skb_queue_tail. > > Hmm? We definitely cannot clear skb->_sk_redir there, as it is used after > enqueued in ingress_skb, that is in sk_psock_backlog(). You're right. I focused on the sk pointer and forgot it also carries the ingress flag.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6d0a33d1c0db..bd84f799c952 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -755,6 +755,9 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); }; struct list_head tcp_tsorted_anchor; +#ifdef CONFIG_NET_SOCK_MSG + unsigned long _sk_redir; +#endif }; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e3bb712af257..fc234d507fd7 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -459,4 +459,39 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) return false; return !!psock->saved_data_ready; } + +#if IS_ENABLED(CONFIG_NET_SOCK_MSG) +static inline bool skb_bpf_ingress(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + return sk_redir & BPF_F_INGRESS; +} + +static inline void skb_bpf_set_ingress(struct sk_buff *skb) +{ + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline void skb_bpf_set_redir(struct sk_buff *skb, struct sock *sk_redir, + bool ingress) +{ + skb->_sk_redir = (unsigned long)sk_redir; + if (ingress) + skb->_sk_redir |= BPF_F_INGRESS; +} + +static inline struct sock *skb_bpf_redirect_fetch(const struct sk_buff *skb) +{ + unsigned long sk_redir = skb->_sk_redir; + + sk_redir &= ~0x1UL; + return (struct sock *)sk_redir; +} + +static inline void skb_bpf_redirect_clear(struct sk_buff *skb) +{ + skb->_sk_redir = 0; +} +#endif /* CONFIG_NET_SOCK_MSG */ #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 947ef5da6867..075de26f449d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -883,30 +883,11 @@ struct tcp_skb_cb { struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ - struct { - __u32 flags; - struct sock *sk_redir; - } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) -static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; -} - -static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) -{ - return TCP_SKB_CB(skb)->bpf.sk_redir; -} - -static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.sk_redir = NULL; -} - extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2d8bbb3fd87c..05b5af09ff42 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -494,6 +494,8 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len, bool ingress) { + skb_bpf_redirect_clear(skb); + if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; @@ -525,7 +527,7 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +633,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -752,7 +759,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -802,9 +809,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -816,7 +824,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -828,8 +835,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue. If its not empty we have to @@ -890,9 +896,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1005,9 +1012,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; }