Message ID | 20210328202013.29223-8-xiyou.wangcong@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | sockmap: introduce BPF_SK_SKB_VERDICT and support UDP | expand |
Cong Wang wrote: > From: Cong Wang <cong.wang@bytedance.com> > > Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is > confusing and more importantly we still want to distinguish them > from user-space. So we can just reuse the stream verdict code but > introduce a new type of eBPF program, skb_verdict. Users are not > allowed to set stream_verdict and skb_verdict at the same time. > > Cc: John Fastabend <john.fastabend@gmail.com> > Cc: Daniel Borkmann <daniel@iogearbox.net> > Cc: Jakub Sitnicki <jakub@cloudflare.com> > Cc: Lorenz Bauer <lmb@cloudflare.com> > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > --- [...] > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > index 656eceab73bc..a045812d7c78 100644 > --- a/net/core/skmsg.c > +++ b/net/core/skmsg.c > @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) > rcu_assign_sk_user_data(sk, NULL); > if (psock->progs.stream_parser) > sk_psock_stop_strp(sk, psock); > - else if (psock->progs.stream_verdict) > + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) > sk_psock_stop_verdict(sk, psock); > write_unlock_bh(&sk->sk_callback_lock); > > @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > } > skb_set_owner_r(skb, sk); > prog = READ_ONCE(psock->progs.stream_verdict); > + if (!prog) > + prog = READ_ONCE(psock->progs.skb_verdict); Trying to think through this case. A user attaches an skb_verdict program to a map, then updates the map with a bunch of TCP sockets. The above code will run the skb_verdict program with the TCP socket as far as I can tell. This is OK because there really is no difference, other than by name, between an skb_verdict and a stream_verdict program? Do we want something to block adding TCP sockets to maps with stream_verdict programs? It feels a bit odd in its current state to me. 
> if (likely(prog)) { > skb_dst_drop(skb); > skb_bpf_redirect_clear(skb); > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > index e564fdeaada1..c46709786a49 100644 > --- a/net/core/sock_map.c > +++ b/net/core/sock_map.c > @@ -155,6 +155,8 @@ static void sock_map_del_link(struct sock *sk, > strp_stop = true; > if (psock->saved_data_ready && stab->progs.stream_verdict) > verdict_stop = true; > + if (psock->saved_data_ready && stab->progs.skb_verdict) > + verdict_stop = true; > list_del(&link->list); > sk_psock_free_link(link); > } > @@ -227,7 +229,7 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) > static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > struct sock *sk) > { > - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; > + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict, *skb_verdict; > struct sk_psock *psock; > int ret; > > @@ -256,6 +258,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > } > } > > + skb_verdict = READ_ONCE(progs->skb_verdict); > + if (skb_verdict) { > + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); > + if (IS_ERR(skb_verdict)) { > + ret = PTR_ERR(skb_verdict); > + goto out_put_msg_parser; > + } > + } > + > psock = sock_map_psock_get_checked(sk); > if (IS_ERR(psock)) { > ret = PTR_ERR(psock); > @@ -265,6 +276,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > if (psock) { > if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || > (stream_parser && READ_ONCE(psock->progs.stream_parser)) || > + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || > (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { > sk_psock_put(sk, psock); > ret = -EBUSY; Do we need another test here: (skb_verdict && READ_ONCE(psock->progs.stream_verdict))? This way we return EBUSY and avoid having both stream_verdict and skb_verdict attached on the same map. 
From commit msg: "Users are not allowed to set stream_verdict and skb_verdict at the same time." > @@ -296,6 +308,9 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { > psock_set_prog(&psock->progs.stream_verdict, stream_verdict); > sk_psock_start_verdict(sk,psock); > + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { > + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); > + sk_psock_start_verdict(sk, psock); Thanks, John
On Mon, Mar 29, 2021 at 1:10 PM John Fastabend <john.fastabend@gmail.com> wrote: > > Cong Wang wrote: > > From: Cong Wang <cong.wang@bytedance.com> > > > > Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is > > confusing and more importantly we still want to distinguish them > > from user-space. So we can just reuse the stream verdict code but > > introduce a new type of eBPF program, skb_verdict. Users are not > > allowed to set stream_verdict and skb_verdict at the same time. > > > > Cc: John Fastabend <john.fastabend@gmail.com> > > Cc: Daniel Borkmann <daniel@iogearbox.net> > > Cc: Jakub Sitnicki <jakub@cloudflare.com> > > Cc: Lorenz Bauer <lmb@cloudflare.com> > > Signed-off-by: Cong Wang <cong.wang@bytedance.com> > > --- > > [...] > > > diff --git a/net/core/skmsg.c b/net/core/skmsg.c > > index 656eceab73bc..a045812d7c78 100644 > > --- a/net/core/skmsg.c > > +++ b/net/core/skmsg.c > > @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) > > rcu_assign_sk_user_data(sk, NULL); > > if (psock->progs.stream_parser) > > sk_psock_stop_strp(sk, psock); > > - else if (psock->progs.stream_verdict) > > + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) > > sk_psock_stop_verdict(sk, psock); > > write_unlock_bh(&sk->sk_callback_lock); > > > > @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, > > } > > skb_set_owner_r(skb, sk); > > prog = READ_ONCE(psock->progs.stream_verdict); > > + if (!prog) > > + prog = READ_ONCE(psock->progs.skb_verdict); > > Trying to think through this case. User attachs skb_verdict program > to map, then updates map with a bunch of TCP sockets. The above > code will run the skb_verdict program with the TCP socket as far as > I can tell. > > This is OK because there really is no difference, other than by name, > between a skb_verdict and a stream_verdict program? 
Do we want something > to block adding TCP sockets to maps with stream_verdict programs? It > feels a bit odd in its current state to me. Yes, it should work too. skb_verdict only extends stream_verdict beyond TCP, it does not prohibit TCP. > > > if (likely(prog)) { > > skb_dst_drop(skb); > > skb_bpf_redirect_clear(skb); > > diff --git a/net/core/sock_map.c b/net/core/sock_map.c > > index e564fdeaada1..c46709786a49 100644 > > --- a/net/core/sock_map.c > > +++ b/net/core/sock_map.c > > @@ -155,6 +155,8 @@ static void sock_map_del_link(struct sock *sk, > > strp_stop = true; > > if (psock->saved_data_ready && stab->progs.stream_verdict) > > verdict_stop = true; > > + if (psock->saved_data_ready && stab->progs.skb_verdict) > > + verdict_stop = true; > > list_del(&link->list); > > sk_psock_free_link(link); > > } > > @@ -227,7 +229,7 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) > > static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > > struct sock *sk) > > { > > - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; > > + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict, *skb_verdict; > > struct sk_psock *psock; > > int ret; > > > > @@ -256,6 +258,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > > } > > } > > > > + skb_verdict = READ_ONCE(progs->skb_verdict); > > + if (skb_verdict) { > > + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); > > + if (IS_ERR(skb_verdict)) { > > + ret = PTR_ERR(skb_verdict); > > + goto out_put_msg_parser; > > + } > > + } > > + > > psock = sock_map_psock_get_checked(sk); > > if (IS_ERR(psock)) { > > ret = PTR_ERR(psock); > > @@ -265,6 +276,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, > > if (psock) { > > if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || > > (stream_parser && READ_ONCE(psock->progs.stream_parser)) || > > + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || > > 
(stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { > > sk_psock_put(sk, psock); > > ret = -EBUSY; > > Do we need another test here, > > (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) > > this way we return EBUSY and avoid having both stream_verdict and > skb_verdict attached on the same map? Yes, good catch, we do need a check here. And I will see if I can add a small test case for this too. Thanks.
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index e7aba150539d..c83dbc2d81d9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -58,6 +58,7 @@ struct sk_psock_progs { struct bpf_prog *msg_parser; struct bpf_prog *stream_parser; struct bpf_prog *stream_verdict; + struct bpf_prog *skb_verdict; }; enum sk_psock_state_bits { @@ -487,6 +488,7 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->msg_parser, NULL); psock_set_prog(&progs->stream_parser, NULL); psock_set_prog(&progs->stream_verdict, NULL); + psock_set_prog(&progs->skb_verdict, NULL); } int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 598716742593..49371eba98ba 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9603de81811a..6428634da57e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2948,6 +2948,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_SK_MSG; case BPF_SK_SKB_STREAM_PARSER: case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_SKB_VERDICT: return BPF_PROG_TYPE_SK_SKB; case BPF_LIRC_MODE2: return BPF_PROG_TYPE_LIRC_MODE2; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, 
struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e564fdeaada1..c46709786a49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -155,6 +155,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -227,7 +229,7 @@ static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict, *skb_verdict; struct sk_psock *psock; int ret; @@ -256,6 +258,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -265,6 +276,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -296,6 +308,9 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { 
psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -304,6 +319,9 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1468,6 +1486,9 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", [BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ab9f2233607c..69902603012c 100644 
--- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE };