Message ID | 1729737768-124596-4-git-send-email-alibuda@linux.alibaba.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net/smc: Introduce smc_bpf_ops | expand |
On 10/23/24 7:42 PM, D. Wythe wrote: > From: "D. Wythe" <alibuda@linux.alibaba.com> > > The introduction of IPPROTO_SMC enables eBPF programs to determine > whether to use SMC based on the context of socket creation, such as > network namespaces, PID and comm name, etc. > > As a subsequent enhancement, this patch introduces a new hook for eBPF > programs that allows decisions on whether to use SMC or not at runtime, > including but not limited to local/remote IP address or ports. In > simpler words, this feature allows modifications to syn_smc through eBPF > programs before the TCP three-way handshake got established. > > Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> > --- > include/linux/tcp.h | 2 +- > include/net/smc.h | 47 +++++++++++ > include/net/tcp.h | 6 ++ > net/ipv4/tcp_input.c | 3 +- > net/ipv4/tcp_output.c | 14 +++- > net/smc/Kconfig | 12 +++ > net/smc/Makefile | 1 + > net/smc/af_smc.c | 38 ++++++--- > net/smc/smc.h | 4 + > net/smc/smc_bpf.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++ > net/smc/smc_bpf.h | 34 ++++++++ > 11 files changed, 357 insertions(+), 16 deletions(-) > create mode 100644 net/smc/smc_bpf.c > create mode 100644 net/smc/smc_bpf.h > > diff --git a/include/linux/tcp.h b/include/linux/tcp.h > index 6a5e08b..4ef160a 100644 > --- a/include/linux/tcp.h > +++ b/include/linux/tcp.h > @@ -478,7 +478,7 @@ struct tcp_sock { > #endif > #if IS_ENABLED(CONFIG_SMC) > bool syn_smc; /* SYN includes SMC */ > - bool (*smc_hs_congested)(const struct sock *sk); > + struct tcpsmc_ctx *smc; > #endif > > #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) > diff --git a/include/net/smc.h b/include/net/smc.h > index db84e4e..34ab2c6 100644 > --- a/include/net/smc.h > +++ b/include/net/smc.h > @@ -18,6 +18,8 @@ > #include "linux/ism.h" > > struct sock; > +struct tcp_sock; > +struct inet_request_sock; > > #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ > > @@ -97,4 +99,49 @@ struct smcd_dev { > u8 going_away : 1; > }; > > +/* > + * This structure is used to store the parameters passed to the member of struct_ops. > + * Due to the BPF verifier cannot restrict the writing of bit fields, such as limiting > + * it to only write ireq->smc_ok. Using kfunc can solve this issue, but we don't want > + * to introduce a kfunc with such a narrow function. imo, adding kfunc is fine. > + * > + * Moreover, using this structure for unified parameters also addresses another > + * potential issue. Currently, kfunc cannot recognize the calling context > + * through BPF's existing structure. In the future, we can solve this problem > + * by passing this ctx to kfunc. This part I don't understand. How is it different from the "tcp_cubic_kfunc_set" allowed in tcp_congestion_ops? > + */ > +struct smc_bpf_ops_ctx { > + struct { > + struct tcp_sock *tp; > + } set_option; > + struct { > + const struct tcp_sock *tp; > + struct inet_request_sock *ireq; > + int smc_ok; > + } set_option_cond; > +}; There is no need to create one single ctx for struct_ops prog. struct_ops prog can take >1 args and different ops can take different args. > + > +struct smc_bpf_ops { > + /* priavte */ > + > + struct list_head list; > + > + /* public */ > + > + /* Invoked before computing SMC option for SYN packets. > + * We can control whether to set SMC options by modifying > + * ctx->set_option->tp->syn_smc. > + * This's also the only member that can be modified now. > + * Only member in ctx->set_option is valid for this callback. > + */ > + void (*set_option)(struct smc_bpf_ops_ctx *ctx); > + > + /* Invoked before Set up SMC options for SYN-ACK packets > + * We can control whether to respond SMC options by modifying > + * ctx->set_option_cond.smc_ok. > + * Only member in ctx->set_option_cond is valid for this callback. 
> + */ > + void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx); The struct smc_bpf_ops already has set_option and set_option_cnd, but... > +}; > + > #endif /* _SMC_H */ > diff --git a/include/net/tcp.h b/include/net/tcp.h > index 739a9fb..c322443 100644 > --- a/include/net/tcp.h > +++ b/include/net/tcp.h > @@ -2730,6 +2730,12 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) > > #if IS_ENABLED(CONFIG_SMC) > extern struct static_key_false tcp_have_smc; > +struct tcpsmc_ctx { > + /* Invoked before computing SMC option for SYN packets. */ > + void (*set_option)(struct tcp_sock *tp); > + /* Invoked before Set up SMC options for SYN-ACK packets */ > + void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq); > +}; another new struct tcpsmc_ctx has exactly the same functions (at least the same name) but different arguments. I don't understand why this duplicate, is it because the need to prepare the "struct smc_bpf_ops_ctx"? The "struct tcpsmc_ctx" should be the "struct smc_bpf_ops" itself. [ ... ] > +static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log, > + const struct bpf_reg_state *reg, > + const struct bpf_prog *prog, > + int off, int size) > +{ > + const struct btf_member *member; > + const char *mname; > + int member_idx; > + > + member_idx = prog->expected_attach_type; > + if (member_idx >= btf_type_vlen(smc_bpf_ops_type)) > + goto out_err; > + > + member = &btf_type_member(smc_bpf_ops_type)[member_idx]; > + mname = btf_str_by_offset(saved_btf, member->name_off); > + > + if (!strcmp(mname, "set_option")) { btf_member_bit_offset can be used instead of strcmp. 
Take a look at bpf_tcp_ca.c and kernel/sched/ext.c > + /* only support to modify tcp_sock->syn_smc */ > + if (reg->btf_id == tcp_sock_id && > + off == offsetof(struct tcp_sock, syn_smc) && > + off + size == offsetofend(struct tcp_sock, syn_smc)) > + return 0; > + } else if (!strcmp(mname, "set_option_cond")) { > + /* only support to modify smc_bpf_ops_ctx->smc_ok */ > + if (reg->btf_id == smc_bpf_ops_ctx_id && > + off == offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok) && > + off + size == offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok)) > + return 0; > + } > + > +out_err: > + return -EACCES; > +} > + > +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { > + .get_func_proto = bpf_base_func_proto, > + .is_valid_access = bpf_tracing_btf_ctx_access, > + .btf_struct_access = smc_bpf_ops_btf_struct_access, > +}; > + > +static struct bpf_struct_ops bpf_smc_bpf_ops = { > + .init = smc_bpf_ops_init, > + .name = "smc_bpf_ops", > + .reg = smc_bpf_ops_reg, > + .unreg = smc_bpf_ops_unreg, > + .cfi_stubs = &__bpf_smc_bpf_ops, > + .verifier_ops = &smc_bpf_verifier_ops, > + .init_member = smc_bpf_ops_init_member, > + .check_member = smc_bpf_ops_check_member, > + .owner = THIS_MODULE, > +}; > + > +int smc_bpf_struct_ops_init(void) > +{ > + return register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops); > +} > + > +void bpf_smc_set_tcp_option(struct tcp_sock *tp) > +{ > + struct smc_bpf_ops_ctx ops_ctx = {}; > + struct smc_bpf_ops *ops; > + > + ops_ctx.set_option.tp = tp; All this initialization should be unnecessary. Directly pass tp instead. > + > + rcu_read_lock(); > + list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) { Does it need to have a list (meaning >1) of smc_bpf_ops to act on a sock? The ordering expectation is hard to manage. > + ops->set_option(&ops_ctx); A dumb question. This will only affect AF_SMC (or AF_INET[6]/IPPROTO_SMC) socket but not the AF_INET[6]/IPPROTO_{TCP,UDP} socket? pw-bot: cr > + } > + rcu_read_unlock(); > +}
On 10/25/24 8:26 AM, Martin KaFai Lau wrote: > On 10/23/24 7:42 PM, D. Wythe wrote: >> From: "D. Wythe" <alibuda@linux.alibaba.com> >> >> The introduction of IPPROTO_SMC enables eBPF programs to determine >> whether to use SMC based on the context of socket creation, such as >> network namespaces, PID and comm name, etc. >> >> As a subsequent enhancement, this patch introduces a new hook for eBPF >> programs that allows decisions on whether to use SMC or not at runtime, >> including but not limited to local/remote IP address or ports. In >> simpler words, this feature allows modifications to syn_smc through eBPF >> programs before the TCP three-way handshake got established. >> >> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com> >> --- >> include/linux/tcp.h | 2 +- >> include/net/smc.h | 47 +++++++++++ >> include/net/tcp.h | 6 ++ >> net/ipv4/tcp_input.c | 3 +- >> net/ipv4/tcp_output.c | 14 +++- >> net/smc/Kconfig | 12 +++ >> net/smc/Makefile | 1 + >> net/smc/af_smc.c | 38 ++++++--- >> net/smc/smc.h | 4 + >> net/smc/smc_bpf.c | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++ >> net/smc/smc_bpf.h | 34 ++++++++ >> 11 files changed, 357 insertions(+), 16 deletions(-) >> create mode 100644 net/smc/smc_bpf.c >> create mode 100644 net/smc/smc_bpf.h >> >> diff --git a/include/linux/tcp.h b/include/linux/tcp.h >> index 6a5e08b..4ef160a 100644 >> --- a/include/linux/tcp.h >> +++ b/include/linux/tcp.h >> @@ -478,7 +478,7 @@ struct tcp_sock { >> #endif >> #if IS_ENABLED(CONFIG_SMC) >> bool syn_smc; /* SYN includes SMC */ >> - bool (*smc_hs_congested)(const struct sock *sk); >> + struct tcpsmc_ctx *smc; >> #endif >> #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) >> diff --git a/include/net/smc.h b/include/net/smc.h >> index db84e4e..34ab2c6 100644 >> --- a/include/net/smc.h >> +++ b/include/net/smc.h >> @@ -18,6 +18,8 @@ >> #include "linux/ism.h" >> struct sock; >> +struct tcp_sock; >> +struct inet_request_sock; >> #define SMC_MAX_PNETID_LEN 16 /* Max. 
length of PNET id */ >> @@ -97,4 +99,49 @@ struct smcd_dev { >> u8 going_away : 1; >> }; >> +/* >> + * This structure is used to store the parameters passed to the member of struct_ops. >> + * Due to the BPF verifier cannot restrict the writing of bit fields, such as limiting >> + * it to only write ireq->smc_ok. Using kfunc can solve this issue, but we don't want >> + * to introduce a kfunc with such a narrow function. > > imo, adding kfunc is fine. > >> + * >> + * Moreover, using this structure for unified parameters also addresses another >> + * potential issue. Currently, kfunc cannot recognize the calling context >> + * through BPF's existing structure. In the future, we can solve this problem >> + * by passing this ctx to kfunc. > > This part I don't understand. How is it different from the "tcp_cubic_kfunc_set" allowed in > tcp_congestion_ops? Hi Martin, Yes, creating an independent kfunc for each callback and filtering via expected_attach_type can indeed solve the problem. Our main concern is to avoid introducing kfuncs as much as possible. For our subsystem, we might need to maintain it in a way that maintains a uapi, as we certainly have user applications depending on it. This is also why we need to create a separate ctx, as there’s no way to restrict bit writes, so we created a ctx->smc_ok that is allowed to write. This is also why we had to create a separate structure, tcpsmc_ctx ... However, I now realize that compromising to avoid introducing kfuncs has gone too far, affecting the readability of the code. I will try to use kfuncs in the next version to solve those issues. > >> + */ >> +struct smc_bpf_ops_ctx { >> + struct { >> + struct tcp_sock *tp; >> + } set_option; >> + struct { >> + const struct tcp_sock *tp; >> + struct inet_request_sock *ireq; >> + int smc_ok; >> + } set_option_cond; >> +}; > > There is no need to create one single ctx for struct_ops prog. struct_ops prog can take >1 args and > different ops can take different args. 
> Same reason with concern on kfunc. I'll change it in next version. >> + >> +struct smc_bpf_ops { >> + /* priavte */ >> + >> + struct list_head list; >> + >> + /* public */ >> + >> + /* Invoked before computing SMC option for SYN packets. >> + * We can control whether to set SMC options by modifying >> + * ctx->set_option->tp->syn_smc. >> + * This's also the only member that can be modified now. >> + * Only member in ctx->set_option is valid for this callback. >> + */ >> + void (*set_option)(struct smc_bpf_ops_ctx *ctx); >> + >> + /* Invoked before Set up SMC options for SYN-ACK packets >> + * We can control whether to respond SMC options by modifying >> + * ctx->set_option_cond.smc_ok. >> + * Only member in ctx->set_option_cond is valid for this callback. >> + */ >> + void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx); > > The struct smc_bpf_ops already has set_option and set_option_cnd, but... > >> +}; >> + >> #endif /* _SMC_H */ >> diff --git a/include/net/tcp.h b/include/net/tcp.h >> index 739a9fb..c322443 100644 >> --- a/include/net/tcp.h >> +++ b/include/net/tcp.h >> @@ -2730,6 +2730,12 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) >> #if IS_ENABLED(CONFIG_SMC) >> extern struct static_key_false tcp_have_smc; >> +struct tcpsmc_ctx { >> + /* Invoked before computing SMC option for SYN packets. */ >> + void (*set_option)(struct tcp_sock *tp); >> + /* Invoked before Set up SMC options for SYN-ACK packets */ >> + void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq); >> +}; > > another new struct tcpsmc_ctx has exactly the same functions (at least the same name) but different > arguments. I don't understand why this duplicate, is it because the need to prepare the "struct > smc_bpf_ops_ctx"? Yes, same reason with concern on kfunc. I'll change it in next version. > > The "struct tcpsmc_ctx" should be the "struct smc_bpf_ops" itself. > > [ ... 
] > >> +static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log, >> + const struct bpf_reg_state *reg, >> + const struct bpf_prog *prog, >> + int off, int size) >> +{ >> + const struct btf_member *member; >> + const char *mname; >> + int member_idx; >> + >> + member_idx = prog->expected_attach_type; >> + if (member_idx >= btf_type_vlen(smc_bpf_ops_type)) >> + goto out_err; >> + >> + member = &btf_type_member(smc_bpf_ops_type)[member_idx]; >> + mname = btf_str_by_offset(saved_btf, member->name_off); >> + >> + if (!strcmp(mname, "set_option")) { > > btf_member_bit_offset can be used instead of strcmp. Take a look at bpf_tcp_ca.c and kernel/sched/ext.c > Got it, thanks for that. Besides, it seems that we don't need the export btf_str_by_offset anymore in that way. I'll remove it in the next version. >> + /* only support to modify tcp_sock->syn_smc */ >> + if (reg->btf_id == tcp_sock_id && >> + off == offsetof(struct tcp_sock, syn_smc) && >> + off + size == offsetofend(struct tcp_sock, syn_smc)) >> + return 0; >> + } else if (!strcmp(mname, "set_option_cond")) { >> + /* only support to modify smc_bpf_ops_ctx->smc_ok */ >> + if (reg->btf_id == smc_bpf_ops_ctx_id && >> + off == offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok) && >> + off + size == offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok)) >> + return 0; >> + } >> + >> +out_err: >> + return -EACCES; >> +} >> + >> +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { >> + .get_func_proto = bpf_base_func_proto, >> + .is_valid_access = bpf_tracing_btf_ctx_access, >> + .btf_struct_access = smc_bpf_ops_btf_struct_access, >> +}; >> + >> +static struct bpf_struct_ops bpf_smc_bpf_ops = { >> + .init = smc_bpf_ops_init, >> + .name = "smc_bpf_ops", >> + .reg = smc_bpf_ops_reg, >> + .unreg = smc_bpf_ops_unreg, >> + .cfi_stubs = &__bpf_smc_bpf_ops, >> + .verifier_ops = &smc_bpf_verifier_ops, >> + .init_member = smc_bpf_ops_init_member, >> + .check_member = smc_bpf_ops_check_member, >> 
+ .owner = THIS_MODULE, >> +}; >> + >> +int smc_bpf_struct_ops_init(void) >> +{ >> + return register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops); >> +} >> + >> +void bpf_smc_set_tcp_option(struct tcp_sock *tp) >> +{ >> + struct smc_bpf_ops_ctx ops_ctx = {}; >> + struct smc_bpf_ops *ops; >> + >> + ops_ctx.set_option.tp = tp; > > All this initialization should be unnecessary. Directly pass tp instead. > Same reason with kfunc concern. I'll change it in next version. >> + >> + rcu_read_lock(); >> + list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) { > > Does it need to have a list (meaning >1) of smc_bpf_ops to act on a sock? The ordering expectation > is hard to manage. > Considering that the SMC modules also has its own ops that needs to be registered on it (the logic of smc_limit_fs), and need to be all executed, perhaps a list is a more suitable choice. >> + ops->set_option(&ops_ctx); > > A dumb question. This will only affect AF_SMC (or AF_INET[6]/IPPROTO_SMC) socket but not the > AF_INET[6]/IPPROTO_{TCP,UDP} socket? > Yes, it only affects AF_SMC, AF_SMC6, or IPPROTO_SMC sockets. Due to only SMC sockets will set tp->syn_smc, and we will check it before calling the very ops. Best wishes, D. > pw-bot: cr > >> + } >> + rcu_read_unlock(); >> +}
On 10/25/24 4:05 AM, D. Wythe wrote: > Our main concern is to avoid introducing kfuncs as much as possible. For our > subsystem, we might need to maintain it in a way that maintains a uapi, as we > certainly have user applications depending on it. The smc_bpf_ops can read/write the tp and ireq. In patch 4, there is 'tp->syn_smc = 1'. I assume the real bpf prog will read something from the tp to make the decision also. Note that tp/ireq is also not in the uapi but the CO-RE can help in case the tp->syn_smc bool is moved around. From looking at the selftest in patch 4 again, I think all it needs is for the bpf prog (i.e. the ops) to return a bool instead of allowing the bpf prog to write or call a kfunc to change the tp/ireq.
On 10/26/24 2:30 AM, Martin KaFai Lau wrote: > On 10/25/24 4:05 AM, D. Wythe wrote: >> Our main concern is to avoid introducing kfuncs as much as possible. For our subsystem, we might >> need to maintain it in a way that maintains a uapi, as we certainly have user applications >> depending on it. > > The smc_bpf_ops can read/write the tp and ireq. In patch 4, there is 'tp->syn_smc = 1'. I assume the > real bpf prog will read something from the tp to make the decision also. Note that tp/ireq is also > not in the uapi but the CO-RE can help in case the tp->syn_smc bool is moved around. > > From looking at the selftest in patch 4 again, I think all it needs is for the bpf prog (i.e. the > ops) to return a bool instead of allowing the bpf prog to write or call a kfunc to change the tp/ireq. > Hi Martin, At the beginning, I did modify it by returning values, but later I wanted to make these ops more universal, so I considered influencing the behavior by modifying the tp without returning any value. But considering we currently do not have any other needs, perhaps modifying it by returning a value would be more appropriate. And if that's the case, we won't need to add new prog parameters to the struct_access anymore. I'll try this in the next series. Thanks, D. Wythe
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6a5e08b..4ef160a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -478,7 +478,7 @@ struct tcp_sock { #endif #if IS_ENABLED(CONFIG_SMC) bool syn_smc; /* SYN includes SMC */ - bool (*smc_hs_congested)(const struct sock *sk); + struct tcpsmc_ctx *smc; #endif #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) diff --git a/include/net/smc.h b/include/net/smc.h index db84e4e..34ab2c6 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -18,6 +18,8 @@ #include "linux/ism.h" struct sock; +struct tcp_sock; +struct inet_request_sock; #define SMC_MAX_PNETID_LEN 16 /* Max. length of PNET id */ @@ -97,4 +99,49 @@ struct smcd_dev { u8 going_away : 1; }; +/* + * This structure is used to store the parameters passed to the member of struct_ops. + * Due to the BPF verifier cannot restrict the writing of bit fields, such as limiting + * it to only write ireq->smc_ok. Using kfunc can solve this issue, but we don't want + * to introduce a kfunc with such a narrow function. + * + * Moreover, using this structure for unified parameters also addresses another + * potential issue. Currently, kfunc cannot recognize the calling context + * through BPF's existing structure. In the future, we can solve this problem + * by passing this ctx to kfunc. + */ +struct smc_bpf_ops_ctx { + struct { + struct tcp_sock *tp; + } set_option; + struct { + const struct tcp_sock *tp; + struct inet_request_sock *ireq; + int smc_ok; + } set_option_cond; +}; + +struct smc_bpf_ops { + /* priavte */ + + struct list_head list; + + /* public */ + + /* Invoked before computing SMC option for SYN packets. + * We can control whether to set SMC options by modifying + * ctx->set_option->tp->syn_smc. + * This's also the only member that can be modified now. + * Only member in ctx->set_option is valid for this callback. 
+ */ + void (*set_option)(struct smc_bpf_ops_ctx *ctx); + + /* Invoked before Set up SMC options for SYN-ACK packets + * We can control whether to respond SMC options by modifying + * ctx->set_option_cond.smc_ok. + * Only member in ctx->set_option_cond is valid for this callback. + */ + void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx); +}; + #endif /* _SMC_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 739a9fb..c322443 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2730,6 +2730,12 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt) #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; +struct tcpsmc_ctx { + /* Invoked before computing SMC option for SYN packets. */ + void (*set_option)(struct tcp_sock *tp); + /* Invoked before Set up SMC options for SYN-ACK packets */ + void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq); +}; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d844e1..8ebd529 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7070,8 +7070,7 @@ static void tcp_openreq_init(struct request_sock *req, ireq->ir_num = ntohs(tcp_hdr(skb)->dest); ireq->ir_mark = inet_request_mark(sk, skb); #if IS_ENABLED(CONFIG_SMC) - ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested && - tcp_sk(sk)->smc_hs_congested(sk)); + ireq->smc_ok = rx_opt->smc_ok; #endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 054244ce..5ab47dd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -759,14 +759,17 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, mptcp_options_write(th, ptr, tp, opts); } -static void smc_set_option(const struct tcp_sock *tp, +static void smc_set_option(struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc) { - if 
(*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + if (tp->smc && tp->smc->set_option) + tp->smc->set_option(tp); + /* set_option may modify syn_smc, so it needs to be checked again */ + if (tp->syn_smc && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } @@ -776,14 +779,17 @@ static void smc_set_option(const struct tcp_sock *tp, } static void smc_set_option_cond(const struct tcp_sock *tp, - const struct inet_request_sock *ireq, + struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) if (static_branch_unlikely(&tcp_have_smc)) { if (tp->syn_smc && ireq->smc_ok) { - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + if (tp->smc && tp->smc->set_option_cond) + tp->smc->set_option_cond(tp, ireq); + /* set_option_cond may modify smc_ok, so it needs to be checked again */ + if (ireq->smc_ok && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { opts->options |= OPTION_SMC; *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2..1eca835 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -33,3 +33,15 @@ config SMC_LO of architecture or hardware. if unsure, say N. + +config SMC_BPF + bool "eBPF support for SMC subsystem" + depends on SMC && BPF_SYSCALL + default n + help + This option enables support for eBPF programs for SMC + subsystem. eBPF programs offer much greater flexibility + in modifying the behavior of the SMC protocol stack compared + to a complete kernel-based approach. + + if unsure, say N. 
diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87..1c04906 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -7,3 +7,4 @@ smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_sta smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-$(CONFIG_SMC_BPF) += smc_bpf.o \ No newline at end of file diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0316217..316c8a1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -55,6 +55,7 @@ #include "smc_sysctl.h" #include "smc_loopback.h" #include "smc_inet.h" +#include "smc_bpf.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -156,19 +157,25 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk, return NULL; } -static bool smc_hs_congested(const struct sock *sk) +static void smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq) { const struct smc_sock *smc; - smc = smc_clcsock_user_data(sk); + smc = smc_clcsock_user_data(&tp->inet_conn.icsk_inet.sk); if (!smc) - return true; + goto no_smc; - if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) - return true; + if (smc->limit_smc_hs && workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq)) + goto no_smc; - return false; +#if IS_ENABLED(CONFIG_SMC_BPF) + bpf_smc_set_tcp_option_cond(tp, ireq); +#endif /* CONFIG_SMC_BPF */ + + return; +no_smc: + ireq->smc_ok = 0; } struct smc_hashinfo smc_v4_hashinfo = { @@ -2650,9 +2657,6 @@ int smc_listen(struct socket *sock, int backlog) inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops; - if (smc->limit_smc_hs) - tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested; - rc = kernel_listen(smc->clcsock, backlog); if (rc) { write_lock_bh(&smc->clcsock->sk->sk_callback_lock); @@ -3324,6 +3328,13 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) sk->sk_net_refcnt = 1; get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 
sock_inuse_add(net, 1); + + /* init tcp_smc_ctx */ +#if IS_ENABLED(CONFIG_SMC_BPF) + smc->tcp_smc_ctx.set_option = bpf_smc_set_tcp_option; +#endif /* CONFIG_SMC_BPF */ + smc->tcp_smc_ctx.set_option_cond = smc_set_tcp_option_cond; + tcp_sk(sk)->smc = &smc->tcp_smc_ctx; return 0; } @@ -3574,8 +3585,17 @@ static int __init smc_init(void) pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); goto out_ulp; } + + rc = smc_bpf_struct_ops_init(); + if (rc) { + pr_err("%s: smc_bpf_struct_ops_init fails with %d\n", __func__, rc); + goto out_inet; + } + static_branch_enable(&tcp_have_smc); return 0; +out_inet: + smc_inet_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); out_lo: diff --git a/net/smc/smc.h b/net/smc/smc.h index 78ae10d..a9794fb 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -16,6 +16,7 @@ #include <linux/compiler.h> /* __aligned */ #include <net/genetlink.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc_ib.h" @@ -328,6 +329,9 @@ struct smc_sock { /* smc sock container */ /* protects clcsock of a listen * socket * */ + + /* smc context for tcp stack */ + struct tcpsmc_ctx tcp_smc_ctx; }; #define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk) diff --git a/net/smc/smc_bpf.c b/net/smc/smc_bpf.c new file mode 100644 index 00000000..fa90406 --- /dev/null +++ b/net/smc/smc_bpf.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * support for eBPF programs in SMC subsystem. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. 
Wythe <alibuda@linux.alibaba.com> + */ + +#include <linux/bpf_verifier.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <net/smc.h> + +#include "smc_bpf.h" + +static DEFINE_SPINLOCK(smc_bpf_ops_list_lock); +static LIST_HEAD(smc_bpf_ops_list); + +static u32 tcp_sock_id, smc_bpf_ops_ctx_id; +static const struct btf_type *smc_bpf_ops_type; +static const struct btf *saved_btf; + +static int smc_bpf_ops_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "smc_bpf_ops_ctx", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + smc_bpf_ops_ctx_id = type_id; + + type_id = btf_find_by_name_kind(btf, "smc_bpf_ops", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + smc_bpf_ops_type = btf_type_by_id(btf, type_id); + + saved_btf = btf; + return 0; +} + +static int smc_bpf_ops_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + struct smc_bpf_ops *k_ops; + u32 moff; + + k_ops = (struct smc_bpf_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct smc_bpf_ops, list): + INIT_LIST_HEAD(&k_ops->list); + return 1; + default: + break; + } + + return 0; +} + +static int smc_bpf_ops_check_member(const struct btf_type *t, + const struct btf_member *member, + const struct bpf_prog *prog) +{ + u32 moff = __btf_member_bit_offset(t, member) / 8; + + switch (moff) { + case offsetof(struct smc_bpf_ops, set_option): + case offsetof(struct smc_bpf_ops, set_option_cond): + break; + default: + return -EINVAL; + } + + return 0; +} + +static int smc_bpf_ops_reg(void *kdata, struct bpf_link *link) +{ + struct smc_bpf_ops *ops = kdata; + + /* Prevent the same ops from being registered repeatedly. 
*/ + if (!list_empty(&ops->list)) + return -EINVAL; + + spin_lock(&smc_bpf_ops_list_lock); + list_add_tail_rcu(&ops->list, &smc_bpf_ops_list); + spin_unlock(&smc_bpf_ops_list_lock); + + return 0; +} + +static void smc_bpf_ops_unreg(void *kdata, struct bpf_link *link) +{ + struct smc_bpf_ops *ops = kdata; + + spin_lock(&smc_bpf_ops_list_lock); + list_del_rcu(&ops->list); + spin_unlock(&smc_bpf_ops_list_lock); + + /* Ensure that all readers to complete */ + synchronize_rcu(); +} + +static void __bpf_smc_stub_set_tcp_option(struct smc_bpf_ops_ctx *ops_ctx) {} +static void __bpf_smc_stub_set_tcp_option_cond(struct smc_bpf_ops_ctx *ops_ctx) {} + +static struct smc_bpf_ops __bpf_smc_bpf_ops = { + .set_option = __bpf_smc_stub_set_tcp_option, + .set_option_cond = __bpf_smc_stub_set_tcp_option_cond, +}; + +static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + const struct bpf_prog *prog, + int off, int size) +{ + const struct btf_member *member; + const char *mname; + int member_idx; + + member_idx = prog->expected_attach_type; + if (member_idx >= btf_type_vlen(smc_bpf_ops_type)) + goto out_err; + + member = &btf_type_member(smc_bpf_ops_type)[member_idx]; + mname = btf_str_by_offset(saved_btf, member->name_off); + + if (!strcmp(mname, "set_option")) { + /* only support to modify tcp_sock->syn_smc */ + if (reg->btf_id == tcp_sock_id && + off == offsetof(struct tcp_sock, syn_smc) && + off + size == offsetofend(struct tcp_sock, syn_smc)) + return 0; + } else if (!strcmp(mname, "set_option_cond")) { + /* only support to modify smc_bpf_ops_ctx->smc_ok */ + if (reg->btf_id == smc_bpf_ops_ctx_id && + off == offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok) && + off + size == offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok)) + return 0; + } + +out_err: + return -EACCES; +} + +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = 
bpf_tracing_btf_ctx_access, + .btf_struct_access = smc_bpf_ops_btf_struct_access, +}; + +static struct bpf_struct_ops bpf_smc_bpf_ops = { + .init = smc_bpf_ops_init, + .name = "smc_bpf_ops", + .reg = smc_bpf_ops_reg, + .unreg = smc_bpf_ops_unreg, + .cfi_stubs = &__bpf_smc_bpf_ops, + .verifier_ops = &smc_bpf_verifier_ops, + .init_member = smc_bpf_ops_init_member, + .check_member = smc_bpf_ops_check_member, + .owner = THIS_MODULE, +}; + +int smc_bpf_struct_ops_init(void) +{ + return register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops); +} + +void bpf_smc_set_tcp_option(struct tcp_sock *tp) +{ + struct smc_bpf_ops_ctx ops_ctx = {}; + struct smc_bpf_ops *ops; + + ops_ctx.set_option.tp = tp; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) { + ops->set_option(&ops_ctx); + } + rcu_read_unlock(); +} + +void bpf_smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq) +{ + struct smc_bpf_ops_ctx ops_ctx = {}; + struct smc_bpf_ops *ops; + + ops_ctx.set_option_cond.tp = tp; + ops_ctx.set_option_cond.ireq = ireq; + ops_ctx.set_option_cond.smc_ok = ireq->smc_ok; + + rcu_read_lock(); + list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) { + ops->set_option_cond(&ops_ctx); + } + rcu_read_unlock(); + + ireq->smc_ok = ops_ctx.set_option_cond.smc_ok; +} diff --git a/net/smc/smc_bpf.h b/net/smc/smc_bpf.h new file mode 100644 index 00000000..a5ed0fc --- /dev/null +++ b/net/smc/smc_bpf.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * support for eBPF programs in SMC subsystem. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ +#ifndef __SMC_BPF +#define __SMC_BPF + +#include <linux/types.h> +#include <net/sock.h> +#include <net/tcp.h> + +#if IS_ENABLED(CONFIG_SMC_BPF) + +/* Initialize struct_ops registration. It will automatically unload + * when module is unloaded. 
+ * @return 0 on success + */ +int smc_bpf_struct_ops_init(void); + +void bpf_smc_set_tcp_option(struct tcp_sock *sk); +void bpf_smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq); + +#else +static inline int smc_bpf_struct_ops_init(void) { return 0; } +#endif /* CONFIG_SMC_BPF */ + +#endif /* __SMC_BPF */