Message ID | 20240821233440.1855263-2-martin.lau@linux.dev (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | BPF |
Headers | show |
Series | bpf: Add gen_epilogue and allow kfunc call in pro/epilogue | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Clearly marked for bpf-next, async |
netdev/apply | fail | Patch does not apply to bpf-next-0 |
On Wed, Aug 21, 2024 at 4:35 PM Martin KaFai Lau <martin.lau@linux.dev> wrote: > > From: Martin KaFai Lau <martin.lau@kernel.org> > > This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar > to the existing .gen_prologue. Instead of allowing a subsystem > to run code at the beginning of a bpf prog, it allows the subsystem > to run code just before the bpf prog exit. > > One of the use case is to allow the upcoming bpf qdisc to ensure that > the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc > struct_ops implementation could either fix it up or drop the skb. > Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd > has sane value (e.g. non zero). > > The epilogue can do the useful thing (like checking skb->dev) if it > can access the bpf prog's ctx. Unlike prologue, r1 may not hold the > ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue > has returned some instructions in the "epilogue_buf". > > The existing .gen_prologue is done in convert_ctx_accesses(). > The new .gen_epilogue is done in the convert_ctx_accesses() also. > When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched > with the earlier generated "epilogue_buf". The epilogue patching is > only done for the main prog. > > Reviewed-by: Eduard Zingerman <eddyz87@gmail.com> > Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org> > --- > include/linux/bpf.h | 2 ++ > kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++++++++++-- > 2 files changed, 34 insertions(+), 2 deletions(-) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index f0192c173ed8..8ee9d87c332a 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -974,6 +974,8 @@ struct bpf_verifier_ops { > struct bpf_insn_access_aux *info); > int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, > const struct bpf_prog *prog); > + int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, > + s16 ctx_stack_off); > int (*gen_ld_abs)(const struct bpf_insn *orig, > struct bpf_insn *insn_buf); > u32 (*convert_ctx_access)(enum bpf_access_type type, > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > index df3be12096cf..bbb655f0c7b5 100644 > --- a/kernel/bpf/verifier.c > +++ b/kernel/bpf/verifier.c > @@ -19610,15 +19610,37 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, > */ > static int convert_ctx_accesses(struct bpf_verifier_env *env) > { > + struct bpf_subprog_info *subprogs = env->subprog_info; > const struct bpf_verifier_ops *ops = env->ops; > - int i, cnt, size, ctx_field_size, delta = 0; > + int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; > const int insn_cnt = env->prog->len; > - struct bpf_insn insn_buf[16], *insn; > + struct bpf_insn insn_buf[16], epilogue_buf[16], *insn; This is a noticeable stack increase. Maybe let's move both to env? > u32 target_size, size_default, off; > struct bpf_prog *new_prog; > enum bpf_access_type type; > bool is_narrower_load; > > + if (ops->gen_epilogue) { > + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, > + -(subprogs[0].stack_depth + 8)); > + if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) { > + verbose(env, "bpf verifier is misconfigured\n"); > + return -EINVAL; > + } else if (epilogue_cnt) { > + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ > + cnt = 0; > + subprogs[0].stack_depth += 8; > + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, > + -subprogs[0].stack_depth); > + insn_buf[cnt++] = env->prog->insnsi[0]; > + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); > + if (!new_prog) > + return -ENOMEM; > + env->prog = new_prog; > + delta += cnt - 1; I suspect this is buggy. See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and jump to the 1st insn.") > + } > + } > + > if (ops->gen_prologue || env->seen_direct_write) { > if (!ops->gen_prologue) { > verbose(env, "bpf verifier is misconfigured\n"); > @@ -19671,6 +19693,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) > insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); > env->prog->aux->num_exentries++; > continue; > + } else if (insn->code == (BPF_JMP | BPF_EXIT) && > + epilogue_cnt && > + i + delta < subprogs[1].start) { > + /* Generate epilogue for the main prog */ > + memcpy(insn_buf, epilogue_buf, sizeof(epilogue_buf)); > + cnt = epilogue_cnt; > + goto patch_insn_buf; That's quite a bit of copy paste of epilogue if the program contains multiple bpf_exit insns. I think llvm generates it in such a way that often there is only one bpf_exit. But gcc might be using bpf_exit liberally. So let's patch it only once and replace other bpf_exit with jmp to one patched place ? That's pretty much what x86 JIT does when it converts bpf_exit. > } else { > continue; > } > @@ -19807,6 +19836,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) > insn->dst_reg, insn->dst_reg, > size * 8, 0); > > +patch_insn_buf: > new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); > if (!new_prog) > return -ENOMEM; > -- > 2.43.5 >
On Wed, 2024-08-21 at 17:22 -0700, Alexei Starovoitov wrote: [...] > > + if (ops->gen_epilogue) { > > + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, > > + -(subprogs[0].stack_depth + 8)); > > + if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) { > > + verbose(env, "bpf verifier is misconfigured\n"); > > + return -EINVAL; > > + } else if (epilogue_cnt) { > > + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ > > + cnt = 0; > > + subprogs[0].stack_depth += 8; > > + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, > > + -subprogs[0].stack_depth); > > + insn_buf[cnt++] = env->prog->insnsi[0]; > > + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); > > + if (!new_prog) > > + return -ENOMEM; > > + env->prog = new_prog; > > + delta += cnt - 1; > > I suspect this is buggy. > See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and > jump to the 1st insn.") Actually, I was unable to figure out a counter example for this case, patching math seems to be correct, jump targets are just moved down. But let's see, maybe Martin can come up with something. [...]
On Wed, Aug 21, 2024 at 5:30 PM Eduard Zingerman <eddyz87@gmail.com> wrote: > > On Wed, 2024-08-21 at 17:22 -0700, Alexei Starovoitov wrote: > > [...] > > > > + if (ops->gen_epilogue) { > > > + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, > > > + -(subprogs[0].stack_depth + 8)); > > > + if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) { > > > + verbose(env, "bpf verifier is misconfigured\n"); > > > + return -EINVAL; > > > + } else if (epilogue_cnt) { > > > + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ > > > + cnt = 0; > > > + subprogs[0].stack_depth += 8; > > > + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, > > > + -subprogs[0].stack_depth); > > > + insn_buf[cnt++] = env->prog->insnsi[0]; > > > + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); > > > + if (!new_prog) > > > + return -ENOMEM; > > > + env->prog = new_prog; > > > + delta += cnt - 1; > > > > I suspect this is buggy. > > See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and > > jump to the 1st insn.") > > Actually, I was unable to figure out a counter example for this case, > patching math seems to be correct, jump targets are just moved down. > But let's see, maybe Martin can come up with something. Something like insn_0 ... r1 = 0 if rX == .. goto insn_0 this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1 so at run time it will overwrite that slot with zero and epilogue will read zero from it instead of ctx.
On Wed, 2024-08-21 at 17:34 -0700, Alexei Starovoitov wrote: [...] > Something like > insn_0 > ... > r1 = 0 > if rX == .. goto insn_0 > > this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1 > > so at run time it will overwrite that slot with zero and > epilogue will read zero from it instead of ctx. That's exactly what I tried on paper and jmp target was just moving down.
On 8/21/24 5:38 PM, Eduard Zingerman wrote: > On Wed, 2024-08-21 at 17:34 -0700, Alexei Starovoitov wrote: > > [...] > >> Something like >> insn_0 >> ... >> r1 = 0 >> if rX == .. goto insn_0 >> >> this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1 >> >> so at run time it will overwrite that slot with zero and >> epilogue will read zero from it instead of ctx. > > That's exactly what I tried on paper and jmp target was just moving down. Thanks for bringing this up. I will try a test case to confirm.
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f0192c173ed8..8ee9d87c332a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -974,6 +974,8 @@ struct bpf_verifier_ops { struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); + int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog, + s16 ctx_stack_off); int (*gen_ld_abs)(const struct bpf_insn *orig, struct bpf_insn *insn_buf); u32 (*convert_ctx_access)(enum bpf_access_type type, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df3be12096cf..bbb655f0c7b5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -19610,15 +19610,37 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, */ static int convert_ctx_accesses(struct bpf_verifier_env *env) { + struct bpf_subprog_info *subprogs = env->subprog_info; const struct bpf_verifier_ops *ops = env->ops; - int i, cnt, size, ctx_field_size, delta = 0; + int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0; const int insn_cnt = env->prog->len; - struct bpf_insn insn_buf[16], *insn; + struct bpf_insn insn_buf[16], epilogue_buf[16], *insn; u32 target_size, size_default, off; struct bpf_prog *new_prog; enum bpf_access_type type; bool is_narrower_load; + if (ops->gen_epilogue) { + epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog, + -(subprogs[0].stack_depth + 8)); + if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) { + verbose(env, "bpf verifier is misconfigured\n"); + return -EINVAL; + } else if (epilogue_cnt) { + /* Save the ARG_PTR_TO_CTX for the epilogue to use */ + cnt = 0; + subprogs[0].stack_depth += 8; + insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, + -subprogs[0].stack_depth); + insn_buf[cnt++] = env->prog->insnsi[0]; + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + env->prog = new_prog; + delta += cnt - 1; + } + } + if (ops->gen_prologue || env->seen_direct_write) { if (!ops->gen_prologue) { verbose(env, "bpf verifier is misconfigured\n"); @@ -19671,6 +19693,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); env->prog->aux->num_exentries++; continue; + } else if (insn->code == (BPF_JMP | BPF_EXIT) && + epilogue_cnt && + i + delta < subprogs[1].start) { + /* Generate epilogue for the main prog */ + memcpy(insn_buf, epilogue_buf, sizeof(epilogue_buf)); + cnt = epilogue_cnt; + goto patch_insn_buf; } else { continue; } @@ -19807,6 +19836,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->dst_reg, insn->dst_reg, size * 8, 0); +patch_insn_buf: new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM;