diff mbox series

[v2,bpf-next,1/8] bpf: Add gen_epilogue to bpf_verifier_ops

Message ID 20240821233440.1855263-2-martin.lau@linux.dev (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Add gen_epilogue and allow kfunc call in pro/epilogue | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next-0

Commit Message

Martin KaFai Lau Aug. 21, 2024, 11:34 p.m. UTC
From: Martin KaFai Lau <martin.lau@kernel.org>

This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar
to the existing .gen_prologue. Instead of allowing a subsystem
to run code at the beginning of a bpf prog, it allows the subsystem
to run code just before the bpf prog exit.

One of the use case is to allow the upcoming bpf qdisc to ensure that
the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc
struct_ops implementation could either fix it up or drop the skb.
Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd
has sane value (e.g. non zero).

The epilogue can do the useful thing (like checking skb->dev) if it
can access the bpf prog's ctx. Unlike prologue, r1 may not hold the
ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue
has returned some instructions in the "epilogue_buf".

The existing .gen_prologue is done in convert_ctx_accesses().
The new .gen_epilogue is done in the convert_ctx_accesses() also.
When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched
with the earlier generated "epilogue_buf". The epilogue patching is
only done for the main prog.

Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov Aug. 22, 2024, 12:22 a.m. UTC | #1
On Wed, Aug 21, 2024 at 4:35 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> From: Martin KaFai Lau <martin.lau@kernel.org>
>
> This patch adds a .gen_epilogue to the bpf_verifier_ops. It is similar
> to the existing .gen_prologue. Instead of allowing a subsystem
> to run code at the beginning of a bpf prog, it allows the subsystem
> to run code just before the bpf prog exit.
>
> One of the use case is to allow the upcoming bpf qdisc to ensure that
> the skb->dev is the same as the qdisc->dev_queue->dev. The bpf qdisc
> struct_ops implementation could either fix it up or drop the skb.
> Another use case could be in bpf_tcp_ca.c to enforce snd_cwnd
> has sane value (e.g. non zero).
>
> The epilogue can do the useful thing (like checking skb->dev) if it
> can access the bpf prog's ctx. Unlike prologue, r1 may not hold the
> ctx pointer. This patch saves the r1 in the stack if the .gen_epilogue
> has returned some instructions in the "epilogue_buf".
>
> The existing .gen_prologue is done in convert_ctx_accesses().
> The new .gen_epilogue is done in the convert_ctx_accesses() also.
> When it sees the (BPF_JMP | BPF_EXIT) instruction, it will be patched
> with the earlier generated "epilogue_buf". The epilogue patching is
> only done for the main prog.
>
> Reviewed-by: Eduard Zingerman <eddyz87@gmail.com>
> Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
> ---
>  include/linux/bpf.h   |  2 ++
>  kernel/bpf/verifier.c | 34 ++++++++++++++++++++++++++++++++--
>  2 files changed, 34 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f0192c173ed8..8ee9d87c332a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -974,6 +974,8 @@ struct bpf_verifier_ops {
>                                 struct bpf_insn_access_aux *info);
>         int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
>                             const struct bpf_prog *prog);
> +       int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog,
> +                           s16 ctx_stack_off);
>         int (*gen_ld_abs)(const struct bpf_insn *orig,
>                           struct bpf_insn *insn_buf);
>         u32 (*convert_ctx_access)(enum bpf_access_type type,
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index df3be12096cf..bbb655f0c7b5 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -19610,15 +19610,37 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
>   */
>  static int convert_ctx_accesses(struct bpf_verifier_env *env)
>  {
> +       struct bpf_subprog_info *subprogs = env->subprog_info;
>         const struct bpf_verifier_ops *ops = env->ops;
> -       int i, cnt, size, ctx_field_size, delta = 0;
> +       int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
>         const int insn_cnt = env->prog->len;
> -       struct bpf_insn insn_buf[16], *insn;
> +       struct bpf_insn insn_buf[16], epilogue_buf[16], *insn;

This is a noticeable stack increase.
Maybe let's move both to env?

>         u32 target_size, size_default, off;
>         struct bpf_prog *new_prog;
>         enum bpf_access_type type;
>         bool is_narrower_load;
>
> +       if (ops->gen_epilogue) {
> +               epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
> +                                                -(subprogs[0].stack_depth + 8));
> +               if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) {
> +                       verbose(env, "bpf verifier is misconfigured\n");
> +                       return -EINVAL;
> +               } else if (epilogue_cnt) {
> +                       /* Save the ARG_PTR_TO_CTX for the epilogue to use */
> +                       cnt = 0;
> +                       subprogs[0].stack_depth += 8;
> +                       insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
> +                                                     -subprogs[0].stack_depth);
> +                       insn_buf[cnt++] = env->prog->insnsi[0];
> +                       new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
> +                       if (!new_prog)
> +                               return -ENOMEM;
> +                       env->prog = new_prog;
> +                       delta += cnt - 1;

I suspect this is buggy.
See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and
jump to the 1st insn.")

> +               }
> +       }
> +
>         if (ops->gen_prologue || env->seen_direct_write) {
>                 if (!ops->gen_prologue) {
>                         verbose(env, "bpf verifier is misconfigured\n");
> @@ -19671,6 +19693,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>                         insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
>                         env->prog->aux->num_exentries++;
>                         continue;
> +               } else if (insn->code == (BPF_JMP | BPF_EXIT) &&
> +                          epilogue_cnt &&
> +                          i + delta < subprogs[1].start) {
> +                       /* Generate epilogue for the main prog */
> +                       memcpy(insn_buf, epilogue_buf, sizeof(epilogue_buf));
> +                       cnt = epilogue_cnt;
> +                       goto patch_insn_buf;

That's quite a bit of copy paste of epilogue if the program contains
multiple bpf_exit insns.
I think llvm generates it in such a way that often there is only
one bpf_exit. But gcc might be using bpf_exit liberally.

So let's patch it only once and replace other bpf_exit with jmp
to one patched place ?

That's pretty much what x86 JIT does when it converts bpf_exit.

>                 } else {
>                         continue;
>                 }
> @@ -19807,6 +19836,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>                                                        insn->dst_reg, insn->dst_reg,
>                                                        size * 8, 0);
>
> +patch_insn_buf:
>                 new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
>                 if (!new_prog)
>                         return -ENOMEM;
> --
> 2.43.5
>
Eduard Zingerman Aug. 22, 2024, 12:30 a.m. UTC | #2
On Wed, 2024-08-21 at 17:22 -0700, Alexei Starovoitov wrote:

[...]

> > +       if (ops->gen_epilogue) {
> > +               epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
> > +                                                -(subprogs[0].stack_depth + 8));
> > +               if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) {
> > +                       verbose(env, "bpf verifier is misconfigured\n");
> > +                       return -EINVAL;
> > +               } else if (epilogue_cnt) {
> > +                       /* Save the ARG_PTR_TO_CTX for the epilogue to use */
> > +                       cnt = 0;
> > +                       subprogs[0].stack_depth += 8;
> > +                       insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
> > +                                                     -subprogs[0].stack_depth);
> > +                       insn_buf[cnt++] = env->prog->insnsi[0];
> > +                       new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
> > +                       if (!new_prog)
> > +                               return -ENOMEM;
> > +                       env->prog = new_prog;
> > +                       delta += cnt - 1;
> 
> I suspect this is buggy.
> See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and
> jump to the 1st insn.")

Actually, I was unable to figure out a counter example for this case,
patching math seems to be correct, jump targets are just moved down.
But let's see, maybe Martin can come up with something.

[...]
Alexei Starovoitov Aug. 22, 2024, 12:34 a.m. UTC | #3
On Wed, Aug 21, 2024 at 5:30 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Wed, 2024-08-21 at 17:22 -0700, Alexei Starovoitov wrote:
>
> [...]
>
> > > +       if (ops->gen_epilogue) {
> > > +               epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
> > > +                                                -(subprogs[0].stack_depth + 8));
> > > +               if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) {
> > > +                       verbose(env, "bpf verifier is misconfigured\n");
> > > +                       return -EINVAL;
> > > +               } else if (epilogue_cnt) {
> > > +                       /* Save the ARG_PTR_TO_CTX for the epilogue to use */
> > > +                       cnt = 0;
> > > +                       subprogs[0].stack_depth += 8;
> > > +                       insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
> > > +                                                     -subprogs[0].stack_depth);
> > > +                       insn_buf[cnt++] = env->prog->insnsi[0];
> > > +                       new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
> > > +                       if (!new_prog)
> > > +                               return -ENOMEM;
> > > +                       env->prog = new_prog;
> > > +                       delta += cnt - 1;
> >
> > I suspect this is buggy.
> > See commit 5337ac4c9b80 ("bpf: Fix the corner case with may_goto and
> > jump to the 1st insn.")
>
> Actually, I was unable to figure out a counter example for this case,
> patching math seems to be correct, jump targets are just moved down.
> But let's see, maybe Martin can come up with something.

Something like
insn_0
...
r1 = 0
if rX == .. goto insn_0

this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1

so at run time it will overwrite that slot with zero and
epilogue will read zero from it instead of ctx.
Eduard Zingerman Aug. 22, 2024, 12:38 a.m. UTC | #4
On Wed, 2024-08-21 at 17:34 -0700, Alexei Starovoitov wrote:

[...]

> Something like
> insn_0
> ...
> r1 = 0
> if rX == .. goto insn_0
> 
> this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1
> 
> so at run time it will overwrite that slot with zero and
> epilogue will read zero from it instead of ctx.

That's exactly what I tried on paper and jmp target was just moving down.
Martin KaFai Lau Aug. 22, 2024, 12:52 a.m. UTC | #5
On 8/21/24 5:38 PM, Eduard Zingerman wrote:
> On Wed, 2024-08-21 at 17:34 -0700, Alexei Starovoitov wrote:
> 
> [...]
> 
>> Something like
>> insn_0
>> ...
>> r1 = 0
>> if rX == .. goto insn_0
>>
>> this jmp will be rewritten to point to newly added *(u64*)(r10 - ..) = r1
>>
>> so at run time it will overwrite that slot with zero and
>> epilogue will read zero from it instead of ctx.
> 
> That's exactly what I tried on paper and jmp target was just moving down.

Thanks for bringing this up. I will try a test case to confirm.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f0192c173ed8..8ee9d87c332a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -974,6 +974,8 @@  struct bpf_verifier_ops {
 				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
+	int (*gen_epilogue)(struct bpf_insn *insn, const struct bpf_prog *prog,
+			    s16 ctx_stack_off);
 	int (*gen_ld_abs)(const struct bpf_insn *orig,
 			  struct bpf_insn *insn_buf);
 	u32 (*convert_ctx_access)(enum bpf_access_type type,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index df3be12096cf..bbb655f0c7b5 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19610,15 +19610,37 @@  static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
+	struct bpf_subprog_info *subprogs = env->subprog_info;
 	const struct bpf_verifier_ops *ops = env->ops;
-	int i, cnt, size, ctx_field_size, delta = 0;
+	int i, cnt, size, ctx_field_size, delta = 0, epilogue_cnt = 0;
 	const int insn_cnt = env->prog->len;
-	struct bpf_insn insn_buf[16], *insn;
+	struct bpf_insn insn_buf[16], epilogue_buf[16], *insn;
 	u32 target_size, size_default, off;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
 	bool is_narrower_load;
 
+	if (ops->gen_epilogue) {
+		epilogue_cnt = ops->gen_epilogue(epilogue_buf, env->prog,
+						 -(subprogs[0].stack_depth + 8));
+		if (epilogue_cnt >= ARRAY_SIZE(epilogue_buf)) {
+			verbose(env, "bpf verifier is misconfigured\n");
+			return -EINVAL;
+		} else if (epilogue_cnt) {
+			/* Save the ARG_PTR_TO_CTX for the epilogue to use */
+			cnt = 0;
+			subprogs[0].stack_depth += 8;
+			insn_buf[cnt++] = BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1,
+						      -subprogs[0].stack_depth);
+			insn_buf[cnt++] = env->prog->insnsi[0];
+			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+			env->prog = new_prog;
+			delta += cnt - 1;
+		}
+	}
+
 	if (ops->gen_prologue || env->seen_direct_write) {
 		if (!ops->gen_prologue) {
 			verbose(env, "bpf verifier is misconfigured\n");
@@ -19671,6 +19693,13 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
 			env->prog->aux->num_exentries++;
 			continue;
+		} else if (insn->code == (BPF_JMP | BPF_EXIT) &&
+			   epilogue_cnt &&
+			   i + delta < subprogs[1].start) {
+			/* Generate epilogue for the main prog */
+			memcpy(insn_buf, epilogue_buf, sizeof(epilogue_buf));
+			cnt = epilogue_cnt;
+			goto patch_insn_buf;
 		} else {
 			continue;
 		}
@@ -19807,6 +19836,7 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 						       insn->dst_reg, insn->dst_reg,
 						       size * 8, 0);
 
+patch_insn_buf:
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;