[bpf-next,v7,3/5] bpf: Inline calls to bpf_loop when callback is known

Message ID 20220613205008.212724-4-eddyz87@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf_loop inlining

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1432 this patch: 1424
netdev/cc_maintainers warning 6 maintainers not CCed: netdev@vger.kernel.org songliubraving@fb.com yhs@fb.com john.fastabend@gmail.com kafai@fb.com kpsingh@kernel.org
netdev/build_clang success Errors and warnings before: 166 this patch: 166
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1439 this patch: 1431
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for Kernel LATEST on ubuntu-latest with gcc
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Kernel LATEST on ubuntu-latest with llvm-15
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-3 fail Logs for Kernel LATEST on z15 with gcc

Commit Message

Eduard Zingerman June 13, 2022, 8:50 p.m. UTC
Calls to `bpf_loop` are replaced with direct loops to avoid
indirection. E.g. the following:

  bpf_loop(10, foo, NULL, 0);

is replaced by the equivalent of the following:

  for (int i = 0; i < 10; ++i)
    foo(i, NULL);

This transformation can be applied when:
- the callback is known and does not change during program execution;
- the flags passed to `bpf_loop` are always zero.
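
For example, consider the hypothetical fragment below (the callback
`foo`, the program `prog` and the use of `bpf_get_prandom_u32` are
illustrative only and not part of this patch; the usual libbpf
headers are assumed):

  static int foo(__u32 index, void *ctx)
  {
    return 0; /* 0 means "continue looping" */
  }

  SEC("tp/syscalls/sys_enter_getpid")
  int prog(void *ctx)
  {
    __u64 flags = bpf_get_prandom_u32();

    /* fits for inlining: constant callback, flags always zero */
    bpf_loop(10, foo, NULL, 0);

    /* does not fit: flags is not a known constant zero */
    bpf_loop(10, foo, NULL, flags);
    return 0;
  }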

The inlining logic works as follows:

- During execution simulation, the function `update_loop_inline_state`
  tracks the following information for each `bpf_loop` call
  instruction:
  - is the callback known and constant?
  - are the flags constant and zero?
- The function `optimize_bpf_loop` increases the stack depth for
  functions where `bpf_loop` calls can be inlined and invokes
  `inline_bpf_loop` to apply the inlining. The additional stack space
  is used to spill registers R6, R7 and R8, which serve as the loop
  counter, the maximal loop bound and the callback context parameter.
  A rough C equivalent of the emitted sequence is sketched below.
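
The sketch below is a rough C-like equivalent of the emitted sequence
(register names follow the description above; `callback` stands for
the known callback subprogram; see `inline_bpf_loop` in the patch for
the exact instructions):

  /* on entry: R1 = nr_loops, R3 = callback_ctx */
  if (nr_loops > BPF_MAX_LOOPS)
    return -E2BIG;
  r6 = nr_loops;     /* maximal bound; R6-R8 are spilled/restored */
  r7 = 0;            /* loop counter */
  r8 = callback_ctx; /* callback context parameter */
  while (r7 < r6) {
    r0 = callback(r7, r8);
    r7++;
    if (r0 != 0)
      break;
  }
  return r7;         /* number of iterations performed */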

Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
an i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.

Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
---
 include/linux/bpf.h          |   3 +
 include/linux/bpf_verifier.h |  12 +++
 kernel/bpf/bpf_iter.c        |   9 +-
 kernel/bpf/verifier.c        | 175 ++++++++++++++++++++++++++++++++++-
 4 files changed, 190 insertions(+), 9 deletions(-)

Comments

Song Liu June 14, 2022, 5:49 a.m. UTC | #1
On Mon, Jun 13, 2022 at 1:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> Calls to `bpf_loop` are replaced with direct loops to avoid
> indirection. E.g. the following:
>
>   bpf_loop(10, foo, NULL, 0);
>
> Is replaced by equivalent of the following:
>
>   for (int i = 0; i < 10; ++i)
>     foo(i, NULL);
>
> This transformation could be applied when:
> - callback is known and does not change during program execution;
> - flags passed to `bpf_loop` are always zero.
>
> Inlining logic works as follows:
>
> - During execution simulation function `update_loop_inline_state`
>   tracks the following information for each `bpf_loop` call
>   instruction:
>   - is callback known and constant?
>   - are flags constant and zero?
> - Function `optimize_bpf_loop` increases stack depth for functions
>   where `bpf_loop` calls can be inlined and invokes `inline_bpf_loop`
>   to apply the inlining. The additional stack space is used to spill
>   registers R6, R7 and R8. These registers are used as loop counter,
>   loop maximal bound and callback context parameter;
>
> Measurements using `benchs/run_bench_bpf_loop.sh` inside QEMU / KVM on
> i7-4710HQ CPU show a drop in latency from 14 ns/op to 2 ns/op.
>
> Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>

LGTM.
Acked-by: Song Liu <songliubraving@fb.com>
Daniel Borkmann June 16, 2022, 11:12 p.m. UTC | #2
On 6/13/22 10:50 PM, Eduard Zingerman wrote:
[...]
> diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
> index d5d96ceca105..7e8fd49406f6 100644
> --- a/kernel/bpf/bpf_iter.c
> +++ b/kernel/bpf/bpf_iter.c
> @@ -723,9 +723,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
>   	.arg4_type	= ARG_ANYTHING,
>   };
>   
> -/* maximum number of loops */
> -#define MAX_LOOPS	BIT(23)
> -
>   BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>   	   u64, flags)
>   {
> @@ -733,9 +730,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
>   	u64 ret;
>   	u32 i;
>   
> +	/* Note: these safety checks are also verified when bpf_loop
> +	 * is inlined, be careful to modify this code in sync. See
> +	 * function verifier.c:inline_bpf_loop.
> +	 */
>   	if (flags)
>   		return -EINVAL;
> -	if (nr_loops > MAX_LOOPS)
> +	if (nr_loops > BPF_MAX_LOOPS)
>   		return -E2BIG;
>   
>   	for (i = 0; i < nr_loops; i++) {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 2d2872682278..db854c09b603 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -7103,6 +7103,38 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
>   	return -ENOTSUPP;
>   }
>   
> +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
> +{
> +	return &env->insn_aux_data[env->insn_idx];
> +}
> +
> +static bool loop_flag_is_zero(struct bpf_verifier_env *env)
> +{
> +	struct bpf_reg_state *regs = cur_regs(env);
> +	struct bpf_reg_state *reg = &regs[BPF_REG_4];
> +
> +	return register_is_const(reg) && reg->var_off.value == 0;

I think you might also need to add precision tracking for the flag check:

mark_chain_precision(env, BPF_REG_4)

See also cc52d9140aa92 ("bpf: Fix record_func_key to perform backtracking on r3"). It's not
too much of an issue at the moment, but it will be once we extend the flags.
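
A minimal sketch of what that could look like, assuming the call is
folded into loop_flag_is_zero itself (exact placement is up to you):

static bool loop_flag_is_zero(struct bpf_verifier_env *env)
{
	struct bpf_reg_state *regs = cur_regs(env);
	struct bpf_reg_state *reg = &regs[BPF_REG_4];
	bool flag_is_zero = register_is_const(reg) && reg->var_off.value == 0;

	/* mark R4 precise so that state pruning does not treat
	 * branches with different constant flags as equivalent
	 */
	if (flag_is_zero)
		mark_chain_precision(env, BPF_REG_4);

	return flag_is_zero;
}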

> +}
> +
> +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> +{
> +	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +
> +	if (!state->initialized) {
> +		state->initialized = 1;
> +		state->fit_for_inline = loop_flag_is_zero(env);
> +		state->callback_subprogno = subprogno;
> +		return;
> +	}
> +
> +	if (!state->fit_for_inline)
> +		return;
> +
> +	state->fit_for_inline =
> +		loop_flag_is_zero(env) &&
> +		state->callback_subprogno == subprogno;
> +}
> +
>   static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>   			     int *insn_idx_p)
>   {
> @@ -7255,6 +7287,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
>   		err = check_bpf_snprintf_call(env, regs);
>   		break;
>   	case BPF_FUNC_loop:
> +		update_loop_inline_state(env, meta.subprogno);
>   		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
>   					set_loop_callback_state);
>   		break;
Alexei Starovoitov June 17, 2022, 2:14 a.m. UTC | #3
On Mon, Jun 13, 2022 at 1:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> +
> +static bool loop_flag_is_zero(struct bpf_verifier_env *env)
> +{
> +       struct bpf_reg_state *regs = cur_regs(env);
> +       struct bpf_reg_state *reg = &regs[BPF_REG_4];
> +
> +       return register_is_const(reg) && reg->var_off.value == 0;
> +}

Great catch here by Daniel.
It needs mark_chain_precision().

> +
> +static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
> +{
> +       struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
> +
> +       if (!state->initialized) {
> +               state->initialized = 1;
> +               state->fit_for_inline = loop_flag_is_zero(env);
> +               state->callback_subprogno = subprogno;
> +               return;
> +       }
> +
> +       if (!state->fit_for_inline)
> +               return;
> +
> +       state->fit_for_inline =
> +               loop_flag_is_zero(env) &&
> +               state->callback_subprogno == subprogno;

No need for such heavy indentation. Up to 100 chars is fine.
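
I.e., something like:

	state->fit_for_inline = loop_flag_is_zero(env) && state->callback_subprogno == subprogno;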

> +static int optimize_bpf_loop(struct bpf_verifier_env *env)
> +{
> +       struct bpf_subprog_info *subprogs = env->subprog_info;
> +       int i, cur_subprog = 0, cnt, delta = 0;
> +       struct bpf_insn *insn = env->prog->insnsi;
> +       int insn_cnt = env->prog->len;
> +       u16 stack_depth = subprogs[cur_subprog].stack_depth;
> +       u16 stack_depth_extra = 0;
> +
> +       for (i = 0; i < insn_cnt; i++, insn++) {
> +               struct bpf_loop_inline_state *inline_state =
> +                       &env->insn_aux_data[i + delta].loop_inline_state;
> +
> +               if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
> +                       struct bpf_prog *new_prog;
> +
> +                       stack_depth_extra = BPF_REG_SIZE * 3;
> +                       new_prog = inline_bpf_loop(env,
> +                                                  i + delta,
> +                                                  -(stack_depth + stack_depth_extra),

See the fix that just landed:
https://lore.kernel.org/bpf/20220616162037.535469-2-jakub@cloudflare.com/

subprogs[cur_subprog].stack_depth may not be a multiple of 8,
but the spill slots for r[678] have to be.
We need to round_up(,8) here and
increase stack_depth_extra accordingly.
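
Roughly (a sketch; stack_depth_roundup is an illustrative name):

	u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
	...
	stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;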

The rest looks great.
Thank you for working on it!
Eduard Zingerman June 19, 2022, 8:09 p.m. UTC | #4
Hi Daniel, Alexei, 

> On Fri, 2022-06-17 at 01:12 +0200, Daniel Borkmann wrote:
> On Thu, 2022-06-16 at 19:14 -0700, Alexei Starovoitov wrote:
> On Mon, Jun 13, 2022 at 1:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> > +
> > +static bool loop_flag_is_zero(struct bpf_verifier_env *env)
[...]
> 
> Great catch here by Daniel.
> It needs mark_chain_precision().

Thanks for the catch regarding precision tracking. Unfortunately, I
struggle to create a test case that demonstrates the issue without
the call to `mark_chain_precision`. As far as I understand, such a
test case should look as follows:


	... do something in such a way that:
	  - there is a branch where
	    BPF_REG_4 is 0, SCALAR_VALUE, !precise
	    and this branch is explored first
	  - there is a branch where
	    BPF_REG_4 is 1, SCALAR_VALUE, !precise

	/* create branching point */
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
	/* load callback address to r2 */
	BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 5),
	BPF_RAW_INSN(0, 0, 0, 0, 0),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0),
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
	BPF_EXIT_INSN(),
	/* callback */
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1),
	BPF_EXIT_INSN(),

The "do something" part would then rely on the state pruning logic to
skip the verification for the second branch. Namely, the following
part of the `regsafe` function should consider registers identical:

/* Returns true if (rold safe implies rcur safe) */
static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
			struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
{
	...
	switch (base_type(rold->type)) {
	case SCALAR_VALUE:
		...
		if (rcur->type == SCALAR_VALUE) {
here ->			if (!rold->precise && !rcur->precise)
				return true;
			...
		} else {
			...
		}
		...	
	}
	...	
}

However, I don't understand which instructions could mark the register
as a scalar with a particular value but without the `precise` mark. I
tried MOV, JEQ, JNE, MUL, and a BPF_ALU64_IMM(MOV, ...) - BPF_STX_MEM
- BPF_LDX_MEM sequence, to no avail.

The following observations might be relevant:
- `__mark_reg_known` does not change the state of the `precise` mark;
- `__mark_reg_unknown` always sets `precise` to `true` when there are
  multiple sub-programs (due to the following line:
  `reg->precise = env->subprog_cnt > 1 || !env->bpf_capable`);
- there are always multiple sub-programs when `bpf_loop` is used.

Could you please suggest what to do with this test?

Best regards,
Eduard Zingerman
Alexei Starovoitov June 19, 2022, 9:10 p.m. UTC | #5
On Sun, Jun 19, 2022 at 11:09:36PM +0300, Eduard Zingerman wrote:
> Hi Daniel, Alexei, 
> 
> > On Fri, 2022-06-17 at 01:12 +0200, Daniel Borkmann wrote:
> > On Thu, 2022-06-16 at 19:14 -0700, Alexei Starovoitov wrote:
> > On Mon, Jun 13, 2022 at 1:50 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> > > +
> > > +static bool loop_flag_is_zero(struct bpf_verifier_env *env)
> [...]
> > 
> > Great catch here by Daniel.
> > It needs mark_chain_precision().
> 
> Thanks for the catch regarding precision tracking. Unfortunately I
> struggle to create a test case that demonstrates the issue without the
> call to `mark_chain_precision`. As far as I understand this test case
> should look as follows:
> 
> 
> 	... do something in such a way that:
> 	  - there is a branch where
> 	    BPF_REG_4 is 0, SCALAR_VALUE, !precise
> 	    and this branch is explored first
> 	  - there is a branch where
> 	    BPF_REG_4 is 1, SCALAR_VALUE, !precise
> 
> 	/* create branching point */
> 	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
> 	/* load callback address to r2 */
> 	BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 5),
> 	BPF_RAW_INSN(0, 0, 0, 0, 0),
> 	BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0),
> 	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop),
> 	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
> 	BPF_EXIT_INSN(),
> 	/* callback */
> 	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1),
> 	BPF_EXIT_INSN(),
> 
> The "do something" part would then rely on the state pruning logic to
> skip the verification for the second branch. Namely, the following
> part of the `regsafe` function should consider registers identical:
> 
> /* Returns true if (rold safe implies rcur safe) */
> static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
> 			struct bpf_reg_state *rcur, struct bpf_id_pair *idmap)
> {
> 	...
> 	switch (base_type(rold->type)) {
> 	case SCALAR_VALUE:
> 		...
> 		if (rcur->type == SCALAR_VALUE) {
> here ->			if (!rold->precise && !rcur->precise)
> 				return true;
> 			...
> 		} else {
> 			...
> 		}
> 		...	
> 	}
> 	...	
> }
> 
> However, I don't understand what instructions could mark the register
> as a scalar with particular value, but w/o `precise` mark. I tried
> MOV, JEQ, JNE, MUL, sequence of BPF_ALU64_IMM(MOV, ...) - BPF_STX_MEM
> - BPF_LDX_MEM to no avail.

> The following observations might be relevant:
> - `__mark_reg_known` does not change the state of the `precise` mark;

yes.
Just BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0) and ,1) in the other branch
should do it.
The 'mov' won't make the register precise.

So something like below:

r0 = random_u32
r6 = random_u32
if (r0)
   goto L;

r4 = 0

pruning_point:
if (r6) goto next;
next:
/* load callback address to r2 */
BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 5),
BPF_RAW_INSN(0, 0, 0, 0, 0),
BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop),
BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
BPF_EXIT_INSN(),

L:
r4 = 1
goto pruning_point

The fallthrough path will proceed with r4=0
and pruning will trigger is_state_visited() with r4=1,
which regsafe will incorrectly recognize as equivalent.
Eduard Zingerman June 19, 2022, 10:01 p.m. UTC | #6
> On Sun, 2022-06-19 at 14:10 -0700, Alexei Starovoitov wrote:
[...]
> yes.
> Just BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0) and ,1) in the other branch
> should do it.
> The 'mov' won't make the register precise.
> 
> So something like below:
> 
> r0 = random_u32
> r6 = random_u32
> if (r0)
>    goto L;
> 
> r4 = 0
> 
> pruning_point:
> if (r6) goto next;
> next:
> /* load callback address to r2 */
> BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 5),
> BPF_RAW_INSN(0, 0, 0, 0, 0),
> BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0),
> BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop),
> BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
> BPF_EXIT_INSN(),
> 
> L:
> r4 = 1
> goto pruning_point
> 
> The fallthrough path will proceed with r4=0
> and pruning will trigger is_state_visited() with r4=1
> which regsafe will incorrectly recognize as equivalent.
> 

Actually, this was the first thing I tried. Here is the pseudocode
above translated to BPF instructions:

	/* main */
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
	BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_0),
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_jiffies64),
	BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0),
	BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 9),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 0),
	BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 0),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_1, 1),
	BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_2, BPF_PSEUDO_FUNC, 0, 7),
	BPF_RAW_INSN(0, 0, 0, 0, 0),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_3, 0),
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_loop),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
	BPF_EXIT_INSN(),
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_4, 1),
	BPF_JMP_IMM(BPF_JA, 0, 0, -10),
	/* callback */
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 1),
	BPF_EXIT_INSN(),	

And here is the verification log for this program:

	#195/p don't inline bpf_loop call, flags non-zero 2, verifier log:
	func#0 @0
	func#1 @16
	reg type unsupported for arg#0 function main#6
	
	from -1 to 0: R1=ctx(off=0,imm=0) R10=fp0
	0: (85) call bpf_jiffies64#118        ; R0_w=Pscalar()
	1: (bf) r6 = r0                       ; R0_w=Pscalar(id=1) R6_w=Pscalar(id=1)
	2: (85) call bpf_jiffies64#118        ; R0_w=Pscalar()
	3: (bf) r7 = r0                       ; R0_w=Pscalar(id=2) R7_w=Pscalar(id=2)
	4: (55) if r6 != 0x0 goto pc+9        ; R6_w=P0
-->	5: (b7) r4 = 0                        ; R4_w=P0
	6: (55) if r7 != 0x0 goto pc+0        ; R7_w=P0
	7: (b7) r1 = 1                        ; R1_w=P1
	8: (18) r2 = 0x7                      ; R2_w=func(off=0,imm=0)
	10: (b7) r3 = 0                       ; R3_w=P0
	11: (85) call bpf_loop#181
	reg type unsupported for arg#1 function callback#7
	caller:
	 R6=P0 R7=P0 R10=fp0
	callee:
	 frame1: R1=Pscalar() R2_w=P0 R10=fp0 cb
	16: frame1: R1_w=Pscalar() R2_w=P0 cb
	16: (b7) r0 = 1                       ; frame1: R0_w=P1 cb
	17: (95) exit
	returning from callee:
	 frame1: R0_w=P1 R1_w=Pscalar() R2_w=P0 R10=fp0 cb
	to caller at 12:
	 R0_w=Pscalar() R6=P0 R7=P0 R10=fp0
	
	from 17 to 12: R0_w=Pscalar() R6=P0 R7=P0 R10=fp0
	12: (b7) r0 = 0                       ; R0_w=P0
	13: (95) exit
	propagating r4
	
	from 6 to 7: safe
	
	from 4 to 14: R0=Pscalar(id=2) R6=Pscalar(id=1) R7=Pscalar(id=2) R10=fp0
-->	14: (b7) r4 = 1                       ; R4_w=P1
	15: (05) goto pc-10
	6: (55) if r7 != 0x0 goto pc+0        ; R7=P0
	7: (b7) r1 = 1                        ; R1_w=P1
	8: (18) r2 = 0x7                      ; R2_w=func(off=0,imm=0)
	10: (b7) r3 = 0                       ; R3_w=P0
	11: (85) call bpf_loop#181
	[...]

Note that for instructions [5] and [14] R4 is marked as precise. I
actually verified in the debugger that this happens because of the
following processing logic for MOV:

#0  check_alu_op (...) at kernel/bpf/verifier.c:9177
#1  ... in do_check (...) at kernel/bpf/verifier.c:12100
#2  do_check_common (...) at kernel/bpf/verifier.c:14552
#3  ... in do_check_main () at kernel/bpf/verifier.c:14615

The C code for the relevant functions is below. Note that the call to
`__mark_reg_unknown` marks the register as precise when
`env->subprog_cnt > 1`, and for this test case it is 2.

static int do_check(struct bpf_verifier_env *env)
{
	...
		if (class == BPF_ALU || class == BPF_ALU64) {
12100:			err = check_alu_op(env, insn);
			if (err)
				return err;

		} else ...
	...
}

static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
	...
	else if (opcode == BPF_MOV) {
		...
		if (BPF_SRC(insn->code) == BPF_X) {
			...
		} else {
			/* case: R = imm
			 * remember the value we stored into this reg
			 */
			/* clear any state __mark_reg_known doesn't set */
			mark_reg_unknown(env, regs, insn->dst_reg);
			regs[insn->dst_reg].type = SCALAR_VALUE;
			if (BPF_CLASS(insn->code) == BPF_ALU64) {
9177:				__mark_reg_known(regs + insn->dst_reg,
						 insn->imm);
			} else {
				__mark_reg_known(regs + insn->dst_reg,
						 (u32)insn->imm);
			}
		}
		...
	}
	...
}

/* Mark a register as having a completely unknown (scalar) value. */
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
			       struct bpf_reg_state *reg)
{
	...
	reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
	...
}

static void mark_reg_unknown(struct bpf_verifier_env *env,
			     struct bpf_reg_state *regs, u32 regno)
{
	...
	__mark_reg_unknown(env, regs + regno);
}

Thanks,
Eduard
Alexei Starovoitov June 19, 2022, 11:37 p.m. UTC | #7
On Sun, Jun 19, 2022 at 3:01 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> /* Mark a register as having a completely unknown (scalar) value. */
> static void __mark_reg_unknown(const struct bpf_verifier_env *env,
>                                struct bpf_reg_state *reg)
> {
>         ...
>         reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;

Ahh. Thanks for explaining.
We probably need to fix this conservative logic.
Can you repro the issue when you comment out the above?

Let's skip the test for now. Just add mark_chain_precision
to the loop logic, so we don't have to come back to it later
when the subprogs > 1 case is fixed.
Eduard Zingerman June 20, 2022, 12:59 p.m. UTC | #8
> On Sun, 2022-06-19 at 16:37 -0700, Alexei Starovoitov wrote:
> On Sun, Jun 19, 2022 at 3:01 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> > 
> > /* Mark a register as having a completely unknown (scalar) value. */
> > static void __mark_reg_unknown(const struct bpf_verifier_env *env,
> >                                struct bpf_reg_state *reg)
> > {
> >         ...
> >         reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
> 
> Ahh. Thanks for explaining.
> We probably need to fix this conservative logic.
> Can you repro the issue when you comment out above ?

If I replace the assignment above with `reg->precise = false`, the
verifier does skip the second branch with BPF_REG_4 set to 1.

> Let's skip the test for now. Just add mark_chain_precision
> to loop logic, so we don't have to come back to it later
> when subprogs>1 is fixed.

I will provide the updated version tonight. Thank you for the
suggestions.

Best regards,
Eduard.

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8e6092d0ea95..3c75ede138b5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1236,6 +1236,9 @@  struct bpf_array {
 #define BPF_COMPLEXITY_LIMIT_INSNS      1000000 /* yes. 1M insns */
 #define MAX_TAIL_CALL_CNT 33
 
+/* Maximum number of loops for bpf_loop */
+#define BPF_MAX_LOOPS	BIT(23)
+
 #define BPF_F_ACCESS_MASK	(BPF_F_RDONLY |		\
 				 BPF_F_RDONLY_PROG |	\
 				 BPF_F_WRONLY |		\
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e8439f6cbe57..5cf152b0a53e 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -344,6 +344,14 @@  struct bpf_verifier_state_list {
 	int miss_cnt, hit_cnt;
 };
 
+struct bpf_loop_inline_state {
+	int initialized:1; /* set to true upon first entry */
+	int fit_for_inline:1; /* true if callback function is the same
+			       * at each call and flags are always zero
+			       */
+	u32 callback_subprogno; /* valid when fit_for_inline is true */
+};
+
 /* Possible states for alu_state member. */
 #define BPF_ALU_SANITIZE_SRC		(1U << 0)
 #define BPF_ALU_SANITIZE_DST		(1U << 1)
@@ -373,6 +381,10 @@  struct bpf_insn_aux_data {
 				u32 mem_size;	/* mem_size for non-struct typed var */
 			};
 		} btf_var;
+		/* if instruction is a call to bpf_loop this field tracks
+		 * the state of the relevant registers to make decision about inlining
+		 */
+		struct bpf_loop_inline_state loop_inline_state;
 	};
 	u64 map_key_state; /* constant (32 bit) key tracking for maps */
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index d5d96ceca105..7e8fd49406f6 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -723,9 +723,6 @@  const struct bpf_func_proto bpf_for_each_map_elem_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-/* maximum number of loops */
-#define MAX_LOOPS	BIT(23)
-
 BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	   u64, flags)
 {
@@ -733,9 +730,13 @@  BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
 	u64 ret;
 	u32 i;
 
+	/* Note: these safety checks are also verified when bpf_loop
+	 * is inlined, be careful to modify this code in sync. See
+	 * function verifier.c:inline_bpf_loop.
+	 */
 	if (flags)
 		return -EINVAL;
-	if (nr_loops > MAX_LOOPS)
+	if (nr_loops > BPF_MAX_LOOPS)
 		return -E2BIG;
 
 	for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2d2872682278..db854c09b603 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7103,6 +7103,38 @@  static int check_get_func_ip(struct bpf_verifier_env *env)
 	return -ENOTSUPP;
 }
 
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+	return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = &regs[BPF_REG_4];
+
+	return register_is_const(reg) && reg->var_off.value == 0;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+	struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+	if (!state->initialized) {
+		state->initialized = 1;
+		state->fit_for_inline = loop_flag_is_zero(env);
+		state->callback_subprogno = subprogno;
+		return;
+	}
+
+	if (!state->fit_for_inline)
+		return;
+
+	state->fit_for_inline =
+		loop_flag_is_zero(env) &&
+		state->callback_subprogno == subprogno;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			     int *insn_idx_p)
 {
@@ -7255,6 +7287,7 @@  static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 		err = check_bpf_snprintf_call(env, regs);
 		break;
 	case BPF_FUNC_loop:
+		update_loop_inline_state(env, meta.subprogno);
 		err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 					set_loop_callback_state);
 		break;
@@ -7661,11 +7694,6 @@  static bool check_reg_sane_offset(struct bpf_verifier_env *env,
 	return true;
 }
 
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
-	return &env->insn_aux_data[env->insn_idx];
-}
-
 enum {
 	REASON_BOUNDS	= -1,
 	REASON_TYPE	= -2,
@@ -14294,6 +14322,140 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 	return 0;
 }
 
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+					int position,
+					s32 stack_base,
+					u32 callback_subprogno,
+					u32 *cnt)
+{
+	s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+	s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+	s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+	int reg_loop_max = BPF_REG_6;
+	int reg_loop_cnt = BPF_REG_7;
+	int reg_loop_ctx = BPF_REG_8;
+
+	struct bpf_prog *new_prog;
+	u32 callback_start;
+	u32 call_insn_offset;
+	s32 callback_offset;
+
+	/* This represents an inlined version of bpf_iter.c:bpf_loop,
+	 * be careful to modify this code in sync.
+	 */
+	struct bpf_insn insn_buf[] = {
+		/* Return error and jump to the end of the patch if
+		 * expected number of iterations is too big.
+		 */
+		BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+		BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+		/* spill R6, R7, R8 to use these as loop vars */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+		/* initialize loop vars */
+		BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+		BPF_MOV32_IMM(reg_loop_cnt, 0),
+		BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+		/* loop header,
+		 * if reg_loop_cnt >= reg_loop_max skip the loop body
+		 */
+		BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+		/* callback call,
+		 * correct callback offset would be set after patching
+		 */
+		BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+		BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+		BPF_CALL_REL(0),
+		/* increment loop counter */
+		BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+		/* jump to loop header if callback returned 0 */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+		/* return value of bpf_loop,
+		 * set R0 to the number of iterations
+		 */
+		BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+		/* restore original values of R6, R7, R8 */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+	};
+
+	*cnt = ARRAY_SIZE(insn_buf);
+	new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+	if (!new_prog)
+		return new_prog;
+
+	/* callback start is known only after patching */
+	callback_start = env->subprog_info[callback_subprogno].start;
+	/* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+	call_insn_offset = position + 12;
+	callback_offset = callback_start - call_insn_offset - 1;
+	env->prog->insnsi[call_insn_offset].imm = callback_offset;
+
+	return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+	return insn->code == (BPF_JMP | BPF_CALL) &&
+		insn->src_reg == 0 &&
+		insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8.  These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *subprogs = env->subprog_info;
+	int i, cur_subprog = 0, cnt, delta = 0;
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	u16 stack_depth = subprogs[cur_subprog].stack_depth;
+	u16 stack_depth_extra = 0;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		struct bpf_loop_inline_state *inline_state =
+			&env->insn_aux_data[i + delta].loop_inline_state;
+
+		if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+			struct bpf_prog *new_prog;
+
+			stack_depth_extra = BPF_REG_SIZE * 3;
+			new_prog = inline_bpf_loop(env,
+						   i + delta,
+						   -(stack_depth + stack_depth_extra),
+						   inline_state->callback_subprogno,
+						   &cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta     += cnt - 1;
+			env->prog  = new_prog;
+			insn       = new_prog->insnsi + i + delta;
+		}
+
+		if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+			subprogs[cur_subprog].stack_depth += stack_depth_extra;
+			cur_subprog++;
+			stack_depth = subprogs[cur_subprog].stack_depth;
+			stack_depth_extra = 0;
+		}
+	}
+
+	env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+	return 0;
+}
+
 static void free_states(struct bpf_verifier_env *env)
 {
 	struct bpf_verifier_state_list *sl, *sln;
@@ -15031,6 +15193,9 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
 		ret = check_max_stack_depth(env);
 
 	/* instruction rewrites happen after this point */
+	if (ret == 0)
+		ret = optimize_bpf_loop(env);
+
 	if (is_priv) {
 		if (ret == 0)
 			opt_hard_wire_dead_code_branches(env);