diff mbox series

[bpf-next,1/2] bpf: Fix to preserve reg parent/live fields when copying range info

Message ID 20221205011754.310580-2-eddyz87@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series bpf: Fix to preserve reg parent/live fields when copying range info | expand

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 10 this patch: 10
netdev/cc_maintainers fail 1 blamed authors not CCed: ecree@solarflare.com; 8 maintainers not CCed: kpsingh@kernel.org haoluo@google.com song@kernel.org ecree@solarflare.com martin.lau@linux.dev sdf@google.com john.fastabend@gmail.com jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 5 this patch: 5
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 10 this patch: 10
netdev/checkpatch warning WARNING: line length of 82 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-17 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-19 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-14 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-20 success Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-next-VM_Test-21 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc

Commit Message

Eduard Zingerman Dec. 5, 2022, 1:17 a.m. UTC
Register range information is copied in several places. The intent is
to transfer range/id information from one register/stack spill to
another. Currently this is done using direct register assignment, e.g.:

static void find_equal_scalars(..., struct bpf_reg_state *known_reg)
{
	...
	struct bpf_reg_state *reg;
	...
			*reg = *known_reg;
	...
}

However, such assignments also copy the following bpf_reg_state fields:

struct bpf_reg_state {
	...
	struct bpf_reg_state *parent;
	...
	enum bpf_reg_liveness live;
	...
};

Copying of these fields is accidental and incorrect, as could be
demonstrated by the following example:

     0: call ktime_get_ns()
     1: r6 = r0
     2: call ktime_get_ns()
     3: r7 = r0
     4: if r0 > r6 goto +1             ; r0 & r6 are unbound thus generated
                                       ; branch states are identical
     5: *(u64 *)(r10 - 8) = 0xdeadbeef ; 64-bit write to fp[-8]
    --- checkpoint ---
     6: r1 = 42                        ; r1 marked as written
     7: *(u8 *)(r10 - 8) = r1          ; 8-bit write, fp[-8] parent & live
                                       ; overwritten
     8: r2 = *(u64 *)(r10 - 8)
     9: r0 = 0
    10: exit

This example is unsafe because 64-bit write to fp[-8] at (5) is
conditional, thus not all bytes of fp[-8] are guaranteed to be set
when it is read at (8). However, currently the example passes
verification.

First, the execution path 1-10 is examined by verifier.
Suppose that a new checkpoint is created by is_state_visited() at (6).
After checkpoint creation:
- r1.parent points to checkpoint.r1,
- fp[-8].parent points to checkpoint.fp[-8].
At (6) the r1.live is set to REG_LIVE_WRITTEN.
At (7) the fp[-8].parent is set to r1.parent and fp[-8].live is set to
REG_LIVE_WRITTEN, because of the following code called in
check_stack_write_fixed_off():

static void save_register_state(struct bpf_func_state *state,
				int spi, struct bpf_reg_state *reg,
				int size)
{
	...
	state->stack[spi].spilled_ptr = *reg;  // <--- parent & live copied
	if (size == BPF_REG_SIZE)
		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
	...
}

Note the intent to mark stack spill as written only if 8 bytes are
spilled to a slot, however this intent is spoiled by a 'live' field copy.
At (8) the checkpoint.fp[-8] should be marked as REG_LIVE_READ but
this does not happen:
- fp[-8] in a current state is already marked as REG_LIVE_WRITTEN;
- fp[-8].parent points to checkpoint.r1, parentage chain is used by
  mark_reg_read() to mark checkpoint states.
At (10) the verification is finished for path 1-10 and jump 4-6 is
examined. The checkpoint.fp[-8] never gets REG_LIVE_READ mark and this
spill is pruned from the cached states by clean_live_states(). Hence
verifier state obtained via path 1-4,6 is deemed identical to one
obtained via path 1-6 and program marked as safe.

Note: the example should be executed with BPF_F_TEST_STATE_FREQ flag
set to force creation of intermediate verifier states.

This commit revisits the locations where bpf_reg_state instances are
copied and replaces the direct copies with a call to a function
copy_register_state(dst, src) that preserves 'parent' and 'live'
fields of the 'dst'.

Fixes: 679c782de14b ("bpf/verifier: per-register parent pointers")
Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
---
 kernel/bpf/verifier.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Comments

Andrii Nakryiko Dec. 8, 2022, 8:29 p.m. UTC | #1
On Sun, Dec 4, 2022 at 5:18 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> Register range information is copied in several places. The intent is
> to transfer range/id information from one register/stack spill to
> another. Currently this is done using direct register assignment, e.g.:
>
> static void find_equal_scalars(..., struct bpf_reg_state *known_reg)
> {
>         ...
>         struct bpf_reg_state *reg;
>         ...
>                         *reg = *known_reg;
>         ...
> }
>
> However, such assignments also copy the following bpf_reg_state fields:
>
> struct bpf_reg_state {
>         ...
>         struct bpf_reg_state *parent;
>         ...
>         enum bpf_reg_liveness live;
>         ...
> };
>
> Copying of these fields is accidental and incorrect, as could be
> demonstrated by the following example:
>
>      0: call ktime_get_ns()
>      1: r6 = r0
>      2: call ktime_get_ns()
>      3: r7 = r0
>      4: if r0 > r6 goto +1             ; r0 & r6 are unbound thus generated
>                                        ; branch states are identical
>      5: *(u64 *)(r10 - 8) = 0xdeadbeef ; 64-bit write to fp[-8]
>     --- checkpoint ---
>      6: r1 = 42                        ; r1 marked as written
>      7: *(u8 *)(r10 - 8) = r1          ; 8-bit write, fp[-8] parent & live
>                                        ; overwritten
>      8: r2 = *(u64 *)(r10 - 8)
>      9: r0 = 0
>     10: exit
>
> This example is unsafe because 64-bit write to fp[-8] at (5) is
> conditional, thus not all bytes of fp[-8] are guaranteed to be set
> when it is read at (8). However, currently the example passes
> verification.
>
> First, the execution path 1-10 is examined by verifier.
> Suppose that a new checkpoint is created by is_state_visited() at (6).
> After checkpoint creation:
> - r1.parent points to checkpoint.r1,
> - fp[-8].parent points to checkpoint.fp[-8].
> At (6) the r1.live is set to REG_LIVE_WRITTEN.
> At (7) the fp[-8].parent is set to r1.parent and fp[-8].live is set to
> REG_LIVE_WRITTEN, because of the following code called in
> check_stack_write_fixed_off():
>
> static void save_register_state(struct bpf_func_state *state,
>                                 int spi, struct bpf_reg_state *reg,
>                                 int size)
> {
>         ...
>         state->stack[spi].spilled_ptr = *reg;  // <--- parent & live copied
>         if (size == BPF_REG_SIZE)
>                 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
>         ...
> }
>
> Note the intent to mark stack spill as written only if 8 bytes are
> spilled to a slot, however this intent is spoiled by a 'live' field copy.
> At (8) the checkpoint.fp[-8] should be marked as REG_LIVE_READ but
> this does not happen:
> - fp[-8] in a current state is already marked as REG_LIVE_WRITTEN;
> - fp[-8].parent points to checkpoint.r1, parentage chain is used by
>   mark_reg_read() to mark checkpoint states.
> At (10) the verification is finished for path 1-10 and jump 4-6 is
> examined. The checkpoint.fp[-8] never gets REG_LIVE_READ mark and this
> spill is pruned from the cached states by clean_live_states(). Hence
> verifier state obtained via path 1-4,6 is deemed identical to one
> obtained via path 1-6 and program marked as safe.
>
> Note: the example should be executed with BPF_F_TEST_STATE_FREQ flag
> set to force creation of intermediate verifier states.
>
> This commit revisits the locations where bpf_reg_state instances are
> copied and replaces the direct copies with a call to a function
> copy_register_state(dst, src) that preserves 'parent' and 'live'
> fields of the 'dst'.
>
> Fixes: 679c782de14b ("bpf/verifier: per-register parent pointers")
> Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
> ---
>  kernel/bpf/verifier.c | 25 ++++++++++++++++++-------
>  1 file changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index b0db9c10567b..8b0a03aad85e 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -3181,13 +3181,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
>         return reg->type != SCALAR_VALUE;
>  }
>
> +/* Copy src state preserving dst->parent and dst->live fields */
> +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
> +{
> +       struct bpf_reg_state *parent = dst->parent;
> +       enum bpf_reg_liveness live = dst->live;
> +
> +       *dst = *src;
> +       dst->parent = parent;
> +       dst->live = live;

It feels like liveness should always be reset when we are copying
register states like this. This copy_register_state() happens when we
do `r1 = r2`, or when we spill/restore register to stack, right? In
all of these cases we should first assume that these registers or
stack slots won't be ever read and would need to be forgotten later.
So any REG_LIVE_READ{32,64} marks should be clear.

But we are preserving old liveness for some reason. Is that intentional?

Similarly for parent pointers, I still feel like resetting parent to
NULL for such statements is the right approach here. But as you
explained offline, LIVE_WRITTEN is equivalent, so ok, fine.

Now, for your example above. I feel like `7: *(u8 *)(r10 - 8) = r1`
should go through a parental chain before we reset parent and mark
parent as READ. That is, when we forcefully turn the previous spilled
register to STACK_MISC, we are basically reading that register and
casting it to an unknown integer. Does that work or does it break
something?

Sorry, I'm repaging all the context after a few days not looking at
this, so some of those questions we might have discussed. But it would
be useful for others to also understand these subtleties.


> +}
> +
>  static void save_register_state(struct bpf_func_state *state,
>                                 int spi, struct bpf_reg_state *reg,
>                                 int size)
>  {
>         int i;
>
> -       state->stack[spi].spilled_ptr = *reg;
> +       copy_register_state(&state->stack[spi].spilled_ptr, reg);

So what I mentioned above. Here, before we copy_register_state, if
size != BPF_REG_SIZE, mark current parent as READ, and then
copy_register_state. What does this break?

>         if (size == BPF_REG_SIZE)
>                 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
>
> @@ -3513,7 +3524,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
>                                  */
>                                 s32 subreg_def = state->regs[dst_regno].subreg_def;
>
> -                               state->regs[dst_regno] = *reg;
> +                               copy_register_state(&state->regs[dst_regno], reg);
>                                 state->regs[dst_regno].subreg_def = subreg_def;
>                         } else {
>                                 for (i = 0; i < size; i++) {
> @@ -3534,7 +3545,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
>
>                 if (dst_regno >= 0) {
>                         /* restore register state from stack */
> -                       state->regs[dst_regno] = *reg;
> +                       copy_register_state(&state->regs[dst_regno], reg);
>                         /* mark reg as written since spilled pointer state likely
>                          * has its liveness marks cleared by is_state_visited()
>                          * which resets stack/reg liveness for state transitions
> @@ -9407,7 +9418,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
>          */
>         if (!ptr_is_dst_reg) {
>                 tmp = *dst_reg;
> -               *dst_reg = *ptr_reg;
> +               copy_register_state(dst_reg, ptr_reg);
>         }
>         ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
>                                         env->insn_idx);
> @@ -10660,7 +10671,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>                                          * to propagate min/max range.
>                                          */
>                                         src_reg->id = ++env->id_gen;
> -                               *dst_reg = *src_reg;
> +                               copy_register_state(dst_reg, src_reg);
>                                 dst_reg->live |= REG_LIVE_WRITTEN;
>                                 dst_reg->subreg_def = DEF_NOT_SUBREG;
>                         } else {
> @@ -10671,7 +10682,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
>                                                 insn->src_reg);
>                                         return -EACCES;
>                                 } else if (src_reg->type == SCALAR_VALUE) {
> -                                       *dst_reg = *src_reg;
> +                                       copy_register_state(dst_reg, src_reg);
>                                         /* Make sure ID is cleared otherwise
>                                          * dst_reg min/max could be incorrectly
>                                          * propagated into src_reg by find_equal_scalars()
> @@ -11470,7 +11481,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
>
>         bpf_for_each_reg_in_vstate(vstate, state, reg, ({
>                 if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
> -                       *reg = *known_reg;
> +                       copy_register_state(reg, known_reg);

did we discuss what should happen with precision propagation in cases
like this? These "equal scalars" are a bit mind bending, we need to
consider if by tracking precision independently for them we are going
to break anything,

>         }));
>  }
>
> --
> 2.34.1
>
Eduard Zingerman Dec. 9, 2022, 12:08 a.m. UTC | #2
On Thu, 2022-12-08 at 12:29 -0800, Andrii Nakryiko wrote:
> On Sun, Dec 4, 2022 at 5:18 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> > 
> > Register range information is copied in several places. The intent is
> > to transfer range/id information from one register/stack spill to
> > another. Currently this is done using direct register assignment, e.g.:
> > 
> > static void find_equal_scalars(..., struct bpf_reg_state *known_reg)
> > {
> >         ...
> >         struct bpf_reg_state *reg;
> >         ...
> >                         *reg = *known_reg;
> >         ...
> > }
> > 
> > However, such assignments also copy the following bpf_reg_state fields:
> > 
> > struct bpf_reg_state {
> >         ...
> >         struct bpf_reg_state *parent;
> >         ...
> >         enum bpf_reg_liveness live;
> >         ...
> > };
> > 
> > Copying of these fields is accidental and incorrect, as could be
> > demonstrated by the following example:
> > 
> >      0: call ktime_get_ns()
> >      1: r6 = r0
> >      2: call ktime_get_ns()
> >      3: r7 = r0
> >      4: if r0 > r6 goto +1             ; r0 & r6 are unbound thus generated
> >                                        ; branch states are identical
> >      5: *(u64 *)(r10 - 8) = 0xdeadbeef ; 64-bit write to fp[-8]
> >     --- checkpoint ---
> >      6: r1 = 42                        ; r1 marked as written
> >      7: *(u8 *)(r10 - 8) = r1          ; 8-bit write, fp[-8] parent & live
> >                                        ; overwritten
> >      8: r2 = *(u64 *)(r10 - 8)
> >      9: r0 = 0
> >     10: exit
> > 
> > This example is unsafe because 64-bit write to fp[-8] at (5) is
> > conditional, thus not all bytes of fp[-8] are guaranteed to be set
> > when it is read at (8). However, currently the example passes
> > verification.
> > 
> > First, the execution path 1-10 is examined by verifier.
> > Suppose that a new checkpoint is created by is_state_visited() at (6).
> > After checkpoint creation:
> > - r1.parent points to checkpoint.r1,
> > - fp[-8].parent points to checkpoint.fp[-8].
> > At (6) the r1.live is set to REG_LIVE_WRITTEN.
> > At (7) the fp[-8].parent is set to r1.parent and fp[-8].live is set to
> > REG_LIVE_WRITTEN, because of the following code called in
> > check_stack_write_fixed_off():
> > 
> > static void save_register_state(struct bpf_func_state *state,
> >                                 int spi, struct bpf_reg_state *reg,
> >                                 int size)
> > {
> >         ...
> >         state->stack[spi].spilled_ptr = *reg;  // <--- parent & live copied
> >         if (size == BPF_REG_SIZE)
> >                 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
> >         ...
> > }
> > 
> > Note the intent to mark stack spill as written only if 8 bytes are
> > spilled to a slot, however this intent is spoiled by a 'live' field copy.
> > At (8) the checkpoint.fp[-8] should be marked as REG_LIVE_READ but
> > this does not happen:
> > - fp[-8] in a current state is already marked as REG_LIVE_WRITTEN;
> > - fp[-8].parent points to checkpoint.r1, parentage chain is used by
> >   mark_reg_read() to mark checkpoint states.
> > At (10) the verification is finished for path 1-10 and jump 4-6 is
> > examined. The checkpoint.fp[-8] never gets REG_LIVE_READ mark and this
> > spill is pruned from the cached states by clean_live_states(). Hence
> > verifier state obtained via path 1-4,6 is deemed identical to one
> > obtained via path 1-6 and program marked as safe.
> > 
> > Note: the example should be executed with BPF_F_TEST_STATE_FREQ flag
> > set to force creation of intermediate verifier states.
> > 
> > This commit revisits the locations where bpf_reg_state instances are
> > copied and replaces the direct copies with a call to a function
> > copy_register_state(dst, src) that preserves 'parent' and 'live'
> > fields of the 'dst'.
> > 
> > Fixes: 679c782de14b ("bpf/verifier: per-register parent pointers")
> > Signed-off-by: Eduard Zingerman <eddyz87@gmail.com>
> > ---
> >  kernel/bpf/verifier.c | 25 ++++++++++++++++++-------
> >  1 file changed, 18 insertions(+), 7 deletions(-)
> > 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index b0db9c10567b..8b0a03aad85e 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -3181,13 +3181,24 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
> >         return reg->type != SCALAR_VALUE;
> >  }
> > 
> > +/* Copy src state preserving dst->parent and dst->live fields */
> > +static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
> > +{
> > +       struct bpf_reg_state *parent = dst->parent;
> > +       enum bpf_reg_liveness live = dst->live;
> > +
> > +       *dst = *src;
> > +       dst->parent = parent;
> > +       dst->live = live;
> 
> It feels like liveness should always be reset when we are copying
> register states like this. This copy_register_state() happens when we
> do `r1 = r2`, or when we spill/restore register to stack, right? In

And also in find_equal_scalars().

> all of these cases we should first assume that these registers or
> stack slots won't be ever read and would need to be forgotten later.
> So any REG_LIVE_READ{32,64} marks should be clear.
> 
> But we are preserving old liveness for some reason. Is that intentional?

Yes. There are two values that we can write to dst->live:
REG_LIVE_NONE & REG_LIVE_WRITTEN.

The REG_LIVE_NONE would be incorrect for the following program:

--- checkpoint #0 ---
  0: *(u64 *)(r10 - 8) = 42
--- checkpoint #1 ---
  1: *(u64 *)(r10 - 8) = 7
  2: *(u8  *)(r10 - 8) = r1
  3: r2 = *(u64 *)(r10 - 8)

At (1) check_stack_write_fixed_off() will mark fp[-8] as REG_LIVE_WRITTEN.
At (2) check_stack_write_fixed_off() can't mark fp[-8] as
REG_LIVE_WRITTEN because the write is u8.

So, if copy_register_state() is changed to "dst->live = REG_LIVE_NONE;"
the REG_LIVE_WRITTEN mark would be lost and checkpoint[0].fp[-8] would
be marked as read at (3).

Now, suppose REG_LIVE_WRITTEN is used instead of REG_LIVE_NONE:
"dst->live = REG_LIVE_WRITTEN;". It would be incorrect for a similar
program:

--- checkpoint #0 ---
  0: *(u64 *)(r10 - 8) = 42
--- checkpoint #1 ---
  1: *(u8  *)(r10 - 8) = r1
  2: r2 = *(u64 *)(r10 - 8)

At (1) check_stack_write_fixed_off() will call copy_register_state()
thus marking fp[-8] as written, which would prevent
checkpoint[0].fp[-8] from getting a read mark at (2).

find_equal_scalars() shouldn't touch liveness as well.

So, I opted to leave 'dst->live' as is and let the calling code deal
with it (as it has more context).

> Similarly for parent pointers, I still feel like resetting parent to
> NULL for such statements is the right approach here. But as you
> explained offline, LIVE_WRITTEN is equivalent, so ok, fine.
>
> Now, for your example above. I feel like `7: *(u8 *)(r10 - 8) = r1`
> should go through a parental chain before we reset parent and mark
> parent as READ. That is, when we forcefully turn the previous spilled
> register to STACK_MISC, we are basically reading that register and
> casting it to an unknown integer. Does that work or does it break
> something?

For this instruction the following happens on master:
- do_check() in BPF_STX branch calls check_reg_arg(..., SRC_OP) for r1,
  which marks it as REG_LIVE_READ;
- do_check() in BPF_STX branch calls ... check_stack_write_fixed_off()
  which calls save_register_state(), which:
  - converts byte fp[-1]..fp[-7] to STACK_MISC;
  - converts byte fp[-8] to STACK_SPILL;
  - copies r1 byte-to-byte to fp[-8].spilled_ptr, which breaks
    parentage chain and liveness info:
    - fp[-8].spilled_ptr.parent is now checkpoint.r1;
    - fp[-8].spilled_ptr.live is now REG_LIVE_WRITTEN.

For this instruction the following happens with this patch:
- do_check() in BPF_STX branch calls check_reg_arg(..., SRC_OP) for r1,
  which marks it as REG_LIVE_READ;
- do_check() in BPF_STX branch calls ... check_stack_write_fixed_off()
  which calls save_register_state(), which:
  - converts bytes fp[-1]..fp[-7] to STACK_MISC;
  - converts byte fp[-8] to STACK_SPILL;
  - copies r1 to fp[-8].spilled_ptr using copy_register_state:
    - fp[-8].spilled_ptr.parent is preserved to be checkpoint.fp[-8].spilled_ptr;
    - fp[-8].spilled_ptr.live is preserved to be REG_LIVE_NONE.

In both cases fp[-8] is *not* marked as READ as a result of (7).
And it shouldn't because no read actually occurred and it would lead to
unnecessary read marks e.g. in the following situation:

 5: *(u64 *)(r10 - 8) = 0xdeadbeef
--- checkpoint ---
 6: r1 = 42
 7: *(u8 *)(r10 - 8) = r1
 8: *(u64 *)(r10 - 8) = r2

> 
> Sorry, I'm repaging all the context after a few days not looking at
> this, so some of those questions we might have discussed. But it would
> be useful for others to also understand these subtleties.
> 
> 
> > +}
> > +
> >  static void save_register_state(struct bpf_func_state *state,
> >                                 int spi, struct bpf_reg_state *reg,
> >                                 int size)
> >  {
> >         int i;
> > 
> > -       state->stack[spi].spilled_ptr = *reg;
> > +       copy_register_state(&state->stack[spi].spilled_ptr, reg);
> 
> So what I mentioned above. Here, before we copy_register_state, if
> size != BPF_REG_SIZE, mark current parent as READ, and then
> copy_register_state. What does this break?

You are talking about 'state->stack[spi].spilled_ptr.parent', right?
I don't see an example of it breaking something, but as in the example
above it can lead to unnecessary read marks.

> >         if (size == BPF_REG_SIZE)
> >                 state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
> > 
> > @@ -3513,7 +3524,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
> >                                  */
> >                                 s32 subreg_def = state->regs[dst_regno].subreg_def;
> > 
> > -                               state->regs[dst_regno] = *reg;
> > +                               copy_register_state(&state->regs[dst_regno], reg);
> >                                 state->regs[dst_regno].subreg_def = subreg_def;
> >                         } else {
> >                                 for (i = 0; i < size; i++) {
> > @@ -3534,7 +3545,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
> > 
> >                 if (dst_regno >= 0) {
> >                         /* restore register state from stack */
> > -                       state->regs[dst_regno] = *reg;
> > +                       copy_register_state(&state->regs[dst_regno], reg);
> >                         /* mark reg as written since spilled pointer state likely
> >                          * has its liveness marks cleared by is_state_visited()
> >                          * which resets stack/reg liveness for state transitions
> > @@ -9407,7 +9418,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
> >          */
> >         if (!ptr_is_dst_reg) {
> >                 tmp = *dst_reg;
> > -               *dst_reg = *ptr_reg;
> > +               copy_register_state(dst_reg, ptr_reg);
> >         }
> >         ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
> >                                         env->insn_idx);
> > @@ -10660,7 +10671,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
> >                                          * to propagate min/max range.
> >                                          */
> >                                         src_reg->id = ++env->id_gen;
> > -                               *dst_reg = *src_reg;
> > +                               copy_register_state(dst_reg, src_reg);
> >                                 dst_reg->live |= REG_LIVE_WRITTEN;
> >                                 dst_reg->subreg_def = DEF_NOT_SUBREG;
> >                         } else {
> > @@ -10671,7 +10682,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
> >                                                 insn->src_reg);
> >                                         return -EACCES;
> >                                 } else if (src_reg->type == SCALAR_VALUE) {
> > -                                       *dst_reg = *src_reg;
> > +                                       copy_register_state(dst_reg, src_reg);
> >                                         /* Make sure ID is cleared otherwise
> >                                          * dst_reg min/max could be incorrectly
> >                                          * propagated into src_reg by find_equal_scalars()
> > @@ -11470,7 +11481,7 @@ static void find_equal_scalars(struct bpf_verifier_state *vstate,
> > 
> >         bpf_for_each_reg_in_vstate(vstate, state, reg, ({
> >                 if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
> > -                       *reg = *known_reg;
> > +                       copy_register_state(reg, known_reg);
> 
> did we discuss what should happen with precision propagation in cases
> like this? These "equal scalars" are a bit mind bending, we need to
> consider if by tracking precision independently for them we are going
> to break anything,

That's the scary part. In [1] I have an example that shows that scalar
ids must be matched using check_ids() in regsafe(). There I modified
regsafe() as follows:

@@ -12891,6 +12969,11 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
 		if (env->explore_alu_limits)
 			return false;
 		if (rcur->type == SCALAR_VALUE) {
+			/* id relations must be preserved, see comment in
+			 * mark_equal_scalars_as_read() for SCALAR_VALUE example.
+			 */
+			if (rold->id && !check_ids(rold->id, rcur->id, idmap))
+				return false;
 			if (!rold->precise)
 				return true;
 			/* new val must satisfy old val knowledge */

The idea is to check that all scalar ids match before checking for
'precise'. Even if some of these scalar IDs are imprecise. It seems to
me that this is conservatively safe but I hoped to get some comments
from you or Alexei.

Also, I don't know how to make a test case for [1] without this patch,
so this one comes first.

I think that it is possible to do precision propagation
for find_equal_scalars() if jump history is enriched with
information like scalar ids, which could then be used in
__mark_chain_precision() (because when something is marked as precise
all the conditionals that lead to this are in the current jump
history).

[1] https://lore.kernel.org/bpf/20221128163442.280187-1-eddyz87@gmail.com/

> 
> >         }));
> >  }
> > 
> > --
> > 2.34.1
> >
diff mbox series

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b0db9c10567b..8b0a03aad85e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3181,13 +3181,24 @@  static bool __is_pointer_value(bool allow_ptr_leaks,
 	return reg->type != SCALAR_VALUE;
 }
 
+/* Copy src state preserving dst->parent and dst->live fields */
+static void copy_register_state(struct bpf_reg_state *dst, const struct bpf_reg_state *src)
+{
+	struct bpf_reg_state *parent = dst->parent;
+	enum bpf_reg_liveness live = dst->live;
+
+	*dst = *src;
+	dst->parent = parent;
+	dst->live = live;
+}
+
 static void save_register_state(struct bpf_func_state *state,
 				int spi, struct bpf_reg_state *reg,
 				int size)
 {
 	int i;
 
-	state->stack[spi].spilled_ptr = *reg;
+	copy_register_state(&state->stack[spi].spilled_ptr, reg);
 	if (size == BPF_REG_SIZE)
 		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
@@ -3513,7 +3524,7 @@  static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 				 */
 				s32 subreg_def = state->regs[dst_regno].subreg_def;
 
-				state->regs[dst_regno] = *reg;
+				copy_register_state(&state->regs[dst_regno], reg);
 				state->regs[dst_regno].subreg_def = subreg_def;
 			} else {
 				for (i = 0; i < size; i++) {
@@ -3534,7 +3545,7 @@  static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
 
 		if (dst_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[dst_regno] = *reg;
+			copy_register_state(&state->regs[dst_regno], reg);
 			/* mark reg as written since spilled pointer state likely
 			 * has its liveness marks cleared by is_state_visited()
 			 * which resets stack/reg liveness for state transitions
@@ -9407,7 +9418,7 @@  static int sanitize_ptr_alu(struct bpf_verifier_env *env,
 	 */
 	if (!ptr_is_dst_reg) {
 		tmp = *dst_reg;
-		*dst_reg = *ptr_reg;
+		copy_register_state(dst_reg, ptr_reg);
 	}
 	ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1,
 					env->insn_idx);
@@ -10660,7 +10671,7 @@  static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 					 * to propagate min/max range.
 					 */
 					src_reg->id = ++env->id_gen;
-				*dst_reg = *src_reg;
+				copy_register_state(dst_reg, src_reg);
 				dst_reg->live |= REG_LIVE_WRITTEN;
 				dst_reg->subreg_def = DEF_NOT_SUBREG;
 			} else {
@@ -10671,7 +10682,7 @@  static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 						insn->src_reg);
 					return -EACCES;
 				} else if (src_reg->type == SCALAR_VALUE) {
-					*dst_reg = *src_reg;
+					copy_register_state(dst_reg, src_reg);
 					/* Make sure ID is cleared otherwise
 					 * dst_reg min/max could be incorrectly
 					 * propagated into src_reg by find_equal_scalars()
@@ -11470,7 +11481,7 @@  static void find_equal_scalars(struct bpf_verifier_state *vstate,
 
 	bpf_for_each_reg_in_vstate(vstate, state, reg, ({
 		if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
-			*reg = *known_reg;
+			copy_register_state(reg, known_reg);
 	}));
 }