| Message ID | 20230125213817.1424447-18-iii@linux.ibm.com |
|---|---|
| State | Superseded |
| Delegated to: | BPF |
| Series | Support bpf trampoline for s390x |
On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> Loading programs that use bpf_usdt_arg() on s390x fails with:
>
> ; switch (arg_spec->arg_type) {
> 139: (61) r1 = *(u32 *)(r2 +8)
> R2 unbounded memory access, make sure to bounds check any such access

can you show a bit longer log? we shouldn't just use
bpf_probe_read_kernel for this. I suspect strategically placed
barrier_var() calls will solve this. This is usually an issue with
compiler reordering operations and doing actual check after it already
speculatively adjusted pointer (which is technically safe and ok if we
never deref that pointer, but verifier doesn't recognize such pattern)

> The bound checks seem to be already in place in the C code, and maybe
> it's even possible to add extra bogus checks to placate LLVM or the
> verifier. Take a simpler approach and just use a helper.
>
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>  tools/lib/bpf/usdt.bpf.h | 33 ++++++++++++++++++---------------
>  1 file changed, 18 insertions(+), 15 deletions(-)
>
> [...]
>
> --
> 2.39.1
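For illustration, here is a minimal, self-contained sketch of the failure mode and the barrier_var() remedy described above. The struct and function names are made up for the example; only barrier_var() (from bpf_helpers.h) and the general shape of the bounds check mirror the real usdt.bpf.h code.

```c
#include <linux/types.h>
#include <linux/errno.h>
#include <bpf/bpf_helpers.h>	/* provides barrier_var() */

/* Hypothetical map value with a fixed-size array and a runtime count. */
struct my_spec {
	int vals[16];
	unsigned short cnt;
};

static __always_inline int read_val(struct my_spec *spec, __u64 idx, int *out)
{
	if (idx >= 16 || idx >= spec->cnt)
		return -ENOENT;
	/* Without this, the compiler may spill idx and reload it after the
	 * check; the verifier then sees an unbounded index when the address
	 * &spec->vals[idx] is formed, even though the C code checked it.
	 */
	barrier_var(idx);
	*out = spec->vals[idx];
	return 0;
}
```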
On Wed, 2023-01-25 at 16:26 -0800, Andrii Nakryiko wrote:
> On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >
> > Loading programs that use bpf_usdt_arg() on s390x fails with:
> >
> > ; switch (arg_spec->arg_type) {
> > 139: (61) r1 = *(u32 *)(r2 +8)
> > R2 unbounded memory access, make sure to bounds check any such access
>
> can you show a bit longer log? we shouldn't just use
> bpf_probe_read_kernel for this. I suspect strategically placed
> barrier_var() calls will solve this. This is usually an issue with
> compiler reordering operations and doing actual check after it already
> speculatively adjusted pointer (which is technically safe and ok if we
> never deref that pointer, but verifier doesn't recognize such pattern)

The full log is here:

https://gist.github.com/iii-i/b6149ee99b37078ec920ab1d3bb45134

The relevant part seems to be:

; if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
128: (79) r1 = *(u64 *)(r10 -24)      ; frame1: R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
129: (25) if r1 > 0xb goto pc+83      ; frame1: R1_w=scalar(umax=11,var_off=(0x0; 0xf))
; if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
130: (69) r1 = *(u16 *)(r8 +200)      ; frame1: R1_w=scalar(umax=65535,var_off=(0x0; 0xffff)) R8_w=map_value(off=0,ks=4,vs=208,imm=0)
131: (67) r1 <<= 48                   ; frame1: R1_w=scalar(smax=9223090561878065152,umax=18446462598732840960,var_off=(0x0; 0xffff000000000000),s32_min=0,s32_max=0,u32_max=0)
132: (c7) r1 s>>= 48                  ; frame1: R1_w=scalar(smin=-32768,smax=32767)
; if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
133: (79) r2 = *(u64 *)(r10 -24)      ; frame1: R2=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
134: (bd) if r1 <= r2 goto pc+78      ; frame1: R1=scalar(smin=-32768,smax=32767) R2=scalar(umax=4294967295,var_off=(0x0; 0xffffffff))
; arg_spec = &spec->args[arg_num];
135: (79) r1 = *(u64 *)(r10 -24)      ; frame1: R1_w=scalar(umax=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0
136: (67) r1 <<= 4                    ; frame1: R1_w=scalar(umax=68719476720,var_off=(0x0; 0xffffffff0),s32_max=2147483632,u32_max=-16)
137: (bf) r2 = r8                     ; frame1: R2_w=map_value(off=0,ks=4,vs=208,imm=0) R8=map_value(off=0,ks=4,vs=208,imm=0)
138: (0f) r2 += r1                    ; frame1: R1_w=scalar(umax=68719476720,var_off=(0x0; 0xffffffff0),s32_max=2147483632,u32_max=-16) R2_w=map_value(off=0,ks=4,vs=208,umax=68719476720,var_off=(0x0; 0xffffffff0),s32_max=2147483632,u32_max=-16)
; switch (arg_spec->arg_type) {
139: (61) r1 = *(u32 *)(r2 +8)

#128-#129 make sure that *(u64 *)(r10 -24) <= 11, but when #133 loads it
again, this constraint is not there. I guess we need to force flushing
r1 to stack? The following helps:

--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -130,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
         if (!spec)
                 return -ESRCH;
 
-        if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
+        if (arg_num >= BPF_USDT_MAX_ARG_CNT)
+                return -ENOENT;
+        barrier_var(arg_num);
+        if (arg_num >= spec->arg_cnt)
                 return -ENOENT;
 
         arg_spec = &spec->args[arg_num];

I can use this in v2 if it looks good.

Btw, I looked at the barrier_var() definition:

#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))

and I'm curious why it's not defined like this:

#define barrier_var(var) asm volatile("" : "+r"(var))

which is a bit simpler?
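As an aside for readers less familiar with GCC/clang inline-asm constraints, here is an annotated side-by-side of the two definitions being compared. The `_current`/`_simplified` suffixes are only there so both macros can coexist in one snippet; the real macro is just `barrier_var`.

```c
/* Existing form: "=r"(var) declares var as a write-only register output,
 * and the matching constraint "0"(var) feeds the same variable in through
 * the register assigned to operand 0. Net effect: var must be materialized
 * in a register across the (empty) asm, and the compiler must assume the
 * asm produced its value.
 */
#define barrier_var_current(var) asm volatile("" : "=r"(var) : "0"(var))

/* Proposed form: "+r"(var) is a single read-write register operand, which
 * expresses the same input/output tie directly, so the two definitions are
 * equivalent for an empty asm template.
 */
#define barrier_var_simplified(var) asm volatile("" : "+r"(var))
```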
On Thu, Jan 26, 2023 at 3:41 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
>
> On Wed, 2023-01-25 at 16:26 -0800, Andrii Nakryiko wrote:
> > On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> > >
> > > Loading programs that use bpf_usdt_arg() on s390x fails with:
> > >
> > > ; switch (arg_spec->arg_type) {
> > > 139: (61) r1 = *(u32 *)(r2 +8)
> > > R2 unbounded memory access, make sure to bounds check any such access
> >
> > can you show a bit longer log? [...]
>
> The full log is here:
>
> https://gist.github.com/iii-i/b6149ee99b37078ec920ab1d3bb45134
>
> [...]
>
> --- a/tools/lib/bpf/usdt.bpf.h
> +++ b/tools/lib/bpf/usdt.bpf.h
> @@ -130,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
>          if (!spec)
>                  return -ESRCH;
>  
> -        if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
> +        if (arg_num >= BPF_USDT_MAX_ARG_CNT)
> +                return -ENOENT;
> +        barrier_var(arg_num);
> +        if (arg_num >= spec->arg_cnt)
>                  return -ENOENT;
>  
>          arg_spec = &spec->args[arg_num];
>
> I can use this in v2 if it looks good.

arg_num -> spec->arg_cnt is "real" check, arg_num >= BPF_USDT_MAX_ARG_CNT
is more to satisfy verifier (we know that spec->arg_cnt won't be >=
BPF_USDT_MAX_ARG_CNT). Let's swap two checks in order and keep
BPF_USDT_MAX_ARG_CNT close to spec->args[arg_num] use? And if
barrier_var() is necessary, then so be it.

> Btw, I looked at the barrier_var() definition:
>
> #define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
>
> and I'm curious why it's not defined like this:
>
> #define barrier_var(var) asm volatile("" : "+r"(var))
>
> which is a bit simpler?

no reason, just unfamiliarity with embedded asm back then, we can update
it if they are equivalent
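Putting the two suggestions together, the check region of bpf_usdt_arg() would look roughly like the fragment below (context taken from usdt.bpf.h as quoted in this thread). This is only an illustration of the proposal being discussed; it is not necessarily the exact hunk that landed in v2.

```c
	/* inside bpf_usdt_arg(), after the spec lookup */
	if (!spec)
		return -ESRCH;

	/* "real" limit first ... */
	if (arg_num >= spec->arg_cnt)
		return -ENOENT;
	/* ... and the verifier-oriented bound kept right next to the
	 * spec->args[arg_num] access it is meant to bound, with barrier_var()
	 * stopping the compiler from reloading arg_num after the check.
	 */
	if (arg_num >= BPF_USDT_MAX_ARG_CNT)
		return -ENOENT;
	barrier_var(arg_num);

	arg_spec = &spec->args[arg_num];
```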
On Thu, 2023-01-26 at 11:03 -0800, Andrii Nakryiko wrote:
> On Thu, Jan 26, 2023 at 3:41 AM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> >
> > On Wed, 2023-01-25 at 16:26 -0800, Andrii Nakryiko wrote:
> > > On Wed, Jan 25, 2023 at 1:39 PM Ilya Leoshkevich <iii@linux.ibm.com> wrote:
> > > >
> > > > Loading programs that use bpf_usdt_arg() on s390x fails with:
> > > >
> > > > ; switch (arg_spec->arg_type) {
> > > > 139: (61) r1 = *(u32 *)(r2 +8)
> > > > R2 unbounded memory access, make sure to bounds check any such access
> > >
> > > can you show a bit longer log? [...]
> >
> > The full log is here:
> >
> > https://gist.github.com/iii-i/b6149ee99b37078ec920ab1d3bb45134

[...]

> > --- a/tools/lib/bpf/usdt.bpf.h
> > +++ b/tools/lib/bpf/usdt.bpf.h
> > @@ -130,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
> >          if (!spec)
> >                  return -ESRCH;
> >  
> > -        if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
> > +        if (arg_num >= BPF_USDT_MAX_ARG_CNT)
> > +                return -ENOENT;
> > +        barrier_var(arg_num);
> > +        if (arg_num >= spec->arg_cnt)
> >                  return -ENOENT;
> >  
> >          arg_spec = &spec->args[arg_num];
> >
> > I can use this in v2 if it looks good.
>
> arg_num -> spec->arg_cnt is "real" check, arg_num >= BPF_USDT_MAX_ARG_CNT
> is more to satisfy verifier (we know that spec->arg_cnt won't be >=
> BPF_USDT_MAX_ARG_CNT). Let's swap two checks in order and keep
> BPF_USDT_MAX_ARG_CNT close to spec->args[arg_num] use? And if
> barrier_var() is necessary, then so be it.

Unfortunately just swapping did not help, so let's use the barrier.

> > Btw, I looked at the barrier_var() definition:
> >
> > #define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
> >
> > and I'm curious why it's not defined like this:
> >
> > #define barrier_var(var) asm volatile("" : "+r"(var))
> >
> > which is a bit simpler?
>
> no reason, just unfamiliarity with embedded asm back then, we can update
> it if they are equivalent

Thanks! I will add a simplification in v2 then.
diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
index fdfd235e52c4..ddfa2521ab67 100644
--- a/tools/lib/bpf/usdt.bpf.h
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -116,7 +116,7 @@ __weak __hidden
 int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
 {
         struct __bpf_usdt_spec *spec;
-        struct __bpf_usdt_arg_spec *arg_spec;
+        struct __bpf_usdt_arg_spec arg_spec;
         unsigned long val;
         int err, spec_id;
 
@@ -133,21 +133,24 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
         if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
                 return -ENOENT;
 
-        arg_spec = &spec->args[arg_num];
-        switch (arg_spec->arg_type) {
+        err = bpf_probe_read_kernel(&arg_spec, sizeof(arg_spec), &spec->args[arg_num]);
+        if (err)
+                return err;
+
+        switch (arg_spec.arg_type) {
         case BPF_USDT_ARG_CONST:
                 /* Arg is just a constant ("-4@$-9" in USDT arg spec).
-                 * value is recorded in arg_spec->val_off directly.
+                 * value is recorded in arg_spec.val_off directly.
                  */
-                val = arg_spec->val_off;
+                val = arg_spec.val_off;
                 break;
         case BPF_USDT_ARG_REG:
                 /* Arg is in a register (e.g, "8@%rax" in USDT arg spec),
                  * so we read the contents of that register directly from
                  * struct pt_regs. To keep things simple user-space parts
-                 * record offsetof(struct pt_regs, <regname>) in arg_spec->reg_off.
+                 * record offsetof(struct pt_regs, <regname>) in arg_spec.reg_off.
                  */
-                err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
+                err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec.reg_off);
                 if (err)
                         return err;
                 break;
@@ -155,18 +158,18 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
                 /* Arg is in memory addressed by register, plus some offset
                  * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is
                  * identified like with BPF_USDT_ARG_REG case, and the offset
-                 * is in arg_spec->val_off. We first fetch register contents
+                 * is in arg_spec.val_off. We first fetch register contents
                  * from pt_regs, then do another user-space probe read to
                  * fetch argument value itself.
                  */
-                err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
+                err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec.reg_off);
                 if (err)
                         return err;
-                err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off);
+                err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec.val_off);
                 if (err)
                         return err;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-                val >>= arg_spec->arg_bitshift;
+                val >>= arg_spec.arg_bitshift;
 #endif
                 break;
         default:
@@ -177,11 +180,11 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
          * necessary upper arg_bitshift bits, with sign extension if argument
          * is signed
          */
-        val <<= arg_spec->arg_bitshift;
-        if (arg_spec->arg_signed)
-                val = ((long)val) >> arg_spec->arg_bitshift;
+        val <<= arg_spec.arg_bitshift;
+        if (arg_spec.arg_signed)
+                val = ((long)val) >> arg_spec.arg_bitshift;
         else
-                val = val >> arg_spec->arg_bitshift;
+                val = val >> arg_spec.arg_bitshift;
         *res = val;
         return 0;
 }
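For context on how this code path gets exercised: bpf_usdt_arg() is normally reached through the BPF_USDT() convenience macro from usdt.bpf.h rather than called directly. A minimal consumer might look like the sketch below; the program name, the "req_id"/"latency_ns" arguments, and the include paths are made up for illustration and may need adjusting to your tree.

```c
// SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
/* Hypothetical USDT consumer; only BPF_USDT(), bpf_usdt_arg() and SEC("usdt")
 * come from libbpf, everything else is illustrative.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/usdt.bpf.h>

SEC("usdt")
int BPF_USDT(handle_request, long req_id, long latency_ns)
{
	/* BPF_USDT() expands into bpf_usdt_arg(ctx, 0, ...),
	 * bpf_usdt_arg(ctx, 1, ...) and so on, which is exactly the code
	 * path patched above.
	 */
	bpf_printk("req %ld took %ld ns", req_id, latency_ns);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
```

Attachment is then done from user space, e.g. with bpf_program__attach_usdt(), passing the target binary path plus the USDT provider and probe names.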
Loading programs that use bpf_usdt_arg() on s390x fails with:

; switch (arg_spec->arg_type) {
139: (61) r1 = *(u32 *)(r2 +8)
R2 unbounded memory access, make sure to bounds check any such access

The bound checks seem to be already in place in the C code, and maybe
it's even possible to add extra bogus checks to placate LLVM or the
verifier. Take a simpler approach and just use a helper.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 tools/lib/bpf/usdt.bpf.h | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)