
[RFC,bpf-next,1/3] bpf: Fix certain narrow loads with offsets

Message ID 20220222182559.2865596-2-iii@linux.ibm.com (mailing list archive)
State RFC
Delegated to: BPF
Series: bpf_sk_lookup.remote_port fixes

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next fail VM_Test
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 20 this patch: 20
netdev/cc_maintainers fail 1 blamed authors not CCed: rdna@fb.com; 8 maintainers not CCed: andrii@kernel.org rdna@fb.com kpsingh@kernel.org john.fastabend@gmail.com kafai@fb.com songliubraving@fb.com yhs@fb.com netdev@vger.kernel.org
netdev/build_clang success Errors and warnings before: 18 this patch: 18
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 25 this patch: 25
netdev/checkpatch warning CHECK: Lines should not end with a '('
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Ilya Leoshkevich Feb. 22, 2022, 6:25 p.m. UTC
Verifier treats bpf_sk_lookup.remote_port as a 32-bit field for
backward compatibility, regardless of what the uapi headers say.
This field is mapped onto the 16-bit bpf_sk_lookup_kern.sport field.
Therefore, accessing the most significant 16 bits of
bpf_sk_lookup.remote_port must produce 0, which is currently not
the case.

The problem is that narrow loads with offset, introduced by commit
46f53a65d2de ("bpf: Allow narrow loads with offset > 0"), don't play
nicely with the masking optimization from commit 239946314e57 ("bpf:
possibly avoid extra masking for narrower load in verifier"). In
particular, when we suppress the extra masking, we suppress the
shifting as well, which is not correct.

Fix by moving the masking suppression check to BPF_AND generation.

Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 kernel/bpf/verifier.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
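
The semantics the fix establishes for a narrow context load can be
modeled in user space as follows (a sketch, not the kernel code; "full"
stands for the value produced by the full-width context load, "shift"
for bpf_ctx_narrow_access_offset(off, size, size_default) * 8):

    #include <stdint.h>

    /* User-space model, not kernel code: after the fix, the shift is
     * always applied, while the mask is applied only when the requested
     * size is narrower than the underlying kernel field.
     */
    static uint64_t narrow_load(uint64_t full, unsigned int shift,
                                unsigned int size, unsigned int target_size)
    {
            full >>= shift;
            if (size < target_size)
                    full &= (1ULL << size * 8) - 1;
            return full;
    }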

Comments

Jakub Sitnicki March 8, 2022, 3:01 p.m. UTC | #1
On Tue, Feb 22, 2022 at 07:25 PM +01, Ilya Leoshkevich wrote:
> Verifier treats bpf_sk_lookup.remote_port as a 32-bit field for
> backward compatibility, regardless of what the uapi headers say.
> This field is mapped onto the 16-bit bpf_sk_lookup_kern.sport field.
> Therefore, accessing the most significant 16 bits of
> bpf_sk_lookup.remote_port must produce 0, which is currently not
> the case.
>
> The problem is that narrow loads with offset - commit 46f53a65d2de
> ("bpf: Allow narrow loads with offset > 0"), don't play nicely with
> the masking optimization - commit 239946314e57 ("bpf: possibly avoid
> extra masking for narrower load in verifier"). In particular, when we
> suppress extra masking, we suppress shifting as well, which is not
> correct.
>
> Fix by moving the masking suppression check to BPF_AND generation.
>
> Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>  kernel/bpf/verifier.c | 14 +++++++++-----
>  1 file changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index d7473fee247c..195f2e9b5a47 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -12848,7 +12848,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>  			return -EINVAL;
>  		}
>  
> -		if (is_narrower_load && size < target_size) {
> +		if (is_narrower_load) {
>  			u8 shift = bpf_ctx_narrow_access_offset(
>  				off, size, size_default) * 8;
>  			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
> @@ -12860,15 +12860,19 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>  					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
>  									insn->dst_reg,
>  									shift);
> -				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
> -								(1 << size * 8) - 1);
> +				if (size < target_size)
> +					insn_buf[cnt++] = BPF_ALU32_IMM(
> +						BPF_AND, insn->dst_reg,
> +						(1 << size * 8) - 1);
>  			} else {
>  				if (shift)
>  					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
>  									insn->dst_reg,
>  									shift);
> -				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
> -								(1ULL << size * 8) - 1);
> +				if (size < target_size)
> +					insn_buf[cnt++] = BPF_ALU64_IMM(
> +						BPF_AND, insn->dst_reg,
> +						(1ULL << size * 8) - 1);
>  			}
>  		}

Thanks for your patience. I'm coming back to this.

This fix affects the 2-byte load from bpf_sk_lookup.remote_port.
Dumping the xlated BPF code confirms it.

On LE (x86-64) things look good.

Before this patch:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (b7) r0 = 0
   2: (95) exit

* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (b7) r0 = 0
   2: (95) exit

After this patch:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (b7) r0 = 0
   2: (95) exit

* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (74) w2 >>= 16
   2: (b7) r0 = 0
   3: (95) exit

Which works great because the JIT generates a zero-extending load, movzwq:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
bpf_prog_5e4fe3dbdcb18fd3:
   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   movzwq 0x4(%rdi),%rsi
  10:   xor    %eax,%eax
  12:   leave
  13:   ret


* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
bpf_prog_4a6336c64a340b96:
   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   movzwq 0x4(%rdi),%rsi
  10:   shr    $0x10,%esi
  13:   xor    %eax,%eax
  15:   leave
  16:   ret

Runtime checks for the bpf_sk_lookup.remote_port load and the 2 bytes of
zero padding following it, like below, pass with flying colors:

	ok = ctx->remote_port == bpf_htons(8008);
	if (!ok)
		return SK_DROP;
	ok = *((__u16 *)&ctx->remote_port + 1) == 0;
	if (!ok)
		return SK_DROP;

(The above checks compile to half-word (2-byte) loads.)


On BE (s390x) things look different:

Before the patch:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

After the patch:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (74) w2 >>= 16
   3: (bc) w2 = w2
   4: (b7) r0 = 0
   5: (95) exit

* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

These compile to:

* size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
bpf_prog_fdd58b8caca29f00:
   0:   j       0x0000000000000006
   4:   nopr
   6:   stmg    %r11,%r15,112(%r15)
   c:   la      %r13,64(%r15)
  10:   aghi    %r15,-96
  14:   llgh    %r3,4(%r2,%r0)
  1a:   srl     %r3,16
  1e:   llgfr   %r3,%r3
  22:   lgfi    %r14,0
  28:   lgr     %r2,%r14
  2c:   lmg     %r11,%r15,208(%r15)
  32:   br      %r14


* size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
bpf_prog_5e3d8e92223c6841:
   0:   j       0x0000000000000006
   4:   nopr
   6:   stmg    %r11,%r15,112(%r15)
   c:   la      %r13,64(%r15)
  10:   aghi    %r15,-96
  14:   llgh    %r3,4(%r2,%r0)
  1a:   lgfi    %r14,0
  20:   lgr     %r2,%r14
  24:   lmg     %r11,%r15,208(%r15)
  2a:   br      %r14

Now, we right shift the value when loading

  *(u16 *)(r1 +36)

which in C BPF is equivalent to

  *((__u16 *)&ctx->remote_port + 0)

due to how the shift is calculated by bpf_ctx_narrow_access_offset().
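
For reference, that helper computes an endianness-dependent offset; a
sketch consistent with include/linux/filter.h at the time of this series:

    static inline u32
    bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
    {
            u8 access_off = off & (size_default - 1);

    #ifdef __LITTLE_ENDIAN
            return access_off;
    #else
            return size_default - (access_off + size);
    #endif
    }

For a 2-byte load at offset 0 of a 4-byte field, big-endian thus yields
a shift of (4 - 2) * 8 = 16 bits, which is the w2 >>= 16 in the dump above.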

This makes the expected typical use-case

  ctx->remote_port == bpf_htons(8008)

fail on s390x because llgh (Load Logical Halfword (64<-16)) seems to lay
out the data in the destination register so that it holds
0x0000_0000_0000_1f48.

I don't know if that was the intention here, as it makes the BPF C code
non-portable.

WDYT?


BTW. Out of curiosity, how does a Logical Load Halfword (llgh) differ
from a non-logical Load Halfword (lgh) on s390x? Compiler
Explorer generates a non-logical load for similar C code.
Ilya Leoshkevich March 8, 2022, 11:58 p.m. UTC | #2
On Tue, 2022-03-08 at 16:01 +0100, Jakub Sitnicki wrote:
> On Tue, Feb 22, 2022 at 07:25 PM +01, Ilya Leoshkevich wrote:
> > Verifier treats bpf_sk_lookup.remote_port as a 32-bit field for
> > backward compatibility, regardless of what the uapi headers say.
> > This field is mapped onto the 16-bit bpf_sk_lookup_kern.sport
> > field.
> > Therefore, accessing the most significant 16 bits of
> > bpf_sk_lookup.remote_port must produce 0, which is currently not
> > the case.
> > 
> > The problem is that narrow loads with offset - commit 46f53a65d2de
> > ("bpf: Allow narrow loads with offset > 0"), don't play nicely with
> > the masking optimization - commit 239946314e57 ("bpf: possibly
> > avoid
> > extra masking for narrower load in verifier"). In particular, when
> > we
> > suppress extra masking, we suppress shifting as well, which is not
> > correct.
> > 
> > Fix by moving the masking suppression check to BPF_AND generation.
> > 
> > Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0")
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> >  kernel/bpf/verifier.c | 14 +++++++++-----
> >  1 file changed, 9 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index d7473fee247c..195f2e9b5a47 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -12848,7 +12848,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> >  			return -EINVAL;
> >  		}
> >  
> > -		if (is_narrower_load && size < target_size) {
> > +		if (is_narrower_load) {
> >  			u8 shift = bpf_ctx_narrow_access_offset(
> >  				off, size, size_default) * 8;
> >  			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
> > @@ -12860,15 +12860,19 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> >  					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
> >  									insn->dst_reg,
> >  									shift);
> > -				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
> > -								(1 << size * 8) - 1);
> > +				if (size < target_size)
> > +					insn_buf[cnt++] = BPF_ALU32_IMM(
> > +						BPF_AND, insn->dst_reg,
> > +						(1 << size * 8) - 1);
> >  			} else {
> >  				if (shift)
> >  					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
> >  									insn->dst_reg,
> >  									shift);
> > -				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
> > -								(1ULL << size * 8) - 1);
> > +				if (size < target_size)
> > +					insn_buf[cnt++] = BPF_ALU64_IMM(
> > +						BPF_AND, insn->dst_reg,
> > +						(1ULL << size * 8) - 1);
> >  			}
> >  		}
> 
> Thanks for patience. I'm coming back to this.
> 
> This fix affects the 2-byte load from bpf_sk_lookup.remote_port.
> Dumping the xlated BPF code confirms it.
> 
> On LE (x86-64) things look well.
> 
> Before this patch:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (b7) r0 = 0
>    2: (95) exit
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (b7) r0 = 0
>    2: (95) exit
> 
> After this patch:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (b7) r0 = 0
>    2: (95) exit
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (74) w2 >>= 16
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> Which works great because the JIT generates a zero-extended load
> movzwq:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> bpf_prog_5e4fe3dbdcb18fd3:
>    0:   nopl   0x0(%rax,%rax,1)
>    5:   xchg   %ax,%ax
>    7:   push   %rbp
>    8:   mov    %rsp,%rbp
>    b:   movzwq 0x4(%rdi),%rsi
>   10:   xor    %eax,%eax
>   12:   leave
>   13:   ret
> 
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> bpf_prog_4a6336c64a340b96:
>    0:   nopl   0x0(%rax,%rax,1)
>    5:   xchg   %ax,%ax
>    7:   push   %rbp
>    8:   mov    %rsp,%rbp
>    b:   movzwq 0x4(%rdi),%rsi
>   10:   shr    $0x10,%esi
>   13:   xor    %eax,%eax
>   15:   leave
>   16:   ret
> 
> Runtime checks for bpf_sk_lookup.remote_port load and the 2-bytes of
> zero padding following it, like below, pass with flying colors:
> 
>         ok = ctx->remote_port == bpf_htons(8008);
>         if (!ok)
>                 return SK_DROP;
>         ok = *((__u16 *)&ctx->remote_port + 1) == 0;
>         if (!ok)
>                 return SK_DROP;
> 
> (The above checks compile to half-word (2-byte) loads.)
> 
> 
> On BE (s390x) things look different:
> 
> Before the patch:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> After the patch:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (74) w2 >>= 16
>    3: (bc) w2 = w2
>    4: (b7) r0 = 0
>    5: (95) exit
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> These compile to:
> 
> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> bpf_prog_fdd58b8caca29f00:
>    0:   j       0x0000000000000006
>    4:   nopr
>    6:   stmg    %r11,%r15,112(%r15)
>    c:   la      %r13,64(%r15)
>   10:   aghi    %r15,-96
>   14:   llgh    %r3,4(%r2,%r0)
>   1a:   srl     %r3,16
>   1e:   llgfr   %r3,%r3
>   22:   lgfi    %r14,0
>   28:   lgr     %r2,%r14
>   2c:   lmg     %r11,%r15,208(%r15)
>   32:   br      %r14
> 
> 
> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> bpf_prog_5e3d8e92223c6841:
>    0:   j       0x0000000000000006
>    4:   nopr
>    6:   stmg    %r11,%r15,112(%r15)
>    c:   la      %r13,64(%r15)
>   10:   aghi    %r15,-96
>   14:   llgh    %r3,4(%r2,%r0)
>   1a:   lgfi    %r14,0
>   20:   lgr     %r2,%r14
>   24:   lmg     %r11,%r15,208(%r15)
>   2a:   br      %r14
> 
> Now, we right shift the value when loading
> 
>   *(u16 *)(r1 +36)
> 
> which in C BPF is equivalent to
> 
>   *((__u16 *)&ctx->remote_port + 0)
> 
> due to how the shift is calculated by bpf_ctx_narrow_access_offset().

Right, that's exactly the intention here.
The way I see the situation is: the ABI forces us to treat remote_port
as a 32-bit field, even though the updated header now says otherwise.
And this:

    unsigned int remote_port;
    unsigned short result = *(unsigned short *)&remote_port;

should be the same as:

    unsigned short result = remote_port >> 16;

on big-endian. Note that this is inherently non-portable.
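
A compilable illustration of this (host-endianness dependent by design):

    #include <stdio.h>

    int main(void)
    {
            unsigned int remote_port = 0x00001f48; /* 8008 in the low 16 bits */
            unsigned short result = *(unsigned short *)&remote_port;

            /* Big-endian layout 00 00 1f 48: result == 0x0000, i.e.
             * remote_port >> 16. Little-endian layout 48 1f 00 00:
             * result == 0x1f48.
             */
            printf("result = 0x%04x\n", result);
            return 0;
    }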

> This makes the expected typical use-case
> 
>   ctx->remote_port == bpf_htons(8008)
> 
> fail on s390x because llgh (Load Logical Halfword (64<-16)) seems to
> lay out the data in the destination register so that it holds
> 0x0000_0000_0000_1f48.
> 
> I don't know if that was the intention here, as it makes the BPF C code
> non-portable.
> 
> WDYT?

This depends on how we define the remote_port field. I would argue that
the definition from patch 2 - even though ugly - is the correct one.
It is consistent with both the little-endian (1f 48 00 00) and
big-endian (00 00 1f 48) ABIs.

I don't think the current definition is correct, because it expects
1f 48 00 00 on big-endian, and this is not the case. We can verify this
by taking 9a69e2^ and applying

--- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
@@ -417,6 +417,8 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
                return SK_DROP;
        if (LSW(ctx->remote_port, 0) != SRC_PORT)
                return SK_DROP;
+       if (ctx->remote_port != SRC_PORT)
+               return SK_DROP;
 
        /* Narrow loads from local_port field. Expect DST_PORT. */
        if (LSB(ctx->local_port, 0) != ((DST_PORT >> 0) & 0xff) ||

Therefore, the fact that

  ctx->remote_port == bpf_htons(8008)

fails without patch 2 is expected.

> BTW. Out of curiosity, how does a Logical Load Halfword (llgh) differ
> from a non-logical Load Halfword (lgh) on s390x? Compiler
> Explorer generates a non-logical load for similar C code.

The logical one does zero extension, and the regular one does sign
extension.

The following

  unsigned long foo(unsigned short *bar) {
          return *bar;
  }

is compiled to

  foo(unsigned short*):
          llgh    %r2,0(%r2)
          br      %r14

with -O3. Without -O3, zeroing out the upper bits is done using sllg
and srlg (logical shift left and shift right, respectively).
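
In C terms the distinction is zero- vs sign-extension; a minimal sketch
(hypothetical function names):

    /* At -O3 on s390x these plausibly compile to llgh and lgh. */
    unsigned long load_zero_extended(const unsigned short *p) { return *p; }
    long load_sign_extended(const short *p) { return *p; }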
Jakub Sitnicki March 9, 2022, 8:36 a.m. UTC | #3
On Wed, Mar 09, 2022 at 12:58 AM +01, Ilya Leoshkevich wrote:
> On Tue, 2022-03-08 at 16:01 +0100, Jakub Sitnicki wrote:
>> On Tue, Feb 22, 2022 at 07:25 PM +01, Ilya Leoshkevich wrote:
>> > Verifier treats bpf_sk_lookup.remote_port as a 32-bit field for
>> > backward compatibility, regardless of what the uapi headers say.
>> > This field is mapped onto the 16-bit bpf_sk_lookup_kern.sport
>> > field.
>> > Therefore, accessing the most significant 16 bits of
>> > bpf_sk_lookup.remote_port must produce 0, which is currently not
>> > the case.
>> > 
>> > The problem is that narrow loads with offset - commit 46f53a65d2de
>> > ("bpf: Allow narrow loads with offset > 0"), don't play nicely with
>> > the masking optimization - commit 239946314e57 ("bpf: possibly
>> > avoid
>> > extra masking for narrower load in verifier"). In particular, when
>> > we
>> > suppress extra masking, we suppress shifting as well, which is not
>> > correct.
>> > 
>> > Fix by moving the masking suppression check to BPF_AND generation.
>> > 
>> > Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0")
>> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>> > ---
>> >  kernel/bpf/verifier.c | 14 +++++++++-----
>> >  1 file changed, 9 insertions(+), 5 deletions(-)
>> > 
>> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> > index d7473fee247c..195f2e9b5a47 100644
>> > --- a/kernel/bpf/verifier.c
>> > +++ b/kernel/bpf/verifier.c
>> > @@ -12848,7 +12848,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>> >  			return -EINVAL;
>> >  		}
>> >  
>> > -		if (is_narrower_load && size < target_size) {
>> > +		if (is_narrower_load) {
>> >  			u8 shift = bpf_ctx_narrow_access_offset(
>> >  				off, size, size_default) * 8;
>> >  			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
>> > @@ -12860,15 +12860,19 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
>> >  					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
>> >  									insn->dst_reg,
>> >  									shift);
>> > -				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
>> > -								(1 << size * 8) - 1);
>> > +				if (size < target_size)
>> > +					insn_buf[cnt++] = BPF_ALU32_IMM(
>> > +						BPF_AND, insn->dst_reg,
>> > +						(1 << size * 8) - 1);
>> >  			} else {
>> >  				if (shift)
>> >  					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
>> >  									insn->dst_reg,
>> >  									shift);
>> > -				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
>> > -								(1ULL << size * 8) - 1);
>> > +				if (size < target_size)
>> > +					insn_buf[cnt++] = BPF_ALU64_IMM(
>> > +						BPF_AND, insn->dst_reg,
>> > +						(1ULL << size * 8) - 1);
>> >  			}
>> >  		}
>> 
>> Thanks for patience. I'm coming back to this.
>> 
>> This fix affects the 2-byte load from bpf_sk_lookup.remote_port.
>> Dumping the xlated BPF code confirms it.
>> 
>> On LE (x86-64) things look well.
>> 
>> Before this patch:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (b7) r0 = 0
>>    2: (95) exit
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (b7) r0 = 0
>>    2: (95) exit
>> 
>> After this patch:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (b7) r0 = 0
>>    2: (95) exit
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (74) w2 >>= 16
>>    2: (b7) r0 = 0
>>    3: (95) exit
>> 
>> Which works great because the JIT generates a zero-extended load
>> movzwq:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>> bpf_prog_5e4fe3dbdcb18fd3:
>>    0:   nopl   0x0(%rax,%rax,1)
>>    5:   xchg   %ax,%ax
>>    7:   push   %rbp
>>    8:   mov    %rsp,%rbp
>>    b:   movzwq 0x4(%rdi),%rsi
>>   10:   xor    %eax,%eax
>>   12:   leave
>>   13:   ret
>> 
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>> bpf_prog_4a6336c64a340b96:
>>    0:   nopl   0x0(%rax,%rax,1)
>>    5:   xchg   %ax,%ax
>>    7:   push   %rbp
>>    8:   mov    %rsp,%rbp
>>    b:   movzwq 0x4(%rdi),%rsi
>>   10:   shr    $0x10,%esi
>>   13:   xor    %eax,%eax
>>   15:   leave
>>   16:   ret
>> 
>> Runtime checks for bpf_sk_lookup.remote_port load and the 2-bytes of
>> zero padding following it, like below, pass with flying colors:
>> 
>>         ok = ctx->remote_port == bpf_htons(8008);
>>         if (!ok)
>>                 return SK_DROP;
>>         ok = *((__u16 *)&ctx->remote_port + 1) == 0;
>>         if (!ok)
>>                 return SK_DROP;
>> 
>> (The above checks compile to half-word (2-byte) loads.)
>> 
>> 
>> On BE (s390x) things look different:
>> 
>> Before the patch:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (bc) w2 = w2
>>    2: (b7) r0 = 0
>>    3: (95) exit
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (bc) w2 = w2
>>    2: (b7) r0 = 0
>>    3: (95) exit
>> 
>> After the patch:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (bc) w2 = w2
>>    2: (74) w2 >>= 16
>>    3: (bc) w2 = w2
>>    4: (b7) r0 = 0
>>    5: (95) exit
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>>    0: (69) r2 = *(u16 *)(r1 +4)
>>    1: (bc) w2 = w2
>>    2: (b7) r0 = 0
>>    3: (95) exit
>> 
>> These compile to:
>> 
>> * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
>> bpf_prog_fdd58b8caca29f00:
>>    0:   j       0x0000000000000006
>>    4:   nopr
>>    6:   stmg    %r11,%r15,112(%r15)
>>    c:   la      %r13,64(%r15)
>>   10:   aghi    %r15,-96
>>   14:   llgh    %r3,4(%r2,%r0)
>>   1a:   srl     %r3,16
>>   1e:   llgfr   %r3,%r3
>>   22:   lgfi    %r14,0
>>   28:   lgr     %r2,%r14
>>   2c:   lmg     %r11,%r15,208(%r15)
>>   32:   br      %r14
>> 
>> 
>> * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
>> bpf_prog_5e3d8e92223c6841:
>>    0:   j       0x0000000000000006
>>    4:   nopr
>>    6:   stmg    %r11,%r15,112(%r15)
>>    c:   la      %r13,64(%r15)
>>   10:   aghi    %r15,-96
>>   14:   llgh    %r3,4(%r2,%r0)
>>   1a:   lgfi    %r14,0
>>   20:   lgr     %r2,%r14
>>   24:   lmg     %r11,%r15,208(%r15)
>>   2a:   br      %r14
>> 
>> Now, we right shift the value when loading
>> 
>>   *(u16 *)(r1 +36)
>> 
>> which in C BPF is equivalent to
>> 
>>   *((__u16 *)&ctx->remote_port + 0)
>> 
>> due to how the shift is calculated by bpf_ctx_narrow_access_offset().
>
> Right, that's exactly the intention here.
> The way I see the situation is: the ABI forces us to treat remote_port
> as a 32-bit field, even though the updated header now says otherwise.
> And this:
>
>     unsigned int remote_port;
>     unsigned short result = *(unsigned short *)&remote_port;
>
> should be the same as:
>
>     unsigned short result = remote_port >> 16;
>
> on big-endian. Note that this is inherently non-portable.

>
>> This makes the expected typical use-case
>> 
>>   ctx->remote_port == bpf_htons(8008)
>> 
>> fail on s390x because llgh (Load Logical Halfword (64<-16)) seems to
>> lay out the data in the destination register so that it holds
>> 0x0000_0000_0000_1f48.
>> 
>> I don't know if that was the intention here, as it makes the BPF C code
>> non-portable.
>> 
>> WDYT?
>
> This depends on how we define the remote_port field. I would argue that
> the definition from patch 2 - even though ugly - is the correct one.
> It is consistent with both the little-endian (1f 48 00 00) and
> big-endian (00 00 1f 48) ABIs.
>
> I don't think the current definition is correct, because it expects
> 1f 48 00 00 on big-endian, and this is not the case. We can verify this
> by taking 9a69e2^ and applying
>
> --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c
> +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
> @@ -417,6 +417,8 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)
>                 return SK_DROP;
>         if (LSW(ctx->remote_port, 0) != SRC_PORT)
>                 return SK_DROP;
> +       if (ctx->remote_port != SRC_PORT)
> +               return SK_DROP;
>  
>         /* Narrow loads from local_port field. Expect DST_PORT. */
>         if (LSB(ctx->local_port, 0) != ((DST_PORT >> 0) & 0xff) ||
>
> Therefore, the fact that
>
>   ctx->remote_port == bpf_htons(8008)
>
> fails without patch 2 is expected.
>

Consider this - today the below is true on both LE and BE, right?

  *(u32 *)&ctx->remote_port == *(u16 *)&ctx->remote_port

because the loads get converted to:

  *(u16 *)&ctx_kern->sport == *(u16 *)&ctx_kern->sport

IOW, today, because of the bug that you are fixing here, the data layout
changes from the PoV of the BPF program depending on the load size.
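
To make the size-dependent view concrete, a sketch of an sk_lookup
program (hypothetical name, for illustration only) where both loads pass
the verifier and compare equal today:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("sk_lookup")
    int size_dependent_view(struct bpf_sk_lookup *ctx)
    {
            __u32 wide   = *(__u32 *)&ctx->remote_port; /* rewritten to a 2-byte kernel load */
            __u16 narrow = *(__u16 *)&ctx->remote_port; /* the same 2-byte kernel load */

            return wide == narrow ? SK_PASS : SK_DROP;  /* always SK_PASS today */
    }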

With 2-byte loads, without this patch, the data layout appears as:

  struct bpf_sk_lookup {
    ...
    __be16 remote_port;
    __be16 remote_port;
    ...
  }

While for 4-byte loads, it appears as in your 2nd patch:

  struct bpf_sk_lookup {
    ...
    #if little-endian
    __be16 remote_port;
    __u16  :16; /* zero padding */
    #elif big-endian
    __u16  :16; /* zero padding */
    __be16 remote_port;
    #endif
    ...
  }

Because of that I don't see how we could keep complete ABI compatibility,
and have just one definition of struct bpf_sk_lookup that reflects
it. These are conflicting requirements.

I'd bite the bullet for 4-byte loads, for the sake of having an
endian-agnostic struct bpf_sk_lookup and struct bpf_sock definition in
the uAPI header.

The sacrifice here is that the access converter will have to keep
rewriting 4-byte access to bpf_sk_lookup.remote_port and
bpf_sock.dst_port in this unexpected, quirky manner.

The expectation is that with time users will recompile their BPF progs
against the updated bpf.h, and switch to 2-byte loads. That will make
the quirk in the access converter dead code in time.

I don't have any better ideas. Sorry.

[...]
Ilya Leoshkevich March 9, 2022, 12:34 p.m. UTC | #4
On Wed, 2022-03-09 at 09:36 +0100, Jakub Sitnicki wrote:
> On Wed, Mar 09, 2022 at 12:58 AM +01, Ilya Leoshkevich wrote:
> > On Tue, 2022-03-08 at 16:01 +0100, Jakub Sitnicki wrote:
> > > On Tue, Feb 22, 2022 at 07:25 PM +01, Ilya Leoshkevich wrote:
> > > > Verifier treats bpf_sk_lookup.remote_port as a 32-bit field for
> > > > backward compatibility, regardless of what the uapi headers
> > > > say.
> > > > This field is mapped onto the 16-bit bpf_sk_lookup_kern.sport
> > > > field.
> > > > Therefore, accessing the most significant 16 bits of
> > > > bpf_sk_lookup.remote_port must produce 0, which is currently
> > > > not
> > > > the case.
> > > > 
> > > > The problem is that narrow loads with offset - commit
> > > > 46f53a65d2de
> > > > ("bpf: Allow narrow loads with offset > 0"), don't play nicely
> > > > with
> > > > the masking optimization - commit 239946314e57 ("bpf: possibly
> > > > avoid
> > > > extra masking for narrower load in verifier"). In particular,
> > > > when
> > > > we
> > > > suppress extra masking, we suppress shifting as well, which is
> > > > not
> > > > correct.
> > > > 
> > > > Fix by moving the masking suppression check to BPF_AND
> > > > generation.
> > > > 
> > > > Fixes: 46f53a65d2de ("bpf: Allow narrow loads with offset > 0")
> > > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > > > ---
> > > >  kernel/bpf/verifier.c | 14 +++++++++-----
> > > >  1 file changed, 9 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > > > index d7473fee247c..195f2e9b5a47 100644
> > > > --- a/kernel/bpf/verifier.c
> > > > +++ b/kernel/bpf/verifier.c
> > > > @@ -12848,7 +12848,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> > > >  			return -EINVAL;
> > > >  		}
> > > >  
> > > > -		if (is_narrower_load && size < target_size) {
> > > > +		if (is_narrower_load) {
> > > >  			u8 shift = bpf_ctx_narrow_access_offset(
> > > >  				off, size, size_default) * 8;
> > > >  			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
> > > > @@ -12860,15 +12860,19 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
> > > >  					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
> > > >  									insn->dst_reg,
> > > >  									shift);
> > > > -				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
> > > > -								(1 << size * 8) - 1);
> > > > +				if (size < target_size)
> > > > +					insn_buf[cnt++] = BPF_ALU32_IMM(
> > > > +						BPF_AND, insn->dst_reg,
> > > > +						(1 << size * 8) - 1);
> > > >  			} else {
> > > >  				if (shift)
> > > >  					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
> > > >  									insn->dst_reg,
> > > >  									shift);
> > > > -				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
> > > > -								(1ULL << size * 8) - 1);
> > > > +				if (size < target_size)
> > > > +					insn_buf[cnt++] = BPF_ALU64_IMM(
> > > > +						BPF_AND, insn->dst_reg,
> > > > +						(1ULL << size * 8) - 1);
> > > >  			}
> > > >  		}
> > > 
> > > Thanks for patience. I'm coming back to this.
> > > 
> > > This fix affects the 2-byte load from bpf_sk_lookup.remote_port.
> > > Dumping the xlated BPF code confirms it.
> > > 
> > > On LE (x86-64) things look well.
> > > 
> > > Before this patch:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (b7) r0 = 0
> > >    2: (95) exit
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (b7) r0 = 0
> > >    2: (95) exit
> > > 
> > > After this patch:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (b7) r0 = 0
> > >    2: (95) exit
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (74) w2 >>= 16
> > >    2: (b7) r0 = 0
> > >    3: (95) exit
> > > 
> > > Which works great because the JIT generates a zero-extended load
> > > movzwq:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > > bpf_prog_5e4fe3dbdcb18fd3:
> > >    0:   nopl   0x0(%rax,%rax,1)
> > >    5:   xchg   %ax,%ax
> > >    7:   push   %rbp
> > >    8:   mov    %rsp,%rbp
> > >    b:   movzwq 0x4(%rdi),%rsi
> > >   10:   xor    %eax,%eax
> > >   12:   leave
> > >   13:   ret
> > > 
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > > bpf_prog_4a6336c64a340b96:
> > >    0:   nopl   0x0(%rax,%rax,1)
> > >    5:   xchg   %ax,%ax
> > >    7:   push   %rbp
> > >    8:   mov    %rsp,%rbp
> > >    b:   movzwq 0x4(%rdi),%rsi
> > >   10:   shr    $0x10,%esi
> > >   13:   xor    %eax,%eax
> > >   15:   leave
> > >   16:   ret
> > > 
> > > Runtime checks for bpf_sk_lookup.remote_port load and the 2-bytes of
> > > zero padding following it, like below, pass with flying colors:
> > > 
> > >         ok = ctx->remote_port == bpf_htons(8008);
> > >         if (!ok)
> > >                 return SK_DROP;
> > >         ok = *((__u16 *)&ctx->remote_port + 1) == 0;
> > >         if (!ok)
> > >                 return SK_DROP;
> > > 
> > > (The above checks compile to half-word (2-byte) loads.)
> > > 
> > > 
> > > On BE (s390x) things look different:
> > > 
> > > Before the patch:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (bc) w2 = w2
> > >    2: (b7) r0 = 0
> > >    3: (95) exit
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (bc) w2 = w2
> > >    2: (b7) r0 = 0
> > >    3: (95) exit
> > > 
> > > After the patch:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (bc) w2 = w2
> > >    2: (74) w2 >>= 16
> > >    3: (bc) w2 = w2
> > >    4: (b7) r0 = 0
> > >    5: (95) exit
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > >    0: (69) r2 = *(u16 *)(r1 +4)
> > >    1: (bc) w2 = w2
> > >    2: (b7) r0 = 0
> > >    3: (95) exit
> > > 
> > > These compile to:
> > > 
> > > * size=2, offset=0, 0: (69) r2 = *(u16 *)(r1 +36)
> > > bpf_prog_fdd58b8caca29f00:
> > >    0:   j       0x0000000000000006
> > >    4:   nopr
> > >    6:   stmg    %r11,%r15,112(%r15)
> > >    c:   la      %r13,64(%r15)
> > >   10:   aghi    %r15,-96
> > >   14:   llgh    %r3,4(%r2,%r0)
> > >   1a:   srl     %r3,16
> > >   1e:   llgfr   %r3,%r3
> > >   22:   lgfi    %r14,0
> > >   28:   lgr     %r2,%r14
> > >   2c:   lmg     %r11,%r15,208(%r15)
> > >   32:   br      %r14
> > > 
> > > 
> > > * size=2, offset=2, 0: (69) r2 = *(u16 *)(r1 +38)
> > > bpf_prog_5e3d8e92223c6841:
> > >    0:   j       0x0000000000000006
> > >    4:   nopr
> > >    6:   stmg    %r11,%r15,112(%r15)
> > >    c:   la      %r13,64(%r15)
> > >   10:   aghi    %r15,-96
> > >   14:   llgh    %r3,4(%r2,%r0)
> > >   1a:   lgfi    %r14,0
> > >   20:   lgr     %r2,%r14
> > >   24:   lmg     %r11,%r15,208(%r15)
> > >   2a:   br      %r14
> > > 
> > > Now, we right shift the value when loading
> > > 
> > >   *(u16 *)(r1 +36)
> > > 
> > > which in C BPF is equivalent to
> > > 
> > >   *((__u16 *)&ctx->remote_port + 0)
> > > 
> > > due to how the shift is calculated by
> > > bpf_ctx_narrow_access_offset().
> > 
> > Right, that's exactly the intention here.
> > The way I see the situation is: the ABI forces us to treat
> > remote_port
> > as a 32-bit field, even though the updated header now says
> > otherwise.
> > And this:
> > 
> >     unsigned int remote_port;
> >     unsigned short result = *(unsigned short *)&remote_port;
> > 
> > should be the same as:
> > 
> >     unsigned short result = remote_port >> 16;
> > 
> > on big-endian. Note that this is inherently non-portable.
> 
> > 
> > > This makes the expected typical use-case
> > > 
> > >   ctx->remote_port == bpf_htons(8008)
> > > 
> > > fail on s390x because llgh (Load Logical Halfword (64<-16)) seems to
> > > lay out the data in the destination register so that it holds
> > > 0x0000_0000_0000_1f48.
> > > 
> > > I don't know if that was the intention here, as it makes the BPF C
> > > code non-portable.
> > > 
> > > WDYT?
> > 
> > This depends on how we define the remote_port field. I would argue that
> > the definition from patch 2 - even though ugly - is the correct one.
> > It is consistent with both the little-endian (1f 48 00 00) and
> > big-endian (00 00 1f 48) ABIs.
> > 
> > I don't think the current definition is correct, because it expects
> > 1f 48 00 00 on big-endian, and this is not the case. We can verify this
> > by taking 9a69e2^ and applying
> > 
> > --- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c
> > +++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
> > @@ -417,6 +417,8 @@ int ctx_narrow_access(struct bpf_sk_lookup
> > *ctx)
> >                 return SK_DROP;
> >         if (LSW(ctx->remote_port, 0) != SRC_PORT)
> >                 return SK_DROP;
> > +       if (ctx->remote_port != SRC_PORT)
> > +               return SK_DROP;
> >  
> >         /* Narrow loads from local_port field. Expect DST_PORT. */
> >         if (LSB(ctx->local_port, 0) != ((DST_PORT >> 0) & 0xff) ||
> > 
> > Therefore, the fact that
> > 
> >   ctx->remote_port == bpf_htons(8008)
> > 
> > fails without patch 2 is expected.
> > 
> 
> Consider this - today the below is true on both LE and BE, right?
> 
>   *(u32 *)&ctx->remote_port == *(u16 *)&ctx->remote_port
> 
> because the loads get converted to:
> 
>   *(u16 *)&ctx_kern->sport == *(u16 *)&ctx_kern->sport
> 
> IOW, today, because of the bug that you are fixing here, the data
> layout
> changes from the PoV of the BPF program depending on the load size.
> 
> With 2-byte loads, without this patch, the data layout appears as:
> 
>   struct bpf_sk_lookup {
>     ...
>     __be16 remote_port;
>     __be16 remote_port;
>     ...
>   }

I see, one can indeed argue that this is also a part of the ABI now.
So we're stuck between a rock and a hard place.

> While for 4-byte loads, it appears as in your 2nd patch:
> 
>   struct bpf_sk_lookup {
>     ...
>     #if little-endian
>     __be16 remote_port;
>     __u16  :16; /* zero padding */
>     #elif big-endian
>     __u16  :16; /* zero padding */
>     __be16 remote_port;
>     #endif
>     ...
>   }
> 
> Because of that I don't see how we could keep complete ABI
> compatibility,
> and have just one definition of struct bpf_sk_lookup that reflects
> it. These are conflicting requirements.
> 
> I'd bite the bullet for 4-byte loads, for the sake of having an
> endian-agnostic struct bpf_sk_lookup and struct bpf_sock definition
> in
> the uAPI header.
> 
> The sacrifice here is that the access converter will have to keep
> rewriting 4-byte access to bpf_sk_lookup.remote_port and
> bpf_sock.dst_port in this unexpected, quirky manner.
> 
> The expectation is that with time users will recompile their BPF
> progs
> against the updated bpf.h, and switch to 2-byte loads. That will make
> the quirk in the access converter dead code in time.
> 
> I don't have any better ideas. Sorry.
> 
> [...]

I agree, let's go ahead with this solution.

The only remaining problem that I see is: the bug is in the common
code, and it will affect the fields that we add in the future.

Can we either document this state of things in a comment, or fix the
bug and emulate the old behavior for certain fields?
Jakub Sitnicki March 10, 2022, 10:57 p.m. UTC | #5
On Wed, Mar 09, 2022 at 01:34 PM +01, Ilya Leoshkevich wrote:
> On Wed, 2022-03-09 at 09:36 +0100, Jakub Sitnicki wrote:

[...]

>> 
>> Consider this - today the below is true on both LE and BE, right?
>> 
>>   *(u32 *)&ctx->remote_port == *(u16 *)&ctx->remote_port
>> 
>> because the loads get converted to:
>> 
>>   *(u16 *)&ctx_kern->sport == *(u16 *)&ctx_kern->sport
>> 
>> IOW, today, because of the bug that you are fixing here, the data
>> layout
>> changes from the PoV of the BPF program depending on the load size.
>> 
>> With 2-byte loads, without this patch, the data layout appears as:
>> 
>>   struct bpf_sk_lookup {
>>     ...
>>     __be16 remote_port;
>>     __be16 remote_port;
>>     ...
>>   }
>
> I see, one can indeed argue that this is also a part of the ABI now.
> So we're stuck between a rock and a hard place.
>
>> While for 4-byte loads, it appears as in your 2nd patch:
>> 
>>   struct bpf_sk_lookup {
>>     ...
>>     #if little-endian
>>     __be16 remote_port;
>>     __u16  :16; /* zero padding */
>>     #elif big-endian
>>     __u16  :16; /* zero padding */
>>     __be16 remote_port;
>>     #endif
>>     ...
>>   }
>> 
>> Because of that I don't see how we could keep complete ABI
>> compatibility,
>> and have just one definition of struct bpf_sk_lookup that reflects
>> it. These are conflicting requirements.
>> 
>> I'd bite the bullet for 4-byte loads, for the sake of having an
>> endian-agnostic struct bpf_sk_lookup and struct bpf_sock definition
>> in
>> the uAPI header.
>> 
>> The sacrifice here is that the access converter will have to keep
>> rewriting 4-byte access to bpf_sk_lookup.remote_port and
>> bpf_sock.dst_port in this unexpected, quirky manner.
>> 
>> The expectation is that with time users will recompile their BPF
>> progs
>> against the updated bpf.h, and switch to 2-byte loads. That will make
>> the quirk in the access converter dead code in time.
>> 
>> I don't have any better ideas. Sorry.
>> 
>> [...]
>
> I agree, let's go ahead with this solution.
>
> The only remaining problem that I see is: the bug is in the common
> code, and it will affect the fields that we add in the future.
>
> Can we either document this state of things in a comment, or fix the
> bug and emulate the old behavior for certain fields?

I think we can fix the bug in the common code, and compensate for the
quirky 4-byte access to bpf_sk_lookup.remote_port and bpf_sock.dst_port
in the is_valid_access and convert_ctx_access.

With the patch as below, access to remote_port gets rewritten to:

* size=1, offset=0, r2 = *(u8 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (54) w2 &= 255
   2: (b7) r0 = 0
   3: (95) exit

* size=1, offset=1, r2 = *(u8 *)(r1 +37)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (74) w2 >>= 8
   2: (54) w2 &= 255
   3: (b7) r0 = 0
   4: (95) exit

* size=1, offset=2, r2 = *(u8 *)(r1 +38)
   0: (b4) w2 = 0
   1: (54) w2 &= 255
   2: (b7) r0 = 0
   3: (95) exit

* size=1, offset=3, r2 = *(u8 *)(r1 +39)
   0: (b4) w2 = 0
   1: (74) w2 >>= 8
   2: (54) w2 &= 255
   3: (b7) r0 = 0
   4: (95) exit

* size=2, offset=0, r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (b7) r0 = 0
   2: (95) exit

* size=2, offset=2, r2 = *(u16 *)(r1 +38)
   0: (b4) w2 = 0
   1: (b7) r0 = 0
   2: (95) exit

* size=4, offset=0, r2 = *(u32 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (b7) r0 = 0
   2: (95) exit

How does that look to you?

Still need to give it a test on s390x.

--8<--

diff --git a/net/core/filter.c b/net/core/filter.c
index 65869fd510e8..2625a1d2dabc 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10856,13 +10856,24 @@ static bool sk_lookup_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
 	case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
 	case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
-	case offsetof(struct bpf_sk_lookup, remote_port) ...
-	     offsetof(struct bpf_sk_lookup, local_ip4) - 1:
 	case bpf_ctx_range(struct bpf_sk_lookup, local_port):
 	case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
 		bpf_ctx_record_field_size(info, sizeof(__u32));
 		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
 
+	case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+		/* Allow 4-byte access to 2-byte field for backward compatibility */
+		if (size == sizeof(__u32))
+			return off == offsetof(struct bpf_sk_lookup, remote_port);
+		bpf_ctx_record_field_size(info, sizeof(__be16));
+		return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
+
+	case offsetofend(struct bpf_sk_lookup, remote_port) ...
+	     offsetof(struct bpf_sk_lookup, local_ip4) - 1:
+	/* Allow access to zero padding for backward compatibility */
+		bpf_ctx_record_field_size(info, sizeof(__u16));
+		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
+
 	default:
 		return false;
 	}
@@ -10944,6 +10955,11 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
 						     sport, 2, target_size));
 		break;
 
+	case offsetofend(struct bpf_sk_lookup, remote_port):
+		*target_size = 2;
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+		break;
+
 	case offsetof(struct bpf_sk_lookup, local_port):
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
 				      bpf_target_off(struct bpf_sk_lookup_kern,
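
For context, the bpf_ctx_narrow_access_ok() check used above accepts any
power-of-two access size up to the recorded field size; a sketch
consistent with include/linux/filter.h (off is validated elsewhere):

    static inline bool
    bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
    {
            return size <= size_default && (size & (size - 1)) == 0;
    }

With sizeof(__be16) passed in, only 1- and 2-byte loads reach the narrow
path, while the 4-byte case is handled by the explicit branch above.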
Jakub Sitnicki March 14, 2022, 5:35 p.m. UTC | #6
On Thu, Mar 10, 2022 at 11:57 PM +01, Jakub Sitnicki wrote:
> On Wed, Mar 09, 2022 at 01:34 PM +01, Ilya Leoshkevich wrote:
>> On Wed, 2022-03-09 at 09:36 +0100, Jakub Sitnicki wrote:
>
> [...]
>
>>> 
>>> Consider this - today the below is true on both LE and BE, right?
>>> 
>>>   *(u32 *)&ctx->remote_port == *(u16 *)&ctx->remote_port
>>> 
>>> because the loads get converted to:
>>> 
>>>   *(u16 *)&ctx_kern->sport == *(u16 *)&ctx_kern->sport
>>> 
>>> IOW, today, because of the bug that you are fixing here, the data
>>> layout
>>> changes from the PoV of the BPF program depending on the load size.
>>> 
>>> With 2-byte loads, without this patch, the data layout appears as:
>>> 
>>>   struct bpf_sk_lookup {
>>>     ...
>>>     __be16 remote_port;
>>>     __be16 remote_port;
>>>     ...
>>>   }
>>
>> I see, one can indeed argue that this is also a part of the ABI now.
>> So we're stuck between a rock and a hard place.
>>
>>> While for 4-byte loads, it appears as in your 2nd patch:
>>> 
>>>   struct bpf_sk_lookup {
>>>     ...
>>>     #if little-endian
>>>     __be16 remote_port;
>>>     __u16  :16; /* zero padding */
>>>     #elif big-endian
>>>     __u16  :16; /* zero padding */
>>>     __be16 remote_port;
>>>     #endif
>>>     ...
>>>   }
>>> 
>>> Because of that I don't see how we could keep complete ABI
>>> compatibility,
>>> and have just one definition of struct bpf_sk_lookup that reflects
>>> it. These are conflicting requirements.
>>> 
>>> I'd bite the bullet for 4-byte loads, for the sake of having an
>>> endian-agnostic struct bpf_sk_lookup and struct bpf_sock definition
>>> in
>>> the uAPI header.
>>> 
>>> The sacrifice here is that the access converter will have to keep
>>> rewriting 4-byte access to bpf_sk_lookup.remote_port and
>>> bpf_sock.dst_port in this unexpected, quirky manner.
>>> 
>>> The expectation is that with time users will recompile their BPF
>>> progs
>>> against the updated bpf.h, and switch to 2-byte loads. That will make
>>> the quirk in the access converter dead code in time.
>>> 
>>> I don't have any better ideas. Sorry.
>>> 
>>> [...]
>>
>> I agree, let's go ahead with this solution.
>>
>> The only remaining problem that I see is: the bug is in the common
>> code, and it will affect the fields that we add in the future.
>>
>> Can we either document this state of things in a comment, or fix the
>> bug and emulate the old behavior for certain fields?
>
> I think we can fix the bug in the common code, and compensate for the
> quirky 4-byte access to bpf_sk_lookup.remote_port and bpf_sock.dst_port
> in the is_valid_access and convert_ctx_access.
>
> With the patch as below, access to remote_port gets rewritten to:
>
> * size=1, offset=0, r2 = *(u8 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (54) w2 &= 255
>    2: (b7) r0 = 0
>    3: (95) exit
>
> * size=1, offset=1, r2 = *(u8 *)(r1 +37)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (74) w2 >>= 8
>    2: (54) w2 &= 255
>    3: (b7) r0 = 0
>    4: (95) exit
>
> * size=1, offset=2, r2 = *(u8 *)(r1 +38)
>    0: (b4) w2 = 0
>    1: (54) w2 &= 255
>    2: (b7) r0 = 0
>    3: (95) exit
>
> * size=1, offset=3, r2 = *(u8 *)(r1 +39)
>    0: (b4) w2 = 0
>    1: (74) w2 >>= 8
>    2: (54) w2 &= 255
>    3: (b7) r0 = 0
>    4: (95) exit
>
> * size=2, offset=0, r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (b7) r0 = 0
>    2: (95) exit
>
> * size=2, offset=2, r2 = *(u16 *)(r1 +38)
>    0: (b4) w2 = 0
>    1: (b7) r0 = 0
>    2: (95) exit
>
> * size=4, offset=0, r2 = *(u32 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (b7) r0 = 0
>    2: (95) exit
>
> How does that look to you?
>
> Still need to give it a test on s390x.

Context conversion with patch below applied looks correct to me on s390x
as well:

* size=1, offset=0, r2 = *(u8 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (74) w2 >>= 8
   3: (bc) w2 = w2
   4: (54) w2 &= 255
   5: (bc) w2 = w2
   6: (b7) r0 = 0
   7: (95) exit

* size=1, offset=1, r2 = *(u8 *)(r1 +37)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (54) w2 &= 255
   3: (bc) w2 = w2
   4: (b7) r0 = 0
   5: (95) exit

* size=1, offset=2, r2 = *(u8 *)(r1 +38)
   0: (b4) w2 = 0
   1: (bc) w2 = w2
   2: (74) w2 >>= 8
   3: (bc) w2 = w2
   4: (54) w2 &= 255
   5: (bc) w2 = w2
   6: (b7) r0 = 0
   7: (95) exit

* size=1, offset=3, r2 = *(u8 *)(r1 +39)
   0: (b4) w2 = 0
   1: (bc) w2 = w2
   2: (54) w2 &= 255
   3: (bc) w2 = w2
   4: (b7) r0 = 0
   5: (95) exit

* size=2, offset=0, r2 = *(u16 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

* size=2, offset=2, r2 = *(u16 *)(r1 +38)
   0: (b4) w2 = 0
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

* size=4, offset=0, r2 = *(u32 *)(r1 +36)
   0: (69) r2 = *(u16 *)(r1 +4)
   1: (bc) w2 = w2
   2: (b7) r0 = 0
   3: (95) exit

If we go this way, this should unbreak the bpf selftests on BE,
independently of patch 1 from this series.

Will send it as a patch, so that we can continue the review discussion.

>
> --8<--
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 65869fd510e8..2625a1d2dabc 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -10856,13 +10856,24 @@ static bool sk_lookup_is_valid_access(int off, int size,
>  	case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
>  	case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
>  	case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
> -	case offsetof(struct bpf_sk_lookup, remote_port) ...
> -	     offsetof(struct bpf_sk_lookup, local_ip4) - 1:
>  	case bpf_ctx_range(struct bpf_sk_lookup, local_port):
>  	case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
>  		bpf_ctx_record_field_size(info, sizeof(__u32));
>  		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
>  
> +	case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
> +		/* Allow 4-byte access to 2-byte field for backward compatibility */
> +		if (size == sizeof(__u32))
> +			return off == offsetof(struct bpf_sk_lookup, remote_port);
> +		bpf_ctx_record_field_size(info, sizeof(__be16));
> +		return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
> +
> +	case offsetofend(struct bpf_sk_lookup, remote_port) ...
> +	     offsetof(struct bpf_sk_lookup, local_ip4) - 1:
> +	/* Allow access to zero padding for backward compatibility */
> +		bpf_ctx_record_field_size(info, sizeof(__u16));
> +		return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
> +
>  	default:
>  		return false;
>  	}
> @@ -10944,6 +10955,11 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
>  						     sport, 2, target_size));
>  		break;
>  
> +	case offsetofend(struct bpf_sk_lookup, remote_port):
> +		*target_size = 2;
> +		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
> +		break;
> +
>  	case offsetof(struct bpf_sk_lookup, local_port):
>  		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
>  				      bpf_target_off(struct bpf_sk_lookup_kern,
Ilya Leoshkevich March 14, 2022, 6:25 p.m. UTC | #7
On Mon, 2022-03-14 at 18:35 +0100, Jakub Sitnicki wrote:
> On Thu, Mar 10, 2022 at 11:57 PM +01, Jakub Sitnicki wrote:
> > On Wed, Mar 09, 2022 at 01:34 PM +01, Ilya Leoshkevich wrote:
> > > On Wed, 2022-03-09 at 09:36 +0100, Jakub Sitnicki wrote:
> > 
> > [...]
> > 
> > > > 
> > > > Consider this - today the below is true on both LE and BE,
> > > > right?
> > > > 
> > > >   *(u32 *)&ctx->remote_port == *(u16 *)&ctx->remote_port
> > > > 
> > > > because the loads get converted to:
> > > > 
> > > >   *(u16 *)&ctx_kern->sport == *(u16 *)&ctx_kern->sport
> > > > 
> > > > IOW, today, because of the bug that you are fixing here, the
> > > > data
> > > > layout
> > > > changes from the PoV of the BPF program depending on the load
> > > > size.
> > > > 
> > > > With 2-byte loads, without this patch, the data layout appears
> > > > as:
> > > > 
> > > >   struct bpf_sk_lookup {
> > > >     ...
> > > >     __be16 remote_port;
> > > >     __be16 remote_port;
> > > >     ...
> > > >   }
> > > 
> > > I see, one can indeed argue that this is also a part of the ABI
> > > now.
> > > So we're stuck between a rock and a hard place.
> > > 
> > > > While for 4-byte loads, it appears as in your 2nd patch:
> > > > 
> > > >   struct bpf_sk_lookup {
> > > >     ...
> > > >     #if little-endian
> > > >     __be16 remote_port;
> > > >     __u16  :16; /* zero padding */
> > > >     #elif big-endian
> > > >     __u16  :16; /* zero padding */
> > > >     __be16 remote_port;
> > > >     #endif
> > > >     ...
> > > >   }
> > > > 
> > > > Because of that I don't see how we could keep complete ABI
> > > > compatibility,
> > > > and have just one definition of struct bpf_sk_lookup that
> > > > reflects
> > > > it. These are conflicting requirements.
> > > > 
> > > > I'd bite the bullet for 4-byte loads, for the sake of having an
> > > > endian-agnostic struct bpf_sk_lookup and struct bpf_sock
> > > > definition
> > > > in
> > > > the uAPI header.
> > > > 
> > > > The sacrifice here is that the access converter will have to
> > > > keep
> > > > rewriting 4-byte access to bpf_sk_lookup.remote_port and
> > > > bpf_sock.dst_port in this unexpected, quirky manner.
> > > > 
> > > > The expectation is that with time users will recompile their BPF
> > > > progs against the updated bpf.h, and switch to 2-byte loads. That
> > > > will make the quirk in the access converter dead code in time.
> > > > 
> > > > I don't have any better ideas. Sorry.
> > > > 
> > > > [...]
> > > 
> > > I agree, let's go ahead with this solution.
> > > 
> > > The only remaining problem that I see is: the bug is in the common
> > > code, and it will affect the fields that we add in the future.
> > > 
> > > Can we either document this state of things in a comment, or fix the
> > > bug and emulate the old behavior for certain fields?
> > 
> > I think we can fix the bug in the common code, and compensate for the
> > quirky 4-byte access to bpf_sk_lookup.remote_port and bpf_sock.dst_port
> > in the is_valid_access and convert_ctx_access.
> > 
> > With the patch as below, access to remote_port gets rewritten to:
> > 
> > * size=1, offset=0, r2 = *(u8 *)(r1 +36)
> >    0: (69) r2 = *(u16 *)(r1 +4)
> >    1: (54) w2 &= 255
> >    2: (b7) r0 = 0
> >    3: (95) exit
> > 
> > * size=1, offset=1, r2 = *(u8 *)(r1 +37)
> >    0: (69) r2 = *(u16 *)(r1 +4)
> >    1: (74) w2 >>= 8
> >    2: (54) w2 &= 255
> >    3: (b7) r0 = 0
> >    4: (95) exit
> > 
> > * size=1, offset=2, r2 = *(u8 *)(r1 +38)
> >    0: (b4) w2 = 0
> >    1: (54) w2 &= 255
> >    2: (b7) r0 = 0
> >    3: (95) exit
> > 
> > * size=1, offset=3, r2 = *(u8 *)(r1 +39)
> >    0: (b4) w2 = 0
> >    1: (74) w2 >>= 8
> >    2: (54) w2 &= 255
> >    3: (b7) r0 = 0
> >    4: (95) exit
> > 
> > * size=2, offset=0, r2 = *(u16 *)(r1 +36)
> >    0: (69) r2 = *(u16 *)(r1 +4)
> >    1: (b7) r0 = 0
> >    2: (95) exit
> > 
> > * size=2, offset=2, r2 = *(u16 *)(r1 +38)
> >    0: (b4) w2 = 0
> >    1: (b7) r0 = 0
> >    2: (95) exit
> > 
> > * size=4, offset=0, r2 = *(u32 *)(r1 +36)
> >    0: (69) r2 = *(u16 *)(r1 +4)
> >    1: (b7) r0 = 0
> >    2: (95) exit
> > 
> > How does that look to you?
> > 
> > Still need to give it a test on s390x.
> 
> Context conversion with patch below applied looks correct to me on
> s390x as well:
> 
> * size=1, offset=0, r2 = *(u8 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (74) w2 >>= 8
>    3: (bc) w2 = w2
>    4: (54) w2 &= 255
>    5: (bc) w2 = w2
>    6: (b7) r0 = 0
>    7: (95) exit
> 
> * size=1, offset=1, r2 = *(u8 *)(r1 +37)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (54) w2 &= 255
>    3: (bc) w2 = w2
>    4: (b7) r0 = 0
>    5: (95) exit
> 
> * size=1, offset=2, r2 = *(u8 *)(r1 +38)
>    0: (b4) w2 = 0
>    1: (bc) w2 = w2
>    2: (74) w2 >>= 8
>    3: (bc) w2 = w2
>    4: (54) w2 &= 255
>    5: (bc) w2 = w2
>    6: (b7) r0 = 0
>    7: (95) exit
> 
> * size=1, offset=3, r2 = *(u8 *)(r1 +39)
>    0: (b4) w2 = 0
>    1: (bc) w2 = w2
>    2: (54) w2 &= 255
>    3: (bc) w2 = w2
>    4: (b7) r0 = 0
>    5: (95) exit
> 
> * size=2, offset=0, r2 = *(u16 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> * size=2, offset=2, r2 = *(u16 *)(r1 +38)
>    0: (b4) w2 = 0
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> * size=4, offset=0, r2 = *(u32 *)(r1 +36)
>    0: (69) r2 = *(u16 *)(r1 +4)
>    1: (bc) w2 = w2
>    2: (b7) r0 = 0
>    3: (95) exit
> 
> If we go this way, this should unbreak the bpf selftests on BE,
> independently of patch 1 from this series.
> 
> Will send it as a patch, so that we continue the review discussion.

I applied this patch to bpf-next, and while it looks reasonable, the
test still fails, e.g. here:

	/* Load from remote_port field with zero padding (backward compatibility) */
	val_u32 = *(__u32 *)&ctx->remote_port;
	if (val_u32 != bpf_htonl(bpf_ntohs(SRC_PORT) << 16))
		return SK_DROP;
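
For reference, here is a back-of-the-envelope sketch of why this check
cannot pass on big-endian once the shift is emitted correctly (using
htons(8008) as a stand-in for SRC_PORT; the values are illustrative only):

	#include <stdio.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	int main(void)
	{
		uint16_t sport = htons(8008);	/* stand-in for SRC_PORT */
		/* What the fixed converter yields for the 4-byte load:
		 * a 2-byte load of sport, zero-extended. */
		uint32_t val_u32 = sport;
		/* What the test expects: */
		uint32_t expected = htonl((uint32_t)ntohs(sport) << 16);

		/* LE: val_u32 = expected = 0x481f -> passes.
		 * BE: val_u32 = 0x1f48, expected = 0x1f480000 -> SK_DROP. */
		printf("%s\n", val_u32 == expected ? "SK_PASS" : "SK_DROP");
		return 0;
	}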

> 
> > 
> > --8<--
> > 
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index 65869fd510e8..2625a1d2dabc 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -10856,13 +10856,24 @@ static bool sk_lookup_is_valid_access(int off, int size,
> >         case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
> >         case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
> >         case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
> > -       case offsetof(struct bpf_sk_lookup, remote_port) ...
> > -            offsetof(struct bpf_sk_lookup, local_ip4) - 1:
> >         case bpf_ctx_range(struct bpf_sk_lookup, local_port):
> >         case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
> >                 bpf_ctx_record_field_size(info, sizeof(__u32));
> >                 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
> >  
> > +       case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
> > +               /* Allow 4-byte access to 2-byte field for backward compatibility */
> > +               if (size == sizeof(__u32))
> > +                       return off == offsetof(struct bpf_sk_lookup, remote_port);
> > +               bpf_ctx_record_field_size(info, sizeof(__be16));
> > +               return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
> > +
> > +       case offsetofend(struct bpf_sk_lookup, remote_port) ...
> > +            offsetof(struct bpf_sk_lookup, local_ip4) - 1:
> > +               /* Allow access to zero padding for backward compatibility */
> > +               bpf_ctx_record_field_size(info, sizeof(__u16));
> > +               return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
> > +
> >         default:
> >                 return false;
> >         }
> > @@ -10944,6 +10955,11 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
> >                                                      sport, 2, target_size));
> >                 break;
> >  
> > +       case offsetofend(struct bpf_sk_lookup, remote_port):
> > +               *target_size = 2;
> > +               *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
> > +               break;
> > +
> >         case offsetof(struct bpf_sk_lookup, local_port):
> >                 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
> >                                       bpf_target_off(struct bpf_sk_lookup_kern,
>
Jakub Sitnicki March 14, 2022, 8:57 p.m. UTC | #8
On Mon, Mar 14, 2022 at 07:25 PM +01, Ilya Leoshkevich wrote:
> On Mon, 2022-03-14 at 18:35 +0100, Jakub Sitnicki wrote:
>
> [...]
>
> I applied this patch to bpf-next, and while it looks reasonable, the
> test still fails, e.g. here:
>
> 	/* Load from remote_port field with zero padding (backward compatibility) */
> 	val_u32 = *(__u32 *)&ctx->remote_port;
> 	if (val_u32 != bpf_htonl(bpf_ntohs(SRC_PORT) << 16))
> 		return SK_DROP;
>

You are right. That's the check I recently added that broke the bpf CI
runs for s390x [1] and started our thread.

I was testing a stale build of test_progs that had a fix akin to patch
[2] applied, so I missed that. Thanks for giving it a run.

If we go with Martin's suggestion [3] here and avoid bpf_htonl(), then
we could make it work and save ourselves endianness checks.

IOW, a patch like the one below would also be needed to unbreak the tests.

The first chunk is copied from your fixes to test_sk_lookup in patch 3
of this RFC series, naturally.
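
A nice property of comparing against SRC_PORT directly, as in the chunk
below, is that both sides of the comparison are read back through the
native byte order, so the check is endianness-agnostic. A quick
userspace sketch of the idea (hypothetical values; 8008 stands in for
the selftest's source port):

	#include <assert.h>
	#include <stdint.h>
	#include <arpa/inet.h>

	int main(void)
	{
		uint16_t sport = htons(8008);	/* kernel-side __be16 field */
		uint32_t val_u32 = sport;	/* converted 4-byte load: sport, zero-extended */

		/* Same constant, same native interpretation, on LE and BE
		 * alike; no bpf_htonl()/bpf_ntohs() round-trip needed. */
		assert(val_u32 == (uint32_t)htons(8008));
		return 0;
	}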

[1] https://lore.kernel.org/bpf/CAEf4BzaRNLw9_EnaMo5e46CdEkzbJiVU3j9oxnsemBKjNFf3wQ@mail.gmail.com/
[2] https://lore.kernel.org/bpf/20220227202757.519015-4-jakub@cloudflare.com/
[3] https://lore.kernel.org/bpf/20220301062207.d3aqge5qg623asr6@kafai-mbp.dhcp.thefacebook.com/

---8<---

diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
index bf5b7caefdd0..2765a4bd500c 100644
--- a/tools/testing/selftests/bpf/progs/test_sk_lookup.c
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
@@ -413,15 +413,14 @@ int ctx_narrow_access(struct bpf_sk_lookup *ctx)

        /* Narrow loads from remote_port field. Expect SRC_PORT. */
        if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) ||
-           LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) ||
-           LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0)
+           LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff))
                return SK_DROP;
        if (LSW(ctx->remote_port, 0) != SRC_PORT)
                return SK_DROP;

        /* Load from remote_port field with zero padding (backward compatibility) */
        val_u32 = *(__u32 *)&ctx->remote_port;
-       if (val_u32 != bpf_htonl(bpf_ntohs(SRC_PORT) << 16))
+       if (val_u32 != SRC_PORT)
                return SK_DROP;

        /* Narrow loads from local_port field. Expect DST_PORT. */

Patch

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d7473fee247c..195f2e9b5a47 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -12848,7 +12848,7 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (is_narrower_load && size < target_size) {
+		if (is_narrower_load) {
 			u8 shift = bpf_ctx_narrow_access_offset(
 				off, size, size_default) * 8;
 			if (shift && cnt + 1 >= ARRAY_SIZE(insn_buf)) {
@@ -12860,15 +12860,19 @@  static int convert_ctx_accesses(struct bpf_verifier_env *env)
 					insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
 									insn->dst_reg,
 									shift);
-				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
-								(1 << size * 8) - 1);
+				if (size < target_size)
+					insn_buf[cnt++] = BPF_ALU32_IMM(
+						BPF_AND, insn->dst_reg,
+						(1 << size * 8) - 1);
 			} else {
 				if (shift)
 					insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH,
 									insn->dst_reg,
 									shift);
-				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
-								(1ULL << size * 8) - 1);
+				if (size < target_size)
+					insn_buf[cnt++] = BPF_ALU64_IMM(
+						BPF_AND, insn->dst_reg,
+						(1ULL << size * 8) - 1);
 			}
 		}
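
For illustration, the case the old guard got wrong is a load narrower
than the 4-byte ctx slot but exactly as wide as the target field. A
small userspace sketch, mirroring bpf_ctx_narrow_access_offset() from
include/linux/filter.h (little-endian branch shown; reproduced here only
to make the shift arithmetic concrete):

	#include <stdint.h>
	#include <stdio.h>

	/* mirrors bpf_ctx_narrow_access_offset(), LE flavor */
	static uint8_t narrow_off(uint32_t off, uint32_t size, uint32_t size_default)
	{
		uint8_t access_off = off & (size_default - 1);

		return access_off;	/* BE: size_default - (access_off + size) */
	}

	int main(void)
	{
		/* 2-byte load of the upper half of the 4-byte remote_port
		 * slot (ctx offset 38): size == target_size == 2, so the
		 * old code skipped the 16-bit shift and the program read
		 * sport instead of the zero padding. */
		printf("shift = %u bits\n", narrow_off(38, 2, 4) * 8);
		return 0;
	}

Note that on big-endian the computed shift for this case is 0, which is
why zeroing the padding also needs the explicit BPF_MOV32_IMM(0) case in
sk_lookup_convert_ctx_access() above.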