diff mbox series

[bpf,v4] bpf: verifier: prevent userspace memory access

Message ID 20240321124640.8870-1-puranjay12@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series [bpf,v4] bpf: verifier: prevent userspace memory access | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 2305 this patch: 2305
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 22 of 22 maintainers
netdev/build_clang success Errors and warnings before: 985 this patch: 985
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 2380 this patch: 2380
netdev/checkpatch warning CHECK: multiple assignments should be avoided WARNING: line length of 100 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR fail PR summary
bpf/vmtest-bpf-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc

Commit Message

Puranjay Mohan March 21, 2024, 12:46 p.m. UTC
With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
thwart invalid memory accesses, the JITs add an exception table entry
for all such accesses. But in case the src_reg + offset overflows and
turns into a userspace address, the BPF program might read that memory if
the user has mapped it.

There are architectural features that prevent the kernel from accessing
userspace memory, like Privileged Access Never (PAN) on ARM64,
Supervisor Mode Access Prevention (SMAP) on x86-64, Supervisor User
Memory access (SUM) on RISC-V, etc. But BPF should not rely on the
existence of these features.

Make the verifier add guard instructions around such memory accesses and
skip the load if the address falls into the userspace region.

The JITs need to implement bpf_arch_uaddress_limit() to define where
the userspace addresses end for that architecture or TASK_SIZE is taken
as default.

The implementation is as follows:

REG_AX =  SRC_REG
if(offset)
	REG_AX += offset;
REG_AX >>= 32;
if (REG_AX <= (uaddress_limit >> 32))
	DST_REG = 0;
else
	DST_REG = *(size *)(SRC_REG + offset);

Comparing just the upper 32 bits of the load address with the upper
32 bits of uaddress_limit implies that the values are being aligned down
to a 4GB boundary before comparison.

The above means that all loads with address <= uaddress_limit + 4GB are
skipped. This is acceptable because there is a large hole (much larger
than 4GB) between userspace and kernel space memory, therefore a
correctly functioning BPF program should not access this 4GB memory
above the userspace.

Let's analyze what this patch does to the following fentry program
dereferencing an untrusted pointer:

  SEC("fentry/tcp_v4_connect")
  int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
  {
                *(volatile long *)sk;
                return 0;
  }

    BPF Program before              |           BPF Program after
    ------------------              |           -----------------

  0: (79) r1 = *(u64 *)(r1 +0)          0: (79) r1 = *(u64 *)(r1 +0)
  -----------------------------------------------------------------------
  1: (79) r1 = *(u64 *)(r1 +0) --\      1: (bf) r11 = r1
  ----------------------------\   \     2: (77) r11 >>= 32
  2: (b7) r0 = 0               \   \    3: (b5) if r11 <= 0x8000 goto pc+2
  3: (95) exit                  \   \-> 4: (79) r1 = *(u64 *)(r1 +0)
                                 \      5: (05) goto pc+1
                                  \     6: (b7) r1 = 0
                                   \--------------------------------------
                                        7: (b7) r0 = 0
                                        8: (95) exit

As you can see from above, in the best case (off=0), 5 extra instructions
are emitted.

Now, we analyse the same program after it has gone through the JITs of
X86-64, ARM64, and RISC-V architectures. We follow the single load
instruction that has the untrusted pointer and see what instrumentation
has been added around it.

                                x86-64 JIT
                                ==========
     JIT's Instrumentation                  Verifier's Instrumentation
          (upstream)                               (This patch)
     ---------------------                  --------------------------

   0:   nopl   0x0(%rax,%rax,1)             0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax                      5:   xchg   %ax,%ax
   7:   push   %rbp                         7:   push   %rbp
   8:   mov    %rsp,%rbp                    8:   mov    %rsp,%rbp
   b:   mov    0x0(%rdi),%rdi               b:   mov    0x0(%rdi),%rdi
  ------------------------------------------------------------------------
   f:   movabs $0x800000000000,%r11         f:   mov    %rdi,%r10
  19:   cmp    %r11,%rdi                   12:   shr    $0x20,%r10
  1c:   jb     0x000000000000002a          16:   cmp    $0x8000,%r10
  1e:   mov    %rdi,%r11                   1d:   jbe    0x0000000000000025
  21:   add    $0x0,%r11              /--> 1f:   mov    0x0(%rdi),%rdi
  28:   jae    0x000000000000002e    /     23:   jmp    0x0000000000000027
  2a:   xor    %edi,%edi            /      25:   xor    %edi,%edi
  2c:   jmp    0x0000000000000032  / /------------------------------------
  2e:   mov    0x0(%rdi),%rdi  ---/ /      27:   xor    %eax,%eax
  ---------------------------------/       29:   leave
  32:   xor    %eax,%eax                   2a:   ret
  34:   leave
  35:   ret

The x86-64 JIT already emits some instructions to protect against user
memory access. The implementation in this patch leads to a smaller
number of instructions being emitted. In the worst case the JIT will
emit 9 extra instructions and this patch decreases it to 7.

                                  ARM64 JIT
                                  =========

        No Intrumentation                       Verifier's Instrumentation
           (upstream)                                  (This patch)
        -----------------                       --------------------------

   0:   add     x9, x30, #0x0                0:   add     x9, x30, #0x0
   4:   nop                                  4:   nop
   8:   paciasp                              8:   paciasp
   c:   stp     x29, x30, [sp, #-16]!        c:   stp     x29, x30, [sp, #-16]!
  10:   mov     x29, sp                     10:   mov     x29, sp
  14:   stp     x19, x20, [sp, #-16]!       14:   stp     x19, x20, [sp, #-16]!
  18:   stp     x21, x22, [sp, #-16]!       18:   stp     x21, x22, [sp, #-16]!
  1c:   stp     x25, x26, [sp, #-16]!       1c:   stp     x25, x26, [sp, #-16]!
  20:   stp     x27, x28, [sp, #-16]!       20:   stp     x27, x28, [sp, #-16]!
  24:   mov     x25, sp                     24:   mov     x25, sp
  28:   mov     x26, #0x0                   28:   mov     x26, #0x0
  2c:   sub     x27, x25, #0x0              2c:   sub     x27, x25, #0x0
  30:   sub     sp, sp, #0x0                30:   sub     sp, sp, #0x0
  34:   ldr     x0, [x0]                    34:   ldr     x0, [x0]
--------------------------------------------------------------------------------
  38:   ldr     x0, [x0] ----------\        38:   add     x9, x0, #0x0
-----------------------------------\\       3c:   lsr     x9, x9, #32
  3c:   mov     x7, #0x0            \\      40:   cmp     x9, #0x10, lsl #12
  40:   mov     sp, sp               \\     44:   b.ls    0x0000000000000050
  44:   ldp     x27, x28, [sp], #16   \\--> 48:   ldr     x0, [x0]
  48:   ldp     x25, x26, [sp], #16    \    4c:   b       0x0000000000000054
  4c:   ldp     x21, x22, [sp], #16     \   50:   mov     x0, #0x0
  50:   ldp     x19, x20, [sp], #16      \---------------------------------------
  54:   ldp     x29, x30, [sp], #16         54:   mov     x7, #0x0
  58:   add     x0, x7, #0x0                58:   mov     sp, sp
  5c:   autiasp                             5c:   ldp     x27, x28, [sp], #16
  60:   ret                                 60:   ldp     x25, x26, [sp], #16
  64:   nop                                 64:   ldp     x21, x22, [sp], #16
  68:   ldr     x10, 0x0000000000000070     68:   ldp     x19, x20, [sp], #16
  6c:   br      x10                         6c:   ldp     x29, x30, [sp], #16
                                            70:   add     x0, x7, #0x0
                                            74:   autiasp
                                            78:   ret
                                            7c:   nop
                                            80:   ldr     x10, 0x0000000000000088
                                            84:   br      x10

There are 6 extra instructions added in ARM64 in the best case. This will
become 7 in the worst case (off != 0).

                           RISC-V JIT (RISCV_ISA_C Disabled)
                           ==========

        No Intrumentation           Verifier's Instrumentation
           (upstream)                      (This patch)
        -----------------           --------------------------

   0:   nop                            0:   nop
   4:   nop                            4:   nop
   8:   li      a6, 33                 8:   li      a6, 33
   c:   addi    sp, sp, -16            c:   addi    sp, sp, -16
  10:   sd      s0, 8(sp)             10:   sd      s0, 8(sp)
  14:   addi    s0, sp, 16            14:   addi    s0, sp, 16
  18:   ld      a0, 0(a0)             18:   ld      a0, 0(a0)
---------------------------------------------------------------
  1c:   ld      a0, 0(a0) --\         1c:   mv      t0, a0
--------------------------\  \        20:   srli    t0, t0, 32
  20:   li      a5, 0      \  \       24:   lui     t1, 4096
  24:   ld      s0, 8(sp)   \  \      28:   sext.w  t1, t1
  28:   addi    sp, sp, 16   \  \     2c:   bgeu    t1, t0, 12
  2c:   sext.w  a0, a5        \  \--> 30:   ld      a0, 0(a0)
  30:   ret                    \      34:   j       8
                                \     38:   li      a0, 0
                                 \------------------------------
                                      3c:   li      a5, 0
                                      40:   ld      s0, 8(sp)
                                      44:   addi    sp, sp, 16
                                      48:   sext.w  a0, a5
                                      4c:   ret

There are 7 extra instructions added in RISC-V.

Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
Reported-by: Breno Leitao <leitao@debian.org>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
---
V3: https://lore.kernel.org/bpf/20240321120842.78983-1-puranjay12@gmail.com/
Changes in V4:
- Disable this feature on architectures that don't define
  CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.
- By doing the above, we don't need anything explicitly for s390x.

V2: https://lore.kernel.org/bpf/20240321101058.68530-1-puranjay12@gmail.com/
Changes in V3:
- Return 0 from bpf_arch_uaddress_limit() in disabled case because it
  returns u64.
- Modify the check in verifier to no do instrumentation when uaddress_limit
  is 0.

V1: https://lore.kernel.org/bpf/20240320105436.4781-1-puranjay12@gmail.com/
Changes in V2:
- Disable this feature on s390x.
---
 arch/x86/net/bpf_jit_comp.c | 72 +++++--------------------------------
 include/linux/filter.h      |  1 +
 kernel/bpf/core.c           |  9 +++++
 kernel/bpf/verifier.c       | 30 ++++++++++++++++
 4 files changed, 48 insertions(+), 64 deletions(-)

Comments

Ilya Leoshkevich March 22, 2024, 2:24 a.m. UTC | #1
On Thu, 2024-03-21 at 12:46 +0000, Puranjay Mohan wrote:
> With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer.
> To
> thwart invalid memory accesses, the JITs add an exception table entry
> for all such accesses. But in case the src_reg + offset overflows and
> turns into a userspace address, the BPF program might read that
> memory if
> the user has mapped it.
> 
> There are architectural features that prevent the kernel from
> accessing
> userspace memory, like Privileged Access Never (PAN) on ARM64,
> Supervisor Mode Access Prevention (SMAP) on x86-64, Supervisor User
> Memory access (SUM) on RISC-V, etc. But BPF should not rely on the
> existence of these features.
> 
> Make the verifier add guard instructions around such memory accesses
> and
> skip the load if the address falls into the userspace region.
> 
> The JITs need to implement bpf_arch_uaddress_limit() to define where
> the userspace addresses end for that architecture or TASK_SIZE is
> taken
> as default.
> 
> The implementation is as follows:
> 
> REG_AX =  SRC_REG
> if(offset)
> 	REG_AX += offset;
> REG_AX >>= 32;
> if (REG_AX <= (uaddress_limit >> 32))
> 	DST_REG = 0;
> else
> 	DST_REG = *(size *)(SRC_REG + offset);
> 
> Comparing just the upper 32 bits of the load address with the upper
> 32 bits of uaddress_limit implies that the values are being aligned
> down
> to a 4GB boundary before comparison.
> 
> The above means that all loads with address <= uaddress_limit + 4GB
> are
> skipped. This is acceptable because there is a large hole (much
> larger
> than 4GB) between userspace and kernel space memory, therefore a
> correctly functioning BPF program should not access this 4GB memory
> above the userspace.
> 
> Let's analyze what this patch does to the following fentry program
> dereferencing an untrusted pointer:
> 
>   SEC("fentry/tcp_v4_connect")
>   int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
>   {
>                 *(volatile long *)sk;
>                 return 0;
>   }
> 
>     BPF Program before              |           BPF Program after
>     ------------------              |           -----------------
> 
>   0: (79) r1 = *(u64 *)(r1 +0)          0: (79) r1 = *(u64 *)(r1 +0)
>   -------------------------------------------------------------------
> ----
>   1: (79) r1 = *(u64 *)(r1 +0) --\      1: (bf) r11 = r1
>   ----------------------------\   \     2: (77) r11 >>= 32
>   2: (b7) r0 = 0               \   \    3: (b5) if r11 <= 0x8000 goto
> pc+2
>   3: (95) exit                  \   \-> 4: (79) r1 = *(u64 *)(r1 +0)
>                                  \      5: (05) goto pc+1
>                                   \     6: (b7) r1 = 0
>                                    \---------------------------------
> -----
>                                         7: (b7) r0 = 0
>                                         8: (95) exit
> 
> As you can see from above, in the best case (off=0), 5 extra
> instructions
> are emitted.
> 
> Now, we analyse the same program after it has gone through the JITs
> of
> X86-64, ARM64, and RISC-V architectures. We follow the single load
> instruction that has the untrusted pointer and see what
> instrumentation
> has been added around it.
> 
>                                 x86-64 JIT
>                                 ==========
>      JIT's Instrumentation                  Verifier's
> Instrumentation
>           (upstream)                               (This patch)
>      ---------------------                  -------------------------
> -
> 
>    0:   nopl   0x0(%rax,%rax,1)             0:   nopl  
> 0x0(%rax,%rax,1)
>    5:   xchg   %ax,%ax                      5:   xchg   %ax,%ax
>    7:   push   %rbp                         7:   push   %rbp
>    8:   mov    %rsp,%rbp                    8:   mov    %rsp,%rbp
>    b:   mov    0x0(%rdi),%rdi               b:   mov   
> 0x0(%rdi),%rdi
>   -------------------------------------------------------------------
> -----
>    f:   movabs $0x800000000000,%r11         f:   mov    %rdi,%r10
>   19:   cmp    %r11,%rdi                   12:   shr    $0x20,%r10
>   1c:   jb     0x000000000000002a          16:   cmp    $0x8000,%r10
>   1e:   mov    %rdi,%r11                   1d:   jbe   
> 0x0000000000000025
>   21:   add    $0x0,%r11              /--> 1f:   mov   
> 0x0(%rdi),%rdi
>   28:   jae    0x000000000000002e    /     23:   jmp   
> 0x0000000000000027
>   2a:   xor    %edi,%edi            /      25:   xor    %edi,%edi
>   2c:   jmp    0x0000000000000032  / /-------------------------------
> -----
>   2e:   mov    0x0(%rdi),%rdi  ---/ /      27:   xor    %eax,%eax
>   ---------------------------------/       29:   leave
>   32:   xor    %eax,%eax                   2a:   ret
>   34:   leave
>   35:   ret
> 
> The x86-64 JIT already emits some instructions to protect against
> user
> memory access. The implementation in this patch leads to a smaller
> number of instructions being emitted. In the worst case the JIT will
> emit 9 extra instructions and this patch decreases it to 7.
> 
>                                   ARM64 JIT
>                                   =========
> 
>         No Intrumentation                       Verifier's
> Instrumentation
>            (upstream)                                  (This patch)
>         -----------------                       ---------------------
> -----
> 
>    0:   add     x9, x30, #0x0                0:   add     x9, x30,
> #0x0
>    4:   nop                                  4:   nop
>    8:   paciasp                              8:   paciasp
>    c:   stp     x29, x30, [sp, #-16]!        c:   stp     x29, x30,
> [sp, #-16]!
>   10:   mov     x29, sp                     10:   mov     x29, sp
>   14:   stp     x19, x20, [sp, #-16]!       14:   stp     x19, x20,
> [sp, #-16]!
>   18:   stp     x21, x22, [sp, #-16]!       18:   stp     x21, x22,
> [sp, #-16]!
>   1c:   stp     x25, x26, [sp, #-16]!       1c:   stp     x25, x26,
> [sp, #-16]!
>   20:   stp     x27, x28, [sp, #-16]!       20:   stp     x27, x28,
> [sp, #-16]!
>   24:   mov     x25, sp                     24:   mov     x25, sp
>   28:   mov     x26, #0x0                   28:   mov     x26, #0x0
>   2c:   sub     x27, x25, #0x0              2c:   sub     x27, x25,
> #0x0
>   30:   sub     sp, sp, #0x0                30:   sub     sp, sp,
> #0x0
>   34:   ldr     x0, [x0]                    34:   ldr     x0, [x0]
> ---------------------------------------------------------------------
> -----------
>   38:   ldr     x0, [x0] ----------\        38:   add     x9, x0,
> #0x0
> -----------------------------------\\       3c:   lsr     x9, x9, #32
>   3c:   mov     x7, #0x0            \\      40:   cmp     x9, #0x10,
> lsl #12
>   40:   mov     sp, sp               \\     44:   b.ls   
> 0x0000000000000050
>   44:   ldp     x27, x28, [sp], #16   \\--> 48:   ldr     x0, [x0]
>   48:   ldp     x25, x26, [sp], #16    \    4c:   b      
> 0x0000000000000054
>   4c:   ldp     x21, x22, [sp], #16     \   50:   mov     x0, #0x0
>   50:   ldp     x19, x20, [sp], #16      \---------------------------
> ------------
>   54:   ldp     x29, x30, [sp], #16         54:   mov     x7, #0x0
>   58:   add     x0, x7, #0x0                58:   mov     sp, sp
>   5c:   autiasp                             5c:   ldp     x27, x28,
> [sp], #16
>   60:   ret                                 60:   ldp     x25, x26,
> [sp], #16
>   64:   nop                                 64:   ldp     x21, x22,
> [sp], #16
>   68:   ldr     x10, 0x0000000000000070     68:   ldp     x19, x20,
> [sp], #16
>   6c:   br      x10                         6c:   ldp     x29, x30,
> [sp], #16
>                                             70:   add     x0, x7,
> #0x0
>                                             74:   autiasp
>                                             78:   ret
>                                             7c:   nop
>                                             80:   ldr     x10,
> 0x0000000000000088
>                                             84:   br      x10
> 
> There are 6 extra instructions added in ARM64 in the best case. This
> will
> become 7 in the worst case (off != 0).
> 
>                            RISC-V JIT (RISCV_ISA_C Disabled)
>                            ==========
> 
>         No Intrumentation           Verifier's Instrumentation
>            (upstream)                      (This patch)
>         -----------------           --------------------------
> 
>    0:   nop                            0:   nop
>    4:   nop                            4:   nop
>    8:   li      a6, 33                 8:   li      a6, 33
>    c:   addi    sp, sp, -16            c:   addi    sp, sp, -16
>   10:   sd      s0, 8(sp)             10:   sd      s0, 8(sp)
>   14:   addi    s0, sp, 16            14:   addi    s0, sp, 16
>   18:   ld      a0, 0(a0)             18:   ld      a0, 0(a0)
> ---------------------------------------------------------------
>   1c:   ld      a0, 0(a0) --\         1c:   mv      t0, a0
> --------------------------\  \        20:   srli    t0, t0, 32
>   20:   li      a5, 0      \  \       24:   lui     t1, 4096
>   24:   ld      s0, 8(sp)   \  \      28:   sext.w  t1, t1
>   28:   addi    sp, sp, 16   \  \     2c:   bgeu    t1, t0, 12
>   2c:   sext.w  a0, a5        \  \--> 30:   ld      a0, 0(a0)
>   30:   ret                    \      34:   j       8
>                                 \     38:   li      a0, 0
>                                  \------------------------------
>                                       3c:   li      a5, 0
>                                       40:   ld      s0, 8(sp)
>                                       44:   addi    sp, sp, 16
>                                       48:   sext.w  a0, a5
>                                       4c:   ret
> 
> There are 7 extra instructions added in RISC-V.
> 
> Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
> Reported-by: Breno Leitao <leitao@debian.org>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> ---
> V3:
> https://lore.kernel.org/bpf/20240321120842.78983-1-puranjay12@gmail.com/
> Changes in V4:
> - Disable this feature on architectures that don't define
>   CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.
> - By doing the above, we don't need anything explicitly for s390x.
> 
> V2:
> https://lore.kernel.org/bpf/20240321101058.68530-1-puranjay12@gmail.com/
> Changes in V3:
> - Return 0 from bpf_arch_uaddress_limit() in disabled case because it
>   returns u64.
> - Modify the check in verifier to no do instrumentation when
> uaddress_limit
>   is 0.
> 
> V1:
> https://lore.kernel.org/bpf/20240320105436.4781-1-puranjay12@gmail.com/
> Changes in V2:
> - Disable this feature on s390x.
> ---
>  arch/x86/net/bpf_jit_comp.c | 72 +++++------------------------------
> --
>  include/linux/filter.h      |  1 +
>  kernel/bpf/core.c           |  9 +++++
>  kernel/bpf/verifier.c       | 30 ++++++++++++++++
>  4 files changed, 48 insertions(+), 64 deletions(-)

Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Daniel Borkmann March 22, 2024, 2:46 p.m. UTC | #2
On 3/21/24 1:46 PM, Puranjay Mohan wrote:
> With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
> thwart invalid memory accesses, the JITs add an exception table entry
> for all such accesses. But in case the src_reg + offset overflows and
> turns into a userspace address, the BPF program might read that memory if
> the user has mapped it.
> 
> There are architectural features that prevent the kernel from accessing
> userspace memory, like Privileged Access Never (PAN) on ARM64,
> Supervisor Mode Access Prevention (SMAP) on x86-64, Supervisor User
> Memory access (SUM) on RISC-V, etc. But BPF should not rely on the
> existence of these features.
> 
> Make the verifier add guard instructions around such memory accesses and
> skip the load if the address falls into the userspace region.
> 
> The JITs need to implement bpf_arch_uaddress_limit() to define where
> the userspace addresses end for that architecture or TASK_SIZE is taken
> as default.
> 
> The implementation is as follows:
> 
> REG_AX =  SRC_REG
> if(offset)
> 	REG_AX += offset;
> REG_AX >>= 32;
> if (REG_AX <= (uaddress_limit >> 32))
> 	DST_REG = 0;
> else
> 	DST_REG = *(size *)(SRC_REG + offset);
> 
> Comparing just the upper 32 bits of the load address with the upper
> 32 bits of uaddress_limit implies that the values are being aligned down
> to a 4GB boundary before comparison.
> 
> The above means that all loads with address <= uaddress_limit + 4GB are
> skipped. This is acceptable because there is a large hole (much larger
> than 4GB) between userspace and kernel space memory, therefore a
> correctly functioning BPF program should not access this 4GB memory
> above the userspace.
> 
> Let's analyze what this patch does to the following fentry program
> dereferencing an untrusted pointer:
> 
>    SEC("fentry/tcp_v4_connect")
>    int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
>    {
>                  *(volatile long *)sk;
>                  return 0;
>    }
> 
>      BPF Program before              |           BPF Program after
>      ------------------              |           -----------------
> 
>    0: (79) r1 = *(u64 *)(r1 +0)          0: (79) r1 = *(u64 *)(r1 +0)
>    -----------------------------------------------------------------------
>    1: (79) r1 = *(u64 *)(r1 +0) --\      1: (bf) r11 = r1
>    ----------------------------\   \     2: (77) r11 >>= 32
>    2: (b7) r0 = 0               \   \    3: (b5) if r11 <= 0x8000 goto pc+2
>    3: (95) exit                  \   \-> 4: (79) r1 = *(u64 *)(r1 +0)
>                                   \      5: (05) goto pc+1
>                                    \     6: (b7) r1 = 0
>                                     \--------------------------------------
>                                          7: (b7) r0 = 0
>                                          8: (95) exit
> 
> As you can see from above, in the best case (off=0), 5 extra instructions
> are emitted.
> 
> Now, we analyse the same program after it has gone through the JITs of
> X86-64, ARM64, and RISC-V architectures. We follow the single load
> instruction that has the untrusted pointer and see what instrumentation
> has been added around it.
> 
>                                  x86-64 JIT
>                                  ==========
>       JIT's Instrumentation                  Verifier's Instrumentation
>            (upstream)                               (This patch)
>       ---------------------                  --------------------------
> 
>     0:   nopl   0x0(%rax,%rax,1)             0:   nopl   0x0(%rax,%rax,1)
>     5:   xchg   %ax,%ax                      5:   xchg   %ax,%ax
>     7:   push   %rbp                         7:   push   %rbp
>     8:   mov    %rsp,%rbp                    8:   mov    %rsp,%rbp
>     b:   mov    0x0(%rdi),%rdi               b:   mov    0x0(%rdi),%rdi
>    ------------------------------------------------------------------------
>     f:   movabs $0x800000000000,%r11         f:   mov    %rdi,%r10
>    19:   cmp    %r11,%rdi                   12:   shr    $0x20,%r10
>    1c:   jb     0x000000000000002a          16:   cmp    $0x8000,%r10
>    1e:   mov    %rdi,%r11                   1d:   jbe    0x0000000000000025
>    21:   add    $0x0,%r11              /--> 1f:   mov    0x0(%rdi),%rdi
>    28:   jae    0x000000000000002e    /     23:   jmp    0x0000000000000027
>    2a:   xor    %edi,%edi            /      25:   xor    %edi,%edi
>    2c:   jmp    0x0000000000000032  / /------------------------------------
>    2e:   mov    0x0(%rdi),%rdi  ---/ /      27:   xor    %eax,%eax
>    ---------------------------------/       29:   leave
>    32:   xor    %eax,%eax                   2a:   ret
>    34:   leave
>    35:   ret
> 
> The x86-64 JIT already emits some instructions to protect against user
> memory access. The implementation in this patch leads to a smaller
> number of instructions being emitted. In the worst case the JIT will
> emit 9 extra instructions and this patch decreases it to 7.
> 
>                                    ARM64 JIT
>                                    =========
> 
>          No Intrumentation                       Verifier's Instrumentation
>             (upstream)                                  (This patch)
>          -----------------                       --------------------------
> 
>     0:   add     x9, x30, #0x0                0:   add     x9, x30, #0x0
>     4:   nop                                  4:   nop
>     8:   paciasp                              8:   paciasp
>     c:   stp     x29, x30, [sp, #-16]!        c:   stp     x29, x30, [sp, #-16]!
>    10:   mov     x29, sp                     10:   mov     x29, sp
>    14:   stp     x19, x20, [sp, #-16]!       14:   stp     x19, x20, [sp, #-16]!
>    18:   stp     x21, x22, [sp, #-16]!       18:   stp     x21, x22, [sp, #-16]!
>    1c:   stp     x25, x26, [sp, #-16]!       1c:   stp     x25, x26, [sp, #-16]!
>    20:   stp     x27, x28, [sp, #-16]!       20:   stp     x27, x28, [sp, #-16]!
>    24:   mov     x25, sp                     24:   mov     x25, sp
>    28:   mov     x26, #0x0                   28:   mov     x26, #0x0
>    2c:   sub     x27, x25, #0x0              2c:   sub     x27, x25, #0x0
>    30:   sub     sp, sp, #0x0                30:   sub     sp, sp, #0x0
>    34:   ldr     x0, [x0]                    34:   ldr     x0, [x0]
> --------------------------------------------------------------------------------
>    38:   ldr     x0, [x0] ----------\        38:   add     x9, x0, #0x0
> -----------------------------------\\       3c:   lsr     x9, x9, #32
>    3c:   mov     x7, #0x0            \\      40:   cmp     x9, #0x10, lsl #12
>    40:   mov     sp, sp               \\     44:   b.ls    0x0000000000000050
>    44:   ldp     x27, x28, [sp], #16   \\--> 48:   ldr     x0, [x0]
>    48:   ldp     x25, x26, [sp], #16    \    4c:   b       0x0000000000000054
>    4c:   ldp     x21, x22, [sp], #16     \   50:   mov     x0, #0x0
>    50:   ldp     x19, x20, [sp], #16      \---------------------------------------
>    54:   ldp     x29, x30, [sp], #16         54:   mov     x7, #0x0
>    58:   add     x0, x7, #0x0                58:   mov     sp, sp
>    5c:   autiasp                             5c:   ldp     x27, x28, [sp], #16
>    60:   ret                                 60:   ldp     x25, x26, [sp], #16
>    64:   nop                                 64:   ldp     x21, x22, [sp], #16
>    68:   ldr     x10, 0x0000000000000070     68:   ldp     x19, x20, [sp], #16
>    6c:   br      x10                         6c:   ldp     x29, x30, [sp], #16
>                                              70:   add     x0, x7, #0x0
>                                              74:   autiasp
>                                              78:   ret
>                                              7c:   nop
>                                              80:   ldr     x10, 0x0000000000000088
>                                              84:   br      x10
> 
> There are 6 extra instructions added in ARM64 in the best case. This will
> become 7 in the worst case (off != 0).
> 
>                             RISC-V JIT (RISCV_ISA_C Disabled)
>                             ==========
> 
>          No Intrumentation           Verifier's Instrumentation
>             (upstream)                      (This patch)
>          -----------------           --------------------------
> 
>     0:   nop                            0:   nop
>     4:   nop                            4:   nop
>     8:   li      a6, 33                 8:   li      a6, 33
>     c:   addi    sp, sp, -16            c:   addi    sp, sp, -16
>    10:   sd      s0, 8(sp)             10:   sd      s0, 8(sp)
>    14:   addi    s0, sp, 16            14:   addi    s0, sp, 16
>    18:   ld      a0, 0(a0)             18:   ld      a0, 0(a0)
> ---------------------------------------------------------------
>    1c:   ld      a0, 0(a0) --\         1c:   mv      t0, a0
> --------------------------\  \        20:   srli    t0, t0, 32
>    20:   li      a5, 0      \  \       24:   lui     t1, 4096
>    24:   ld      s0, 8(sp)   \  \      28:   sext.w  t1, t1
>    28:   addi    sp, sp, 16   \  \     2c:   bgeu    t1, t0, 12
>    2c:   sext.w  a0, a5        \  \--> 30:   ld      a0, 0(a0)
>    30:   ret                    \      34:   j       8
>                                  \     38:   li      a0, 0
>                                   \------------------------------
>                                        3c:   li      a5, 0
>                                        40:   ld      s0, 8(sp)
>                                        44:   addi    sp, sp, 16
>                                        48:   sext.w  a0, a5
>                                        4c:   ret
> 
> There are 7 extra instructions added in RISC-V.
> 
> Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
> Reported-by: Breno Leitao <leitao@debian.org>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
> ---
> V3: https://lore.kernel.org/bpf/20240321120842.78983-1-puranjay12@gmail.com/
> Changes in V4:
> - Disable this feature on architectures that don't define
>    CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.
> - By doing the above, we don't need anything explicitly for s390x.
> 
> V2: https://lore.kernel.org/bpf/20240321101058.68530-1-puranjay12@gmail.com/
> Changes in V3:
> - Return 0 from bpf_arch_uaddress_limit() in disabled case because it
>    returns u64.
> - Modify the check in verifier to no do instrumentation when uaddress_limit
>    is 0.
> 
> V1: https://lore.kernel.org/bpf/20240320105436.4781-1-puranjay12@gmail.com/
> Changes in V2:
> - Disable this feature on s390x.
> ---
>   arch/x86/net/bpf_jit_comp.c | 72 +++++--------------------------------
>   include/linux/filter.h      |  1 +
>   kernel/bpf/core.c           |  9 +++++
>   kernel/bpf/verifier.c       | 30 ++++++++++++++++
>   4 files changed, 48 insertions(+), 64 deletions(-)
> 
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 4900b1ee019f..9b3136187938 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -1327,7 +1327,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>   		u8 b2 = 0, b3 = 0;
>   		u8 *start_of_ldx;
>   		s64 jmp_offset;
> -		s16 insn_off;
>   		u8 jmp_cond;
>   		u8 *func;
>   		int nops;
> @@ -1802,78 +1801,18 @@ st:			if (is_imm8(insn->off))
>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
> -			insn_off = insn->off;
> -
> -			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
> -			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
> -				/* Conservatively check that src_reg + insn->off is a kernel address:
> -				 *   src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
> -				 * src_reg is used as scratch for src_reg += insn->off and restored
> -				 * after emit_ldx if necessary
> -				 */
> -
> -				u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
> -				u8 *end_of_jmp;
> -
> -				/* At end of these emitted checks, insn->off will have been added
> -				 * to src_reg, so no need to do relative load with insn->off offset
> -				 */
> -				insn_off = 0;
> -
> -				/* movabsq r11, limit */
> -				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
> -				EMIT((u32)limit, 4);
> -				EMIT(limit >> 32, 4);
> -
> -				if (insn->off) {
> -					/* add src_reg, insn->off */
> -					maybe_emit_1mod(&prog, src_reg, true);
> -					EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
> -				}
> -
> -				/* cmp src_reg, r11 */
> -				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
> -				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
> -
> -				/* if unsigned '>=', goto load */
> -				EMIT2(X86_JAE, 0);
> -				end_of_jmp = prog;
> -
> -				/* xor dst_reg, dst_reg */
> -				emit_mov_imm32(&prog, false, dst_reg, 0);
> -				/* jmp byte_after_ldx */
> -				EMIT2(0xEB, 0);
> -
> -				/* populate jmp_offset for JAE above to jump to start_of_ldx */
> -				start_of_ldx = prog;
> -				end_of_jmp[-1] = start_of_ldx - end_of_jmp;
> -			}
> +			start_of_ldx = prog;
>   			if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
>   			    BPF_MODE(insn->code) == BPF_MEMSX)
> -				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
> +				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
>   			else
> -				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
> +				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
>   			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>   			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
>   				struct exception_table_entry *ex;
>   				u8 *_insn = image + proglen + (start_of_ldx - temp);
>   				s64 delta;
>   
> -				/* populate jmp_offset for JMP above */
> -				start_of_ldx[-1] = prog - start_of_ldx;
> -
> -				if (insn->off && src_reg != dst_reg) {
> -					/* sub src_reg, insn->off
> -					 * Restore src_reg after "add src_reg, insn->off" in prev
> -					 * if statement. But if src_reg == dst_reg, emit_ldx
> -					 * above already clobbered src_reg, so no need to restore.
> -					 * If add src_reg, insn->off was unnecessary, no need to
> -					 * restore either.
> -					 */
> -					maybe_emit_1mod(&prog, src_reg, true);
> -					EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
> -				}
> -
>   				if (!bpf_prog->aux->extable)
>   					break;
>   
> @@ -3473,3 +3412,8 @@ bool bpf_jit_supports_ptr_xchg(void)
>   {
>   	return true;
>   }
> +
> +u64 bpf_arch_uaddress_limit(void)
> +{
> +	return TASK_SIZE_MAX + PAGE_SIZE;
> +}
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index c0d51bff8f96..cf12bfa2a78c 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -965,6 +965,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
>   bool bpf_jit_supports_exceptions(void);
>   bool bpf_jit_supports_ptr_xchg(void);
>   bool bpf_jit_supports_arena(void);
> +u64 bpf_arch_uaddress_limit(void);
>   void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>   bool bpf_helper_changes_pkt_data(void *func);
>   
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 5aacb1d3c4cc..a04695ca82b9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -2958,6 +2958,15 @@ bool __weak bpf_jit_supports_arena(void)
>   	return false;
>   }
>   
> +u64 __weak bpf_arch_uaddress_limit(void)
> +{
> +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
> +	return TASK_SIZE;
> +#else
> +	return 0;
> +#endif
> +}
> +
>   /* Return TRUE if the JIT backend satisfies the following two conditions:
>    * 1) JIT backend supports atomic_xchg() on pointer-sized words.
>    * 2) Under the specific arch, the implementation of xchg() is the same
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index de7813947981..7ce56da6cfa4 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -19657,6 +19657,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>   			goto next_insn;
>   		}
>   
> +		/* Make it impossible to de-reference a userspace address */
> +		if (BPF_CLASS(insn->code) == BPF_LDX &&
> +		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
> +		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
> +			struct bpf_insn *patch = &insn_buf[0];
> +			u64 uaddress_limit = bpf_arch_uaddress_limit();
> +
> +			if (!uaddress_limit)
> +				goto next_insn;
> +
> +			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
> +			if (insn->off)
> +				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
> +			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
> +			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
> +			*patch++ = *insn;
> +			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
> +			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);

But how does this address other cases where we could fault e.g. non-canonical,
vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
to really address all the cases aside from the overflow (good catch btw!) where kernel
turns into user address.

Thanks,
Daniel
Puranjay Mohan March 22, 2024, 3:05 p.m. UTC | #3
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 3/21/24 1:46 PM, Puranjay Mohan wrote:
>> With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
>> thwart invalid memory accesses, the JITs add an exception table entry
>> for all such accesses. But in case the src_reg + offset overflows and
>> turns into a userspace address, the BPF program might read that memory if
>> the user has mapped it.
>> 
>> There are architectural features that prevent the kernel from accessing
>> userspace memory, like Privileged Access Never (PAN) on ARM64,
>> Supervisor Mode Access Prevention (SMAP) on x86-64, Supervisor User
>> Memory access (SUM) on RISC-V, etc. But BPF should not rely on the
>> existence of these features.
>> 
>> Make the verifier add guard instructions around such memory accesses and
>> skip the load if the address falls into the userspace region.
>> 
>> The JITs need to implement bpf_arch_uaddress_limit() to define where
>> the userspace addresses end for that architecture or TASK_SIZE is taken
>> as default.
>> 
>> The implementation is as follows:
>> 
>> REG_AX =  SRC_REG
>> if(offset)
>> 	REG_AX += offset;
>> REG_AX >>= 32;
>> if (REG_AX <= (uaddress_limit >> 32))
>> 	DST_REG = 0;
>> else
>> 	DST_REG = *(size *)(SRC_REG + offset);
>> 
>> Comparing just the upper 32 bits of the load address with the upper
>> 32 bits of uaddress_limit implies that the values are being aligned down
>> to a 4GB boundary before comparison.
>> 
>> The above means that all loads with address <= uaddress_limit + 4GB are
>> skipped. This is acceptable because there is a large hole (much larger
>> than 4GB) between userspace and kernel space memory, therefore a
>> correctly functioning BPF program should not access this 4GB memory
>> above the userspace.
>> 
>> Let's analyze what this patch does to the following fentry program
>> dereferencing an untrusted pointer:
>> 
>>    SEC("fentry/tcp_v4_connect")
>>    int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
>>    {
>>                  *(volatile long *)sk;
>>                  return 0;
>>    }
>> 
>>      BPF Program before              |           BPF Program after
>>      ------------------              |           -----------------
>> 
>>    0: (79) r1 = *(u64 *)(r1 +0)          0: (79) r1 = *(u64 *)(r1 +0)
>>    -----------------------------------------------------------------------
>>    1: (79) r1 = *(u64 *)(r1 +0) --\      1: (bf) r11 = r1
>>    ----------------------------\   \     2: (77) r11 >>= 32
>>    2: (b7) r0 = 0               \   \    3: (b5) if r11 <= 0x8000 goto pc+2
>>    3: (95) exit                  \   \-> 4: (79) r1 = *(u64 *)(r1 +0)
>>                                   \      5: (05) goto pc+1
>>                                    \     6: (b7) r1 = 0
>>                                     \--------------------------------------
>>                                          7: (b7) r0 = 0
>>                                          8: (95) exit
>> 
>> As you can see from above, in the best case (off=0), 5 extra instructions
>> are emitted.
>> 
>> Now, we analyse the same program after it has gone through the JITs of
>> X86-64, ARM64, and RISC-V architectures. We follow the single load
>> instruction that has the untrusted pointer and see what instrumentation
>> has been added around it.
>> 
>>                                  x86-64 JIT
>>                                  ==========
>>       JIT's Instrumentation                  Verifier's Instrumentation
>>            (upstream)                               (This patch)
>>       ---------------------                  --------------------------
>> 
>>     0:   nopl   0x0(%rax,%rax,1)             0:   nopl   0x0(%rax,%rax,1)
>>     5:   xchg   %ax,%ax                      5:   xchg   %ax,%ax
>>     7:   push   %rbp                         7:   push   %rbp
>>     8:   mov    %rsp,%rbp                    8:   mov    %rsp,%rbp
>>     b:   mov    0x0(%rdi),%rdi               b:   mov    0x0(%rdi),%rdi
>>    ------------------------------------------------------------------------
>>     f:   movabs $0x800000000000,%r11         f:   mov    %rdi,%r10
>>    19:   cmp    %r11,%rdi                   12:   shr    $0x20,%r10
>>    1c:   jb     0x000000000000002a          16:   cmp    $0x8000,%r10
>>    1e:   mov    %rdi,%r11                   1d:   jbe    0x0000000000000025
>>    21:   add    $0x0,%r11              /--> 1f:   mov    0x0(%rdi),%rdi
>>    28:   jae    0x000000000000002e    /     23:   jmp    0x0000000000000027
>>    2a:   xor    %edi,%edi            /      25:   xor    %edi,%edi
>>    2c:   jmp    0x0000000000000032  / /------------------------------------
>>    2e:   mov    0x0(%rdi),%rdi  ---/ /      27:   xor    %eax,%eax
>>    ---------------------------------/       29:   leave
>>    32:   xor    %eax,%eax                   2a:   ret
>>    34:   leave
>>    35:   ret
>> 
>> The x86-64 JIT already emits some instructions to protect against user
>> memory access. The implementation in this patch leads to a smaller
>> number of instructions being emitted. In the worst case the JIT will
>> emit 9 extra instructions and this patch decreases it to 7.
>> 
>>                                    ARM64 JIT
>>                                    =========
>> 
>>          No Intrumentation                       Verifier's Instrumentation
>>             (upstream)                                  (This patch)
>>          -----------------                       --------------------------
>> 
>>     0:   add     x9, x30, #0x0                0:   add     x9, x30, #0x0
>>     4:   nop                                  4:   nop
>>     8:   paciasp                              8:   paciasp
>>     c:   stp     x29, x30, [sp, #-16]!        c:   stp     x29, x30, [sp, #-16]!
>>    10:   mov     x29, sp                     10:   mov     x29, sp
>>    14:   stp     x19, x20, [sp, #-16]!       14:   stp     x19, x20, [sp, #-16]!
>>    18:   stp     x21, x22, [sp, #-16]!       18:   stp     x21, x22, [sp, #-16]!
>>    1c:   stp     x25, x26, [sp, #-16]!       1c:   stp     x25, x26, [sp, #-16]!
>>    20:   stp     x27, x28, [sp, #-16]!       20:   stp     x27, x28, [sp, #-16]!
>>    24:   mov     x25, sp                     24:   mov     x25, sp
>>    28:   mov     x26, #0x0                   28:   mov     x26, #0x0
>>    2c:   sub     x27, x25, #0x0              2c:   sub     x27, x25, #0x0
>>    30:   sub     sp, sp, #0x0                30:   sub     sp, sp, #0x0
>>    34:   ldr     x0, [x0]                    34:   ldr     x0, [x0]
>> --------------------------------------------------------------------------------
>>    38:   ldr     x0, [x0] ----------\        38:   add     x9, x0, #0x0
>> -----------------------------------\\       3c:   lsr     x9, x9, #32
>>    3c:   mov     x7, #0x0            \\      40:   cmp     x9, #0x10, lsl #12
>>    40:   mov     sp, sp               \\     44:   b.ls    0x0000000000000050
>>    44:   ldp     x27, x28, [sp], #16   \\--> 48:   ldr     x0, [x0]
>>    48:   ldp     x25, x26, [sp], #16    \    4c:   b       0x0000000000000054
>>    4c:   ldp     x21, x22, [sp], #16     \   50:   mov     x0, #0x0
>>    50:   ldp     x19, x20, [sp], #16      \---------------------------------------
>>    54:   ldp     x29, x30, [sp], #16         54:   mov     x7, #0x0
>>    58:   add     x0, x7, #0x0                58:   mov     sp, sp
>>    5c:   autiasp                             5c:   ldp     x27, x28, [sp], #16
>>    60:   ret                                 60:   ldp     x25, x26, [sp], #16
>>    64:   nop                                 64:   ldp     x21, x22, [sp], #16
>>    68:   ldr     x10, 0x0000000000000070     68:   ldp     x19, x20, [sp], #16
>>    6c:   br      x10                         6c:   ldp     x29, x30, [sp], #16
>>                                              70:   add     x0, x7, #0x0
>>                                              74:   autiasp
>>                                              78:   ret
>>                                              7c:   nop
>>                                              80:   ldr     x10, 0x0000000000000088
>>                                              84:   br      x10
>> 
>> There are 6 extra instructions added in ARM64 in the best case. This will
>> become 7 in the worst case (off != 0).
>> 
>>                             RISC-V JIT (RISCV_ISA_C Disabled)
>>                             ==========
>> 
>>          No Intrumentation           Verifier's Instrumentation
>>             (upstream)                      (This patch)
>>          -----------------           --------------------------
>> 
>>     0:   nop                            0:   nop
>>     4:   nop                            4:   nop
>>     8:   li      a6, 33                 8:   li      a6, 33
>>     c:   addi    sp, sp, -16            c:   addi    sp, sp, -16
>>    10:   sd      s0, 8(sp)             10:   sd      s0, 8(sp)
>>    14:   addi    s0, sp, 16            14:   addi    s0, sp, 16
>>    18:   ld      a0, 0(a0)             18:   ld      a0, 0(a0)
>> ---------------------------------------------------------------
>>    1c:   ld      a0, 0(a0) --\         1c:   mv      t0, a0
>> --------------------------\  \        20:   srli    t0, t0, 32
>>    20:   li      a5, 0      \  \       24:   lui     t1, 4096
>>    24:   ld      s0, 8(sp)   \  \      28:   sext.w  t1, t1
>>    28:   addi    sp, sp, 16   \  \     2c:   bgeu    t1, t0, 12
>>    2c:   sext.w  a0, a5        \  \--> 30:   ld      a0, 0(a0)
>>    30:   ret                    \      34:   j       8
>>                                  \     38:   li      a0, 0
>>                                   \------------------------------
>>                                        3c:   li      a5, 0
>>                                        40:   ld      s0, 8(sp)
>>                                        44:   addi    sp, sp, 16
>>                                        48:   sext.w  a0, a5
>>                                        4c:   ret
>> 
>> There are 7 extra instructions added in RISC-V.
>> 
>> Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
>> Reported-by: Breno Leitao <leitao@debian.org>
>> Suggested-by: Alexei Starovoitov <ast@kernel.org>
>> Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
>> ---
>> V3: https://lore.kernel.org/bpf/20240321120842.78983-1-puranjay12@gmail.com/
>> Changes in V4:
>> - Disable this feature on architectures that don't define
>>    CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE.
>> - By doing the above, we don't need anything explicitly for s390x.
>> 
>> V2: https://lore.kernel.org/bpf/20240321101058.68530-1-puranjay12@gmail.com/
>> Changes in V3:
>> - Return 0 from bpf_arch_uaddress_limit() in disabled case because it
>>    returns u64.
>> - Modify the check in verifier to no do instrumentation when uaddress_limit
>>    is 0.
>> 
>> V1: https://lore.kernel.org/bpf/20240320105436.4781-1-puranjay12@gmail.com/
>> Changes in V2:
>> - Disable this feature on s390x.
>> ---
>>   arch/x86/net/bpf_jit_comp.c | 72 +++++--------------------------------
>>   include/linux/filter.h      |  1 +
>>   kernel/bpf/core.c           |  9 +++++
>>   kernel/bpf/verifier.c       | 30 ++++++++++++++++
>>   4 files changed, 48 insertions(+), 64 deletions(-)
>> 
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 4900b1ee019f..9b3136187938 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -1327,7 +1327,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>>   		u8 b2 = 0, b3 = 0;
>>   		u8 *start_of_ldx;
>>   		s64 jmp_offset;
>> -		s16 insn_off;
>>   		u8 jmp_cond;
>>   		u8 *func;
>>   		int nops;
>> @@ -1802,78 +1801,18 @@ st:			if (is_imm8(insn->off))
>>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
>>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
>>   		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
>> -			insn_off = insn->off;
>> -
>> -			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>> -			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
>> -				/* Conservatively check that src_reg + insn->off is a kernel address:
>> -				 *   src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
>> -				 * src_reg is used as scratch for src_reg += insn->off and restored
>> -				 * after emit_ldx if necessary
>> -				 */
>> -
>> -				u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
>> -				u8 *end_of_jmp;
>> -
>> -				/* At end of these emitted checks, insn->off will have been added
>> -				 * to src_reg, so no need to do relative load with insn->off offset
>> -				 */
>> -				insn_off = 0;
>> -
>> -				/* movabsq r11, limit */
>> -				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
>> -				EMIT((u32)limit, 4);
>> -				EMIT(limit >> 32, 4);
>> -
>> -				if (insn->off) {
>> -					/* add src_reg, insn->off */
>> -					maybe_emit_1mod(&prog, src_reg, true);
>> -					EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
>> -				}
>> -
>> -				/* cmp src_reg, r11 */
>> -				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
>> -				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
>> -
>> -				/* if unsigned '>=', goto load */
>> -				EMIT2(X86_JAE, 0);
>> -				end_of_jmp = prog;
>> -
>> -				/* xor dst_reg, dst_reg */
>> -				emit_mov_imm32(&prog, false, dst_reg, 0);
>> -				/* jmp byte_after_ldx */
>> -				EMIT2(0xEB, 0);
>> -
>> -				/* populate jmp_offset for JAE above to jump to start_of_ldx */
>> -				start_of_ldx = prog;
>> -				end_of_jmp[-1] = start_of_ldx - end_of_jmp;
>> -			}
>> +			start_of_ldx = prog;
>>   			if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
>>   			    BPF_MODE(insn->code) == BPF_MEMSX)
>> -				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
>> +				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
>>   			else
>> -				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
>> +				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
>>   			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>>   			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
>>   				struct exception_table_entry *ex;
>>   				u8 *_insn = image + proglen + (start_of_ldx - temp);
>>   				s64 delta;
>>   
>> -				/* populate jmp_offset for JMP above */
>> -				start_of_ldx[-1] = prog - start_of_ldx;
>> -
>> -				if (insn->off && src_reg != dst_reg) {
>> -					/* sub src_reg, insn->off
>> -					 * Restore src_reg after "add src_reg, insn->off" in prev
>> -					 * if statement. But if src_reg == dst_reg, emit_ldx
>> -					 * above already clobbered src_reg, so no need to restore.
>> -					 * If add src_reg, insn->off was unnecessary, no need to
>> -					 * restore either.
>> -					 */
>> -					maybe_emit_1mod(&prog, src_reg, true);
>> -					EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
>> -				}
>> -
>>   				if (!bpf_prog->aux->extable)
>>   					break;
>>   
>> @@ -3473,3 +3412,8 @@ bool bpf_jit_supports_ptr_xchg(void)
>>   {
>>   	return true;
>>   }
>> +
>> +u64 bpf_arch_uaddress_limit(void)
>> +{
>> +	return TASK_SIZE_MAX + PAGE_SIZE;
>> +}
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index c0d51bff8f96..cf12bfa2a78c 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -965,6 +965,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
>>   bool bpf_jit_supports_exceptions(void);
>>   bool bpf_jit_supports_ptr_xchg(void);
>>   bool bpf_jit_supports_arena(void);
>> +u64 bpf_arch_uaddress_limit(void);
>>   void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>>   bool bpf_helper_changes_pkt_data(void *func);
>>   
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 5aacb1d3c4cc..a04695ca82b9 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -2958,6 +2958,15 @@ bool __weak bpf_jit_supports_arena(void)
>>   	return false;
>>   }
>>   
>> +u64 __weak bpf_arch_uaddress_limit(void)
>> +{
>> +#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
>> +	return TASK_SIZE;
>> +#else
>> +	return 0;
>> +#endif
>> +}
>> +
>>   /* Return TRUE if the JIT backend satisfies the following two conditions:
>>    * 1) JIT backend supports atomic_xchg() on pointer-sized words.
>>    * 2) Under the specific arch, the implementation of xchg() is the same
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index de7813947981..7ce56da6cfa4 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -19657,6 +19657,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
>>   			goto next_insn;
>>   		}
>>   
>> +		/* Make it impossible to de-reference a userspace address */
>> +		if (BPF_CLASS(insn->code) == BPF_LDX &&
>> +		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>> +		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
>> +			struct bpf_insn *patch = &insn_buf[0];
>> +			u64 uaddress_limit = bpf_arch_uaddress_limit();
>> +
>> +			if (!uaddress_limit)
>> +				goto next_insn;
>> +
>> +			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
>> +			if (insn->off)
>> +				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
>> +			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
>> +			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
>> +			*patch++ = *insn;
>> +			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
>> +			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
>
> But how does this address other cases where we could fault e.g. non-canonical,
> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
> to really address all the cases aside from the overflow (good catch btw!) where kernel
> turns into user address.

So, we are trying to ~simulate a call to
copy_from_kernel_nofault_allowed() here. If the address under
consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
skip that load because that address could be mapped by the user.

If the address is above TASK_SIZE + 4GB, we allow the load and it could
cause a fault if the address is invalid, non-canonical etc. Taking the
fault is fine because JIT will add an exception table entry for
for that load with BPF_PBOBE_MEM.

The vsyscall page is special, this approach skips all loads from this
page. I am not sure if that is acceptable.


Thanks,
Puranjay
Daniel Borkmann March 22, 2024, 4:28 p.m. UTC | #4
On 3/22/24 4:05 PM, Puranjay Mohan wrote:
[...]
>>> +		/* Make it impossible to de-reference a userspace address */
>>> +		if (BPF_CLASS(insn->code) == BPF_LDX &&
>>> +		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>>> +		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
>>> +			struct bpf_insn *patch = &insn_buf[0];
>>> +			u64 uaddress_limit = bpf_arch_uaddress_limit();
>>> +
>>> +			if (!uaddress_limit)
>>> +				goto next_insn;
>>> +
>>> +			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
>>> +			if (insn->off)
>>> +				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
>>> +			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
>>> +			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
>>> +			*patch++ = *insn;
>>> +			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
>>> +			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
>>
>> But how does this address other cases where we could fault e.g. non-canonical,
>> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
>> to really address all the cases aside from the overflow (good catch btw!) where kernel
>> turns into user address.
> 
> So, we are trying to ~simulate a call to
> copy_from_kernel_nofault_allowed() here. If the address under
> consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
> skip that load because that address could be mapped by the user.
> 
> If the address is above TASK_SIZE + 4GB, we allow the load and it could
> cause a fault if the address is invalid, non-canonical etc. Taking the
> fault is fine because JIT will add an exception table entry for
> for that load with BPF_PBOBE_MEM.

Are you sure? I don't think the kernel handles non-canonical fixup.

> The vsyscall page is special, this approach skips all loads from this
> page. I am not sure if that is acceptable.

The bpf_probe_read_kernel() does handle it fine via copy_from_kernel_nofault().

So there is tail risk that BPF_PROBE_* could trigger a crash. Other archs might
have other quirks, e.g. in case of loongarch it says highest bit set means kernel
space.
Puranjay Mohan March 22, 2024, 4:53 p.m. UTC | #5
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 3/22/24 4:05 PM, Puranjay Mohan wrote:
> [...]
>>>> +		/* Make it impossible to de-reference a userspace address */
>>>> +		if (BPF_CLASS(insn->code) == BPF_LDX &&
>>>> +		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>>>> +		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
>>>> +			struct bpf_insn *patch = &insn_buf[0];
>>>> +			u64 uaddress_limit = bpf_arch_uaddress_limit();
>>>> +
>>>> +			if (!uaddress_limit)
>>>> +				goto next_insn;
>>>> +
>>>> +			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
>>>> +			if (insn->off)
>>>> +				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
>>>> +			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
>>>> +			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
>>>> +			*patch++ = *insn;
>>>> +			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
>>>> +			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
>>>
>>> But how does this address other cases where we could fault e.g. non-canonical,
>>> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
>>> to really address all the cases aside from the overflow (good catch btw!) where kernel
>>> turns into user address.
>> 
>> So, we are trying to ~simulate a call to
>> copy_from_kernel_nofault_allowed() here. If the address under
>> consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
>> skip that load because that address could be mapped by the user.
>> 
>> If the address is above TASK_SIZE + 4GB, we allow the load and it could
>> cause a fault if the address is invalid, non-canonical etc. Taking the
>> fault is fine because JIT will add an exception table entry for
>> for that load with BPF_PBOBE_MEM.
>
> Are you sure? I don't think the kernel handles non-canonical fixup.

Atleast for ARM64 for I don't see a differentiation between the handling
of canonical and non-canonical addresses.
do_translation_fault() checks if addr < TASK_SIZE and calls
do_page_fault() or if the address is greater than TASK_SIZE (it is a
kernel address), do_bad_area() is called.

Both of these call __do_kernel_fault() if fault is from kernel mode and it
does fixup_exception().

>
>> The vsyscall page is special, this approach skips all loads from this
>> page. I am not sure if that is acceptable.
>
> The bpf_probe_read_kernel() does handle it fine via copy_from_kernel_nofault().
bpf_probe_read_kernel() is skipping reading from the vsyscall page, that
is what this patch does as well.

ARM64, RISCV, and some other archs don't implement
copy_from_kernel_nofault_allowed() so I think the we should fix the
common case where the BPF program should not be allowed to access memory
below TASK_SIZE. This would be true for all architectures. 

>
> So there is tail risk that BPF_PROBE_* could trigger a crash. Other archs might

Can you explain this a bit more, how will BPF_PROBE_* trigger a crash?

> have other quirks, e.g. in case of loongarch it says highest bit set means kernel
> space.

Thanks,
Puranjay
Alexei Starovoitov March 23, 2024, 7:13 p.m. UTC | #6
On Fri, Mar 22, 2024 at 9:28 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 3/22/24 4:05 PM, Puranjay Mohan wrote:
> [...]
> >>> +           /* Make it impossible to de-reference a userspace address */
> >>> +           if (BPF_CLASS(insn->code) == BPF_LDX &&
> >>> +               (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
> >>> +                BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
> >>> +                   struct bpf_insn *patch = &insn_buf[0];
> >>> +                   u64 uaddress_limit = bpf_arch_uaddress_limit();
> >>> +
> >>> +                   if (!uaddress_limit)
> >>> +                           goto next_insn;
> >>> +
> >>> +                   *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
> >>> +                   if (insn->off)
> >>> +                           *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
> >>> +                   *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
> >>> +                   *patch++ = *insn;
> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
> >>> +                   *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
> >>
> >> But how does this address other cases where we could fault e.g. non-canonical,
> >> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
> >> to really address all the cases aside from the overflow (good catch btw!) where kernel
> >> turns into user address.
> >
> > So, we are trying to ~simulate a call to
> > copy_from_kernel_nofault_allowed() here. If the address under
> > consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
> > skip that load because that address could be mapped by the user.
> >
> > If the address is above TASK_SIZE + 4GB, we allow the load and it could
> > cause a fault if the address is invalid, non-canonical etc. Taking the
> > fault is fine because JIT will add an exception table entry for
> > for that load with BPF_PBOBE_MEM.
>
> Are you sure? I don't think the kernel handles non-canonical fixup.

I believe it handles it fine otherwise our selftest bpf_testmod_return_ptr:
   case 4: return (void *)(1ull << 60);    /* non-canonical and invalid */
would have been crashing for the last 3 years,
since we've been running it.

> > The vsyscall page is special, this approach skips all loads from this
> > page. I am not sure if that is acceptable.
>
> The bpf_probe_read_kernel() does handle it fine via copy_from_kernel_nofault().
>
> So there is tail risk that BPF_PROBE_* could trigger a crash.

For this patch let's do
return max(TASK_SIZE_MAX + PAGE_SIZE, VSYSCALL_ADDR)
to cover both with one check?

> Other archs might
> have other quirks, e.g. in case of loongarch it says highest bit set means kernel
> space.

let's tackle loongarch with whatever quirks it has separately.
Puranjay Mohan March 24, 2024, 10:44 a.m. UTC | #7
Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:

> On Fri, Mar 22, 2024 at 9:28 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>>
>> On 3/22/24 4:05 PM, Puranjay Mohan wrote:
>> [...]
>> >>> +           /* Make it impossible to de-reference a userspace address */
>> >>> +           if (BPF_CLASS(insn->code) == BPF_LDX &&
>> >>> +               (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
>> >>> +                BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
>> >>> +                   struct bpf_insn *patch = &insn_buf[0];
>> >>> +                   u64 uaddress_limit = bpf_arch_uaddress_limit();
>> >>> +
>> >>> +                   if (!uaddress_limit)
>> >>> +                           goto next_insn;
>> >>> +
>> >>> +                   *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
>> >>> +                   if (insn->off)
>> >>> +                           *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
>> >>> +                   *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
>> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
>> >>> +                   *patch++ = *insn;
>> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
>> >>> +                   *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
>> >>
>> >> But how does this address other cases where we could fault e.g. non-canonical,
>> >> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
>> >> to really address all the cases aside from the overflow (good catch btw!) where kernel
>> >> turns into user address.
>> >
>> > So, we are trying to ~simulate a call to
>> > copy_from_kernel_nofault_allowed() here. If the address under
>> > consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
>> > skip that load because that address could be mapped by the user.
>> >
>> > If the address is above TASK_SIZE + 4GB, we allow the load and it could
>> > cause a fault if the address is invalid, non-canonical etc. Taking the
>> > fault is fine because JIT will add an exception table entry for
>> > for that load with BPF_PBOBE_MEM.
>>
>> Are you sure? I don't think the kernel handles non-canonical fixup.
>
> I believe it handles it fine otherwise our selftest bpf_testmod_return_ptr:
>    case 4: return (void *)(1ull << 60);    /* non-canonical and invalid */
> would have been crashing for the last 3 years,
> since we've been running it.
>
>> > The vsyscall page is special, this approach skips all loads from this
>> > page. I am not sure if that is acceptable.
>>
>> The bpf_probe_read_kernel() does handle it fine via copy_from_kernel_nofault().
>>
>> So there is tail risk that BPF_PROBE_* could trigger a crash.
>
> For this patch let's do
> return max(TASK_SIZE_MAX + PAGE_SIZE, VSYSCALL_ADDR)
> to cover both with one check?

I agree, will add this in the next version.

>> Other archs might
>> have other quirks, e.g. in case of loongarch it says highest bit set means kernel
>> space.
>
> let's tackle loongarch with whatever quirks it has separately.

Yes, having the current patch will not break loongarch, it will help it
skip some userspace addresses. We can later implement
bpf_arch_uaddress_limit() in loongarch JIT to handle its specific
quirks.
Alexei Starovoitov March 24, 2024, 7:12 p.m. UTC | #8
On Sun, Mar 24, 2024 at 3:44 AM Puranjay Mohan <puranjay12@gmail.com> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> writes:
>
> > On Fri, Mar 22, 2024 at 9:28 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >>
> >> On 3/22/24 4:05 PM, Puranjay Mohan wrote:
> >> [...]
> >> >>> +           /* Make it impossible to de-reference a userspace address */
> >> >>> +           if (BPF_CLASS(insn->code) == BPF_LDX &&
> >> >>> +               (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
> >> >>> +                BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
> >> >>> +                   struct bpf_insn *patch = &insn_buf[0];
> >> >>> +                   u64 uaddress_limit = bpf_arch_uaddress_limit();
> >> >>> +
> >> >>> +                   if (!uaddress_limit)
> >> >>> +                           goto next_insn;
> >> >>> +
> >> >>> +                   *patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
> >> >>> +                   if (insn->off)
> >> >>> +                           *patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
> >> >>> +                   *patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
> >> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
> >> >>> +                   *patch++ = *insn;
> >> >>> +                   *patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
> >> >>> +                   *patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
> >> >>
> >> >> But how does this address other cases where we could fault e.g. non-canonical,
> >> >> vsyscall page, etc? Technically, we would have to call to copy_from_kernel_nofault_allowed()
> >> >> to really address all the cases aside from the overflow (good catch btw!) where kernel
> >> >> turns into user address.
> >> >
> >> > So, we are trying to ~simulate a call to
> >> > copy_from_kernel_nofault_allowed() here. If the address under
> >> > consideration is below TASK_SIZE (TASK_SIZE + 4GB to be precise) then we
> >> > skip that load because that address could be mapped by the user.
> >> >
> >> > If the address is above TASK_SIZE + 4GB, we allow the load and it could
> >> > cause a fault if the address is invalid, non-canonical etc. Taking the
> >> > fault is fine because JIT will add an exception table entry for
> >> > for that load with BPF_PBOBE_MEM.
> >>
> >> Are you sure? I don't think the kernel handles non-canonical fixup.
> >
> > I believe it handles it fine otherwise our selftest bpf_testmod_return_ptr:
> >    case 4: return (void *)(1ull << 60);    /* non-canonical and invalid */
> > would have been crashing for the last 3 years,
> > since we've been running it.
> >
> >> > The vsyscall page is special, this approach skips all loads from this
> >> > page. I am not sure if that is acceptable.
> >>
> >> The bpf_probe_read_kernel() does handle it fine via copy_from_kernel_nofault().
> >>
> >> So there is tail risk that BPF_PROBE_* could trigger a crash.
> >
> > For this patch let's do
> > return max(TASK_SIZE_MAX + PAGE_SIZE, VSYSCALL_ADDR)
> > to cover both with one check?
>
> I agree, will add this in the next version.

Sorry. I didn't look at actual value when I suggested this.
Let's think how to check for them with one conditional branch.
Maybe we can REG_AX >>= 24 instead of 32
and do some clever math, since we need to:
addr < 0x7ffffful || addr == 0xfffffffffful
?
If we have to do two branches that's fine for now,
but we probably need to leave it to JITs,
since they can emit two faster conditional moves and use
single conditional jump or
introduce new pseudo bpf insns equivalent to x86 set[ae|ne|..]
diff mbox series

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4900b1ee019f..9b3136187938 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1327,7 +1327,6 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 b2 = 0, b3 = 0;
 		u8 *start_of_ldx;
 		s64 jmp_offset;
-		s16 insn_off;
 		u8 jmp_cond;
 		u8 *func;
 		int nops;
@@ -1802,78 +1801,18 @@  st:			if (is_imm8(insn->off))
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
-			insn_off = insn->off;
-
-			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
-			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
-				/* Conservatively check that src_reg + insn->off is a kernel address:
-				 *   src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
-				 * src_reg is used as scratch for src_reg += insn->off and restored
-				 * after emit_ldx if necessary
-				 */
-
-				u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
-				u8 *end_of_jmp;
-
-				/* At end of these emitted checks, insn->off will have been added
-				 * to src_reg, so no need to do relative load with insn->off offset
-				 */
-				insn_off = 0;
-
-				/* movabsq r11, limit */
-				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
-				EMIT((u32)limit, 4);
-				EMIT(limit >> 32, 4);
-
-				if (insn->off) {
-					/* add src_reg, insn->off */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
-				}
-
-				/* cmp src_reg, r11 */
-				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
-				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
-
-				/* if unsigned '>=', goto load */
-				EMIT2(X86_JAE, 0);
-				end_of_jmp = prog;
-
-				/* xor dst_reg, dst_reg */
-				emit_mov_imm32(&prog, false, dst_reg, 0);
-				/* jmp byte_after_ldx */
-				EMIT2(0xEB, 0);
-
-				/* populate jmp_offset for JAE above to jump to start_of_ldx */
-				start_of_ldx = prog;
-				end_of_jmp[-1] = start_of_ldx - end_of_jmp;
-			}
+			start_of_ldx = prog;
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
 			    BPF_MODE(insn->code) == BPF_MEMSX)
-				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			else
-				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
 				struct exception_table_entry *ex;
 				u8 *_insn = image + proglen + (start_of_ldx - temp);
 				s64 delta;
 
-				/* populate jmp_offset for JMP above */
-				start_of_ldx[-1] = prog - start_of_ldx;
-
-				if (insn->off && src_reg != dst_reg) {
-					/* sub src_reg, insn->off
-					 * Restore src_reg after "add src_reg, insn->off" in prev
-					 * if statement. But if src_reg == dst_reg, emit_ldx
-					 * above already clobbered src_reg, so no need to restore.
-					 * If add src_reg, insn->off was unnecessary, no need to
-					 * restore either.
-					 */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
-				}
-
 				if (!bpf_prog->aux->extable)
 					break;
 
@@ -3473,3 +3412,8 @@  bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
 }
+
+u64 bpf_arch_uaddress_limit(void)
+{
+	return TASK_SIZE_MAX + PAGE_SIZE;
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c0d51bff8f96..cf12bfa2a78c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -965,6 +965,7 @@  bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
+u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5aacb1d3c4cc..a04695ca82b9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2958,6 +2958,15 @@  bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+	return TASK_SIZE;
+#else
+	return 0;
+#endif
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index de7813947981..7ce56da6cfa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19657,6 +19657,36 @@  static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Make it impossible to de-reference a userspace address */
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+			struct bpf_insn *patch = &insn_buf[0];
+			u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+			if (!uaddress_limit)
+				goto next_insn;
+
+			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+			if (insn->off)
+				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+			*patch++ = *insn;
+			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto next_insn;
+		}
+
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
 		if (BPF_CLASS(insn->code) == BPF_LD &&
 		    (BPF_MODE(insn->code) == BPF_ABS ||