diff mbox series

[bpf-next,v11,4/7] bpf, x86: Support private stack in jit

Message ID 20241109025332.150019-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Support private stack for bpf progs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 204 this patch: 204
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 16 maintainers not CCed: mingo@redhat.com x86@kernel.org hpa@zytor.com eddyz87@gmail.com bp@alien8.de dsahern@kernel.org netdev@vger.kernel.org john.fastabend@gmail.com haoluo@google.com dave.hansen@linux.intel.com kpsingh@kernel.org martin.lau@linux.dev tglx@linutronix.de sdf@fomichev.me song@kernel.org jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 255 this patch: 255
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6971 this patch: 6971
netdev/checkpatch warning CHECK: No space is necessary after a cast WARNING: line length of 91 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Yonghong Song Nov. 9, 2024, 2:53 a.m. UTC
Private stack is allocated in function bpf_int_jit_compile() with
alignment 16. The x86 register 9 (X86_REG_R9) is used to replace
bpf frame register (BPF_REG_10). The private stack is used per
subprog per cpu. The X86_REG_R9 is saved and restored around every
func call (not including tailcall) to maintain correctness of
X86_REG_R9.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 arch/x86/net/bpf_jit_comp.c | 77 +++++++++++++++++++++++++++++++++++++
 include/linux/bpf.h         |  1 +
 2 files changed, 78 insertions(+)

Comments

Alexei Starovoitov Nov. 9, 2024, 8:14 p.m. UTC | #1
On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>         stack_depth = bpf_prog->aux->stack_depth;
> +       if (bpf_prog->aux->priv_stack_ptr) {
> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> +               stack_depth = 0;
> +       }

...

> +       priv_stack_ptr = prog->aux->priv_stack_ptr;
> +       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> +               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);

After applying I started to see crashes running test_progs -j like:

[  173.465191] Oops: general protection fault, probably for
non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
[  173.466053] KASAN: probably user-memory-access in range
[0x00000000000057c8-0x00000000000057cf]
[  173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
[  173.466053] Call Trace:
[  173.466053]  <IRQ>
[  173.466053]  ? die_addr+0x40/0xa0
[  173.466053]  ? exc_general_protection+0x138/0x1f0
[  173.466053]  ? asm_exc_general_protection+0x26/0x30
[  173.466053]  ? dst_dev_put+0x1e/0x220
[  173.466053]  rt_fibinfo_free_cpus.part.0+0x8c/0x130
[  173.466053]  fib_nh_common_release+0xd6/0x2a0
[  173.466053]  free_fib_info_rcu+0x129/0x360
[  173.466053]  ? rcu_core+0xa55/0x1340
[  173.466053]  rcu_core+0xa55/0x1340
[  173.466053]  ? rcutree_report_cpu_dead+0x380/0x380
[  173.466053]  ? hrtimer_interrupt+0x319/0x7c0
[  173.466053]  handle_softirqs+0x14c/0x4d0

[   35.134115] Oops: general protection fault, probably for
non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
[   35.135089] KASAN: probably user-memory-access in range
[0x00007fff880fdde0-0x00007fff880fdde7]
[   35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
[   35.135089] Call Trace:
[   35.135089]  <TASK>
[   35.135089]  ? die_addr+0x40/0xa0
[   35.135089]  ? exc_general_protection+0x138/0x1f0
[   35.135089]  ? asm_exc_general_protection+0x26/0x30
[   35.135089]  ? destroy_workqueue+0x4b4/0xa70
[   35.135089]  ? destroy_workqueue+0x592/0xa70
[   35.135089]  ? __mutex_unlock_slowpath.isra.0+0x270/0x270
[   35.135089]  ext4_put_super+0xff/0xd70
[   35.135089]  generic_shutdown_super+0x148/0x4c0
[   35.135089]  kill_block_super+0x3b/0x90
[   35.135089]  ext4_kill_sb+0x65/0x90

I think I see the bug... quoted it above...

Please make sure you reproduce it first.

Then let's figure out a way how to test for such things and
what we can do to make kasan detect it sooner,
since above crashes have no indication at all that bpf priv stack
is responsible.
If there is another bug in priv stack and it will cause future
crashes we need to make sure that priv stack corruption is
detected by kasan (or whatever mechanism) earlier.

We cannot land private stack support when there is
a possibility of such silent corruption.

pw-bot: cr
Yonghong Song Nov. 10, 2024, 2:34 a.m. UTC | #2
On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>          stack_depth = bpf_prog->aux->stack_depth;
>> +       if (bpf_prog->aux->priv_stack_ptr) {
>> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> +               stack_depth = 0;
>> +       }
> ...
>
>> +       priv_stack_ptr = prog->aux->priv_stack_ptr;
>> +       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> +               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> After applying I started to see crashes running test_progs -j like:
>
> [  173.465191] Oops: general protection fault, probably for
> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> [  173.466053] KASAN: probably user-memory-access in range
> [0x00000000000057c8-0x00000000000057cf]
> [  173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> [  173.466053] Call Trace:
> [  173.466053]  <IRQ>
> [  173.466053]  ? die_addr+0x40/0xa0
> [  173.466053]  ? exc_general_protection+0x138/0x1f0
> [  173.466053]  ? asm_exc_general_protection+0x26/0x30
> [  173.466053]  ? dst_dev_put+0x1e/0x220
> [  173.466053]  rt_fibinfo_free_cpus.part.0+0x8c/0x130
> [  173.466053]  fib_nh_common_release+0xd6/0x2a0
> [  173.466053]  free_fib_info_rcu+0x129/0x360
> [  173.466053]  ? rcu_core+0xa55/0x1340
> [  173.466053]  rcu_core+0xa55/0x1340
> [  173.466053]  ? rcutree_report_cpu_dead+0x380/0x380
> [  173.466053]  ? hrtimer_interrupt+0x319/0x7c0
> [  173.466053]  handle_softirqs+0x14c/0x4d0
>
> [   35.134115] Oops: general protection fault, probably for
> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> [   35.135089] KASAN: probably user-memory-access in range
> [0x00007fff880fdde0-0x00007fff880fdde7]
> [   35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> [   35.135089] Call Trace:
> [   35.135089]  <TASK>
> [   35.135089]  ? die_addr+0x40/0xa0
> [   35.135089]  ? exc_general_protection+0x138/0x1f0
> [   35.135089]  ? asm_exc_general_protection+0x26/0x30
> [   35.135089]  ? destroy_workqueue+0x4b4/0xa70
> [   35.135089]  ? destroy_workqueue+0x592/0xa70
> [   35.135089]  ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> [   35.135089]  ext4_put_super+0xff/0xd70
> [   35.135089]  generic_shutdown_super+0x148/0x4c0
> [   35.135089]  kill_block_super+0x3b/0x90
> [   35.135089]  ext4_kill_sb+0x65/0x90
>
> I think I see the bug... quoted it above...
>
> Please make sure you reproduce it first.
>
> Then let's figure out a way how to test for such things and
> what we can do to make kasan detect it sooner,
> since above crashes have no indication at all that bpf priv stack
> is responsible.
> If there is another bug in priv stack and it will cause future
> crashes we need to make sure that priv stack corruption is
> detected by kasan (or whatever mechanism) earlier.
>
> We cannot land private stack support when there is
> a possibility of such silent corruption.

I can reproduce it now when running multiple times.
I will debug this ASAP.

>
> pw-bot: cr
Yonghong Song Nov. 11, 2024, 11:18 p.m. UTC | #3
On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>          stack_depth = bpf_prog->aux->stack_depth;
>> +       if (bpf_prog->aux->priv_stack_ptr) {
>> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> +               stack_depth = 0;
>> +       }
> ...
>
>> +       priv_stack_ptr = prog->aux->priv_stack_ptr;
>> +       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> +               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> After applying I started to see crashes running test_progs -j like:
>
> [  173.465191] Oops: general protection fault, probably for
> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> [  173.466053] KASAN: probably user-memory-access in range
> [0x00000000000057c8-0x00000000000057cf]
> [  173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> [  173.466053] Call Trace:
> [  173.466053]  <IRQ>
> [  173.466053]  ? die_addr+0x40/0xa0
> [  173.466053]  ? exc_general_protection+0x138/0x1f0
> [  173.466053]  ? asm_exc_general_protection+0x26/0x30
> [  173.466053]  ? dst_dev_put+0x1e/0x220
> [  173.466053]  rt_fibinfo_free_cpus.part.0+0x8c/0x130
> [  173.466053]  fib_nh_common_release+0xd6/0x2a0
> [  173.466053]  free_fib_info_rcu+0x129/0x360
> [  173.466053]  ? rcu_core+0xa55/0x1340
> [  173.466053]  rcu_core+0xa55/0x1340
> [  173.466053]  ? rcutree_report_cpu_dead+0x380/0x380
> [  173.466053]  ? hrtimer_interrupt+0x319/0x7c0
> [  173.466053]  handle_softirqs+0x14c/0x4d0
>
> [   35.134115] Oops: general protection fault, probably for
> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> [   35.135089] KASAN: probably user-memory-access in range
> [0x00007fff880fdde0-0x00007fff880fdde7]
> [   35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> [   35.135089] Call Trace:
> [   35.135089]  <TASK>
> [   35.135089]  ? die_addr+0x40/0xa0
> [   35.135089]  ? exc_general_protection+0x138/0x1f0
> [   35.135089]  ? asm_exc_general_protection+0x26/0x30
> [   35.135089]  ? destroy_workqueue+0x4b4/0xa70
> [   35.135089]  ? destroy_workqueue+0x592/0xa70
> [   35.135089]  ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> [   35.135089]  ext4_put_super+0xff/0xd70
> [   35.135089]  generic_shutdown_super+0x148/0x4c0
> [   35.135089]  kill_block_super+0x3b/0x90
> [   35.135089]  ext4_kill_sb+0x65/0x90
>
> I think I see the bug... quoted it above...
>
> Please make sure you reproduce it first.

Indeed, using the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
fixed the problem.

The following is additional change on top of this patch set to
   - fix the memory access bug as suggested by Alexei in the above
   - Add guard space for private stack: an additional 16 bytes at the
     end of the stack will be the guard space. The content is prepopulated
     and checked at the per-cpu private stack free site. If the content
     is not as expected, a kernel message will be emitted.
   - Add kasan support for the guard space.

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 55556a64f776..d796d419bb48 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
  #define LOAD_TAIL_CALL_CNT_PTR(stack)                          \
         __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
  
+#define PRIV_STACK_GUARD_SZ    16
+#define PRIV_STACK_GUARD_VAL   0xEB9F1234eb9f1234ULL
+
  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
                   int oldproglen, struct jit_context *ctx, bool jmp_padding)
  {
@@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
         u8 *prog = temp;
         u32 stack_depth;
         int err;
+       // int stack_size;
  
         stack_depth = bpf_prog->aux->stack_depth;
         if (bpf_prog->aux->priv_stack_ptr) {
-               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
+               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
                 stack_depth = 0;
         }
  
@@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                 emit_mov_imm64(&prog, X86_REG_R12,
                                arena_vm_start >> 32, (u32) arena_vm_start);
  
-       if (priv_frame_ptr)
+       if (priv_frame_ptr) {
+#if 0
+               /* hack to emit and write some data to guard area */
+               emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
+
+               /* See case BPF_ST | BPF_MEM | BPF_W */
+               EMIT2(0x41, 0xC7);
+               EMIT2(add_1reg(0x40, X86_REG_R9), 0);
+               EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
+#endif
                 emit_priv_frame_ptr(&prog, priv_frame_ptr);
+       }
  
         ilen = prog - temp;
         if (rw_image)
@@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
         struct x64_jit_data *jit_data;
         int proglen, oldproglen = 0;
         struct jit_context ctx = {};
+       int priv_stack_size, cpu;
         bool tmp_blinded = false;
         bool extra_pass = false;
         bool padding = false;
         u8 *rw_image = NULL;
         u8 *image = NULL;
+       u64 *guard_ptr;
         int *addrs;
         int pass;
         int i;
@@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
         }
         priv_stack_ptr = prog->aux->priv_stack_ptr;
         if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
-               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
+               priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
+               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
                 if (!priv_stack_ptr) {
                         prog = orig_prog;
                         goto out_priv_stack;
                 }
+               for_each_possible_cpu(cpu) {
+                       guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+                       guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
+                       kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);
+               }
                 prog->aux->priv_stack_ptr = priv_stack_ptr;
         }
         addrs = jit_data->addrs;
@@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
  out_addrs:
                 kvfree(addrs);
                 if (!image && priv_stack_ptr) {
+                       for_each_possible_cpu(cpu) {
+                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
+                       }
                         free_percpu(priv_stack_ptr);
                         prog->aux->priv_stack_ptr = NULL;
                 }
@@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
         if (prog->jited) {
                 struct x64_jit_data *jit_data = prog->aux->jit_data;
                 struct bpf_binary_header *hdr;
+               void __percpu *priv_stack_ptr;
+               u64 *guard_ptr;
+               int cpu;
  
                 /*
                  * If we fail the final pass of JIT (from jit_subprogs),
@@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
                 prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
                 hdr = bpf_jit_binary_pack_hdr(prog);
                 bpf_jit_binary_pack_free(hdr, NULL);
-               free_percpu(prog->aux->priv_stack_ptr);
+
+               priv_stack_ptr = prog->aux->priv_stack_ptr;
+               if (priv_stack_ptr) {
+                       int stack_size;
+
+                       stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
+                       for_each_possible_cpu(cpu) {
+                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
+                               if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
+                                       pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
+                       }
+                       free_percpu(priv_stack_ptr);
+               }
+
                 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
         }

This fixed the issue Alexei discovered.

A 16-byte guard space is allocated since the allocation is done 16-byte aligned
with a multiple-of-16 size. If a bpf program overflows the stack (change '#if 0' to '#if 1'
in the above change), we will see:

[root@arch-fb-vm1 bpf]# ./test_progs -n 336
[   28.447390] bpf_testmod: loading out-of-tree module taints kernel.
[   28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
#336/1   struct_ops_private_stack/private_stack:OK
#336/2   struct_ops_private_stack/private_stack_fail:OK
#336/3   struct_ops_private_stack/private_stack_recur:OK
#336     struct_ops_private_stack:OK
Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
[   28.737710] Private stack Overflow happened for prog Fx
[   28.739284] Private stack Overflow happened for prog Fx
[   28.968732] Private stack Overflow happened for prog Fx

Here the func name is 'Fx' (representing the sub prog). We might need
to add more meaningful info (e.g. the main prog name) to make the message
more meaningful.

I added some changes related to kasan. If I made a change to the guard space in the kernel (not in the bpf prog),
kasan can print out the error message properly. But unfortunately, in the jit, there is no
kasan instrumentation, so the warning (with the "#if 1" change) is not reported. One possibility is that,
if the kernel config enables kasan, the bpf jit could add kasan instrumentation to the jited binary. I am not sure of the
complexity and whether it is worthwhile or not, since supposedly the verifier should already
prevent overflow and we already have a guard check ("Private stack overflow happened ...")
in the jit.

> Then let's figure out a way how to test for such things and
> what we can do to make kasan detect it sooner,
> since above crashes have no indication at all that bpf priv stack
> is responsible.
> If there is another bug in priv stack and it will cause future
> crashes we need to make sure that priv stack corruption is
> detected by kasan (or whatever mechanism) earlier.
>
> We cannot land private stack support when there is
> a possibility of such silent corruption.
>
> pw-bot: cr
Alexei Starovoitov Nov. 12, 2024, 1:29 a.m. UTC | #4
On Mon, Nov 11, 2024 at 3:18 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>
>
> On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
> > On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>
> >>          stack_depth = bpf_prog->aux->stack_depth;
> >> +       if (bpf_prog->aux->priv_stack_ptr) {
> >> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> >> +               stack_depth = 0;
> >> +       }
> > ...
> >
> >> +       priv_stack_ptr = prog->aux->priv_stack_ptr;
> >> +       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> >> +               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> > After applying I started to see crashes running test_progs -j like:
> >
> > [  173.465191] Oops: general protection fault, probably for
> > non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
> > [  173.466053] KASAN: probably user-memory-access in range
> > [0x00000000000057c8-0x00000000000057cf]
> > [  173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
> > [  173.466053] Call Trace:
> > [  173.466053]  <IRQ>
> > [  173.466053]  ? die_addr+0x40/0xa0
> > [  173.466053]  ? exc_general_protection+0x138/0x1f0
> > [  173.466053]  ? asm_exc_general_protection+0x26/0x30
> > [  173.466053]  ? dst_dev_put+0x1e/0x220
> > [  173.466053]  rt_fibinfo_free_cpus.part.0+0x8c/0x130
> > [  173.466053]  fib_nh_common_release+0xd6/0x2a0
> > [  173.466053]  free_fib_info_rcu+0x129/0x360
> > [  173.466053]  ? rcu_core+0xa55/0x1340
> > [  173.466053]  rcu_core+0xa55/0x1340
> > [  173.466053]  ? rcutree_report_cpu_dead+0x380/0x380
> > [  173.466053]  ? hrtimer_interrupt+0x319/0x7c0
> > [  173.466053]  handle_softirqs+0x14c/0x4d0
> >
> > [   35.134115] Oops: general protection fault, probably for
> > non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
> > [   35.135089] KASAN: probably user-memory-access in range
> > [0x00007fff880fdde0-0x00007fff880fdde7]
> > [   35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
> > [   35.135089] Call Trace:
> > [   35.135089]  <TASK>
> > [   35.135089]  ? die_addr+0x40/0xa0
> > [   35.135089]  ? exc_general_protection+0x138/0x1f0
> > [   35.135089]  ? asm_exc_general_protection+0x26/0x30
> > [   35.135089]  ? destroy_workqueue+0x4b4/0xa70
> > [   35.135089]  ? destroy_workqueue+0x592/0xa70
> > [   35.135089]  ? __mutex_unlock_slowpath.isra.0+0x270/0x270
> > [   35.135089]  ext4_put_super+0xff/0xd70
> > [   35.135089]  generic_shutdown_super+0x148/0x4c0
> > [   35.135089]  kill_block_super+0x3b/0x90
> > [   35.135089]  ext4_kill_sb+0x65/0x90
> >
> > I think I see the bug... quoted it above...
> >
> > Please make sure you reproduce it first.
>
> Indeed, to use the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
> indeed fixed the problem.
>
> The following is additional change on top of this patch set to
>    - fix the memory access bug as suggested by Alexei in the above
>    - Add guard space for private stack, additional 16 bytes at the
>      end of stack will be the guard space. The content is prepopulated
>      and checked at per cpu private stack free site. If the content
>      is not expected, a kernel message will emit.
>    - Add kasan support for guard space.
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 55556a64f776..d796d419bb48 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
>   #define LOAD_TAIL_CALL_CNT_PTR(stack)                          \
>          __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
>
> +#define PRIV_STACK_GUARD_SZ    16
> +#define PRIV_STACK_GUARD_VAL   0xEB9F1234eb9f1234ULL
> +
>   static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
>                    int oldproglen, struct jit_context *ctx, bool jmp_padding)
>   {
> @@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>          u8 *prog = temp;
>          u32 stack_depth;
>          int err;
> +       // int stack_size;
>
>          stack_depth = bpf_prog->aux->stack_depth;
>          if (bpf_prog->aux->priv_stack_ptr) {
> -               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>                  stack_depth = 0;

Since priv stack is not really a stack there is no need to align it to 16
and no need to round_up() either.
let's drop these parts and it will simplify the code.

Re: GUARD_SZ.
I think it's better to guard top and bottom.
8 byte for each will do.

>          }
>
> @@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                  emit_mov_imm64(&prog, X86_REG_R12,
>                                 arena_vm_start >> 32, (u32) arena_vm_start);
>
> -       if (priv_frame_ptr)
> +       if (priv_frame_ptr) {
> +#if 0
> +               /* hack to emit and write some data to guard area */
> +               emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
> +
> +               /* See case BPF_ST | BPF_MEM | BPF_W */
> +               EMIT2(0x41, 0xC7);
> +               EMIT2(add_1reg(0x40, X86_REG_R9), 0);
> +               EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
> +#endif
>                  emit_priv_frame_ptr(&prog, priv_frame_ptr);
> +       }
>
>          ilen = prog - temp;
>          if (rw_image)
> @@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>          struct x64_jit_data *jit_data;
>          int proglen, oldproglen = 0;
>          struct jit_context ctx = {};
> +       int priv_stack_size, cpu;
>          bool tmp_blinded = false;
>          bool extra_pass = false;
>          bool padding = false;
>          u8 *rw_image = NULL;
>          u8 *image = NULL;
> +       u64 *guard_ptr;
>          int *addrs;
>          int pass;
>          int i;
> @@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>          }
>          priv_stack_ptr = prog->aux->priv_stack_ptr;
>          if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
> -               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
> +               priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
> +               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
>                  if (!priv_stack_ptr) {
>                          prog = orig_prog;
>                          goto out_priv_stack;
>                  }
> +               for_each_possible_cpu(cpu) {
> +                       guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> +                       guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
> +                       kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);

with top and bottom guards there will be two calls to poison.

But did you check that percpu allocs come from the vmalloc region?
Does kasan_poison_vmalloc() actually work or silently nop ?

> +               }
>                  prog->aux->priv_stack_ptr = priv_stack_ptr;
>          }
>          addrs = jit_data->addrs;
> @@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>   out_addrs:
>                  kvfree(addrs);
>                  if (!image && priv_stack_ptr) {
> +                       for_each_possible_cpu(cpu) {
> +                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> +                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
> +                       }
>                          free_percpu(priv_stack_ptr);
>                          prog->aux->priv_stack_ptr = NULL;
>                  }
> @@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
>          if (prog->jited) {
>                  struct x64_jit_data *jit_data = prog->aux->jit_data;
>                  struct bpf_binary_header *hdr;
> +               void __percpu *priv_stack_ptr;
> +               u64 *guard_ptr;
> +               int cpu;
>
>                  /*
>                   * If we fail the final pass of JIT (from jit_subprogs),
> @@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
>                  prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
>                  hdr = bpf_jit_binary_pack_hdr(prog);
>                  bpf_jit_binary_pack_free(hdr, NULL);
> -               free_percpu(prog->aux->priv_stack_ptr);
> +
> +               priv_stack_ptr = prog->aux->priv_stack_ptr;
> +               if (priv_stack_ptr) {
> +                       int stack_size;
> +
> +                       stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
> +                       for_each_possible_cpu(cpu) {
> +                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
> +                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
> +                               if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
> +                                       pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
> +                       }

Common helper is needed to check guards before free_percpu.

> +                       free_percpu(priv_stack_ptr);
> +               }
> +
>                  WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
>          }
>
> This fixed the issue Alexei discovered.
>
> 16 bytes guard space is allocated since allocation is done with 16byte aligned
> with multiple-16 size. If bpf program overflows the stack (change '#if 0' to '#if 1')
> in the above change, we will see:
>
> [root@arch-fb-vm1 bpf]# ./test_progs -n 336
> [   28.447390] bpf_testmod: loading out-of-tree module taints kernel.
> [   28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
> #336/1   struct_ops_private_stack/private_stack:OK
> #336/2   struct_ops_private_stack/private_stack_fail:OK
> #336/3   struct_ops_private_stack/private_stack_recur:OK
> #336     struct_ops_private_stack:OK
> Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
> [   28.737710] Private stack Overflow happened for prog Fx
> [   28.739284] Private stack Overflow happened for prog Fx
> [   28.968732] Private stack Overflow happened for prog Fx
>
> Here the func name is 'Fx' (representing the sub prog). We might need
> to add more meaningful info (e.g. main prog name) to make message more
> meaningful.

Probably worth introducing a helper like:

const char *bpf_get_prog_name(struct bpf_prog *prog)
{
  if (prog->aux->ksym.prog)
    return prog->aux->ksym.name;
  return prog->aux->name;
}


>
> I added some changes related kasan. If I made a change to guard space in kernel (not in bpf prog),
> the kasan can print out the error message properly. But unfortunately, in jit, there is no
> kasan instrumentation so warning (with "#if 1" change) is not reported. One possibility is
> if kernel config enables kasan, bpf jit could add kasan to jited binary. Not sure the
> complexity and whether it is worthwhile or not since supposedly verifier should already
> prevent overflow and we already have a guard check (Private stack overflow happened ...)
> in jit.

As a follow-up we should teach the JIT to emit calls to __asan_loadN/storeN
when processing LDX/STX.
imo it's been long overdue.
Yonghong Song Nov. 12, 2024, 3:42 a.m. UTC | #5
On 11/11/24 5:29 PM, Alexei Starovoitov wrote:
> On Mon, Nov 11, 2024 at 3:18 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>
>>
>> On 11/9/24 12:14 PM, Alexei Starovoitov wrote:
>>> On Fri, Nov 8, 2024 at 6:53 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>           stack_depth = bpf_prog->aux->stack_depth;
>>>> +       if (bpf_prog->aux->priv_stack_ptr) {
>>>> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>>>> +               stack_depth = 0;
>>>> +       }
>>> ...
>>>
>>>> +       priv_stack_ptr = prog->aux->priv_stack_ptr;
>>>> +       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>>>> +               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
>>> After applying I started to see crashes running test_progs -j like:
>>>
>>> [  173.465191] Oops: general protection fault, probably for
>>> non-canonical address 0xdffffc0000000af9: 0000 [#1] PREEMPT SMP KASAN
>>> [  173.466053] KASAN: probably user-memory-access in range
>>> [0x00000000000057c8-0x00000000000057cf]
>>> [  173.466053] RIP: 0010:dst_dev_put+0x1e/0x220
>>> [  173.466053] Call Trace:
>>> [  173.466053]  <IRQ>
>>> [  173.466053]  ? die_addr+0x40/0xa0
>>> [  173.466053]  ? exc_general_protection+0x138/0x1f0
>>> [  173.466053]  ? asm_exc_general_protection+0x26/0x30
>>> [  173.466053]  ? dst_dev_put+0x1e/0x220
>>> [  173.466053]  rt_fibinfo_free_cpus.part.0+0x8c/0x130
>>> [  173.466053]  fib_nh_common_release+0xd6/0x2a0
>>> [  173.466053]  free_fib_info_rcu+0x129/0x360
>>> [  173.466053]  ? rcu_core+0xa55/0x1340
>>> [  173.466053]  rcu_core+0xa55/0x1340
>>> [  173.466053]  ? rcutree_report_cpu_dead+0x380/0x380
>>> [  173.466053]  ? hrtimer_interrupt+0x319/0x7c0
>>> [  173.466053]  handle_softirqs+0x14c/0x4d0
>>>
>>> [   35.134115] Oops: general protection fault, probably for
>>> non-canonical address 0xe0000bfff101fbbc: 0000 [#1] PREEMPT SMP KASAN
>>> [   35.135089] KASAN: probably user-memory-access in range
>>> [0x00007fff880fdde0-0x00007fff880fdde7]
>>> [   35.135089] RIP: 0010:destroy_workqueue+0x4b4/0xa70
>>> [   35.135089] Call Trace:
>>> [   35.135089]  <TASK>
>>> [   35.135089]  ? die_addr+0x40/0xa0
>>> [   35.135089]  ? exc_general_protection+0x138/0x1f0
>>> [   35.135089]  ? asm_exc_general_protection+0x26/0x30
>>> [   35.135089]  ? destroy_workqueue+0x4b4/0xa70
>>> [   35.135089]  ? destroy_workqueue+0x592/0xa70
>>> [   35.135089]  ? __mutex_unlock_slowpath.isra.0+0x270/0x270
>>> [   35.135089]  ext4_put_super+0xff/0xd70
>>> [   35.135089]  generic_shutdown_super+0x148/0x4c0
>>> [   35.135089]  kill_block_super+0x3b/0x90
>>> [   35.135089]  ext4_kill_sb+0x65/0x90
>>>
>>> I think I see the bug... quoted it above...
>>>
>>> Please make sure you reproduce it first.
>> Indeed, to use the allocation size round_up(stack_depth, 16) for __alloc_percpu_gfp()
>> indeed fixed the problem.
>>
>> The following is additional change on top of this patch set to
>>     - fix the memory access bug as suggested by Alexei in the above
>>     - Add guard space for private stack, additional 16 bytes at the
>>       end of stack will be the guard space. The content is prepopulated
>>       and checked at per cpu private stack free site. If the content
>>       is not expected, a kernel message will emit.
>>     - Add kasan support for guard space.
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 55556a64f776..d796d419bb48 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -1446,6 +1446,9 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
>>    #define LOAD_TAIL_CALL_CNT_PTR(stack)                          \
>>           __LOAD_TCC_PTR(BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
>>
>> +#define PRIV_STACK_GUARD_SZ    16
>> +#define PRIV_STACK_GUARD_VAL   0xEB9F1234eb9f1234ULL
>> +
>>    static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
>>                     int oldproglen, struct jit_context *ctx, bool jmp_padding)
>>    {
>> @@ -1462,10 +1465,11 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>>           u8 *prog = temp;
>>           u32 stack_depth;
>>           int err;
>> +       // int stack_size;
>>
>>           stack_depth = bpf_prog->aux->stack_depth;
>>           if (bpf_prog->aux->priv_stack_ptr) {
>> -               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
>> +               priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>>                   stack_depth = 0;
> Since priv stack is not really a stack there is no need to align it to 16
> and no need to round_up() either.
> let's drop these parts and it will simplify the code.
>
> Re: GUARD_SZ.
> I think it's better to guard top and bottom.
> 8 byte for each will do.

Makes sense for both. I will align to 8 bytes.

>
>>           }
>>
>> @@ -1496,8 +1500,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>>                   emit_mov_imm64(&prog, X86_REG_R12,
>>                                  arena_vm_start >> 32, (u32) arena_vm_start);
>>
>> -       if (priv_frame_ptr)
>> +       if (priv_frame_ptr) {
>> +#if 0
>> +               /* hack to emit and write some data to guard area */
>> +               emit_priv_frame_ptr(&prog, bpf_prog->aux->priv_stack_ptr);
>> +
>> +               /* See case BPF_ST | BPF_MEM | BPF_W */
>> +               EMIT2(0x41, 0xC7);
>> +               EMIT2(add_1reg(0x40, X86_REG_R9), 0);
>> +               EMIT(0xFFFFFFFF, bpf_size_to_x86_bytes(BPF_W));
>> +#endif
>>                   emit_priv_frame_ptr(&prog, priv_frame_ptr);
>> +       }
>>
>>           ilen = prog - temp;
>>           if (rw_image)
>> @@ -3383,11 +3397,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>           struct x64_jit_data *jit_data;
>>           int proglen, oldproglen = 0;
>>           struct jit_context ctx = {};
>> +       int priv_stack_size, cpu;
>>           bool tmp_blinded = false;
>>           bool extra_pass = false;
>>           bool padding = false;
>>           u8 *rw_image = NULL;
>>           u8 *image = NULL;
>> +       u64 *guard_ptr;
>>           int *addrs;
>>           int pass;
>>           int i;
>> @@ -3418,11 +3434,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>           }
>>           priv_stack_ptr = prog->aux->priv_stack_ptr;
>>           if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
>> -               priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
>> +               priv_stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>> +               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_size, 16, GFP_KERNEL);
>>                   if (!priv_stack_ptr) {
>>                           prog = orig_prog;
>>                           goto out_priv_stack;
>>                   }
>> +               for_each_possible_cpu(cpu) {
>> +                       guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> +                       guard_ptr[0] = guard_ptr[1] = PRIV_STACK_GUARD_VAL;
>> +                       kasan_poison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ);
> with top and bottom guards there will be two calls to poison.
>
> But did you check that percpu allocs come from the vmalloc region?
> Does kasan_poison_vmalloc() actually work or silently nop ?

I tried again; it is a silent nop. So later, when we add kasan support for
bpf progs, we will need to tackle this as well.

>
>> +               }
>>                   prog->aux->priv_stack_ptr = priv_stack_ptr;
>>           }
>>           addrs = jit_data->addrs;
>> @@ -3561,6 +3583,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
>>    out_addrs:
>>                   kvfree(addrs);
>>                   if (!image && priv_stack_ptr) {
>> +                       for_each_possible_cpu(cpu) {
>> +                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> +                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
>> +                       }
>>                           free_percpu(priv_stack_ptr);
>>                           prog->aux->priv_stack_ptr = NULL;
>>                   }
>> @@ -3603,6 +3629,9 @@ void bpf_jit_free(struct bpf_prog *prog)
>>           if (prog->jited) {
>>                   struct x64_jit_data *jit_data = prog->aux->jit_data;
>>                   struct bpf_binary_header *hdr;
>> +               void __percpu *priv_stack_ptr;
>> +               u64 *guard_ptr;
>> +               int cpu;
>>
>>                   /*
>>                    * If we fail the final pass of JIT (from jit_subprogs),
>> @@ -3618,7 +3647,21 @@ void bpf_jit_free(struct bpf_prog *prog)
>>                   prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
>>                   hdr = bpf_jit_binary_pack_hdr(prog);
>>                   bpf_jit_binary_pack_free(hdr, NULL);
>> -               free_percpu(prog->aux->priv_stack_ptr);
>> +
>> +               priv_stack_ptr = prog->aux->priv_stack_ptr;
>> +               if (priv_stack_ptr) {
>> +                       int stack_size;
>> +
>> +                       stack_size = round_up(prog->aux->stack_depth, 16) + PRIV_STACK_GUARD_SZ;
>> +                       for_each_possible_cpu(cpu) {
>> +                               guard_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
>> +                               kasan_unpoison_vmalloc(guard_ptr, PRIV_STACK_GUARD_SZ, KASAN_VMALLOC_PROT_NORMAL);
>> +                               if (guard_ptr[0] != PRIV_STACK_GUARD_VAL || guard_ptr[1] != PRIV_STACK_GUARD_VAL)
>> +                                       pr_err("Private stack Overflow happened for prog %sx\n", prog->aux->name);
>> +                       }
> Common helper is needed to check guards before free_percpu.

Ack.

>
>> +                       free_percpu(priv_stack_ptr);
>> +               }
>> +
>>                   WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
>>           }
>>
>> This fixed the issue Alexei discovered.
>>
>> 16 bytes guard space is allocated since allocation is done with 16byte aligned
>> with multiple-16 size. If bpf program overflows the stack (change '#if 0' to '#if 1')
>> in the above change, we will see:
>>
>> [root@arch-fb-vm1 bpf]# ./test_progs -n 336
>> [   28.447390] bpf_testmod: loading out-of-tree module taints kernel.
>> [   28.448180] bpf_testmod: module verification failed: signature and/or required key missing - tainting kernel
>> #336/1   struct_ops_private_stack/private_stack:OK
>> #336/2   struct_ops_private_stack/private_stack_fail:OK
>> #336/3   struct_ops_private_stack/private_stack_recur:OK
>> #336     struct_ops_private_stack:OK
>> Summary: 1/3 PASSED, 0 SKIPPED, 0 FAILED
>> [   28.737710] Private stack Overflow happened for prog Fx
>> [   28.739284] Private stack Overflow happened for prog Fx
>> [   28.968732] Private stack Overflow happened for prog Fx
>>
>> Here the func name is 'Fx' (representing the sub prog). We might need
>> to add more meaningful info (e.g. main prog name) to make message more
>> meaningful.
> Probably worth introducing a helper like:
>
> const char *bpf_get_prog_name(prog)
> {
>    if (fp->aux->ksym.prog)
>      return prog->aux->ksym.name;
>    return prog->aux->name;
> }

Looks good. Thanks for the suggestion.

>
>
>> I added some changes related kasan. If I made a change to guard space in kernel (not in bpf prog),
>> the kasan can print out the error message properly. But unfortunately, in jit, there is no
>> kasan instrumentation so warning (with "#if 1" change) is not reported. One possibility is
>> if kernel config enables kasan, bpf jit could add kasan to jited binary. Not sure the
>> complexity and whether it is worthwhile or not since supposedly verifier should already
>> prevent overflow and we already have a guard check (Private stack overflow happened ...)
>> in jit.
> As a follow up we should teach JIT to emit calls __asan_loadN/storeN
> when processing LDX/STX.
> imo it's been long overdue.

I will fix the random crash issue and add the guard support.
I will do a follow-up for JIT kasan support.
diff mbox series

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3ff638c37999..55556a64f776 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -325,6 +325,22 @@  struct jit_context {
 /* Number of bytes that will be skipped on tailcall */
 #define X86_TAIL_CALL_OFFSET	(12 + ENDBR_INSN_SIZE)
 
+static void push_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x51);   /* push r9 */
+	*pprog = prog;
+}
+
+static void pop_r9(u8 **pprog)
+{
+	u8 *prog = *pprog;
+
+	EMIT2(0x41, 0x59);   /* pop r9 */
+	*pprog = prog;
+}
+
 static void push_r12(u8 **pprog)
 {
 	u8 *prog = *pprog;
@@ -1404,6 +1420,24 @@  static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
 	*pprog = prog;
 }
 
+static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
+{
+	u8 *prog = *pprog;
+
+	/* movabs r9, priv_frame_ptr */
+	emit_mov_imm64(&prog, X86_REG_R9, (__force long) priv_frame_ptr >> 32,
+		       (u32) (__force long) priv_frame_ptr);
+
+#ifdef CONFIG_SMP
+	/* add <r9>, gs:[<off>] */
+	EMIT2(0x65, 0x4c);
+	EMIT3(0x03, 0x0c, 0x25);
+	EMIT((u32)(unsigned long)&this_cpu_off, 4);
+#endif
+
+	*pprog = prog;
+}
+
 #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
 
 #define __LOAD_TCC_PTR(off)			\
@@ -1421,6 +1455,7 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int insn_cnt = bpf_prog->len;
 	bool seen_exit = false;
 	u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+	void __percpu *priv_frame_ptr = NULL;
 	u64 arena_vm_start, user_vm_start;
 	int i, excnt = 0;
 	int ilen, proglen = 0;
@@ -1429,6 +1464,10 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 	int err;
 
 	stack_depth = bpf_prog->aux->stack_depth;
+	if (bpf_prog->aux->priv_stack_ptr) {
+		priv_frame_ptr = bpf_prog->aux->priv_stack_ptr + round_up(stack_depth, 16);
+		stack_depth = 0;
+	}
 
 	arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
 	user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
@@ -1457,6 +1496,9 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		emit_mov_imm64(&prog, X86_REG_R12,
 			       arena_vm_start >> 32, (u32) arena_vm_start);
 
+	if (priv_frame_ptr)
+		emit_priv_frame_ptr(&prog, priv_frame_ptr);
+
 	ilen = prog - temp;
 	if (rw_image)
 		memcpy(rw_image + proglen, temp, ilen);
@@ -1476,6 +1518,14 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 *func;
 		int nops;
 
+		if (priv_frame_ptr) {
+			if (src_reg == BPF_REG_FP)
+				src_reg = X86_REG_R9;
+
+			if (dst_reg == BPF_REG_FP)
+				dst_reg = X86_REG_R9;
+		}
+
 		switch (insn->code) {
 			/* ALU */
 		case BPF_ALU | BPF_ADD | BPF_X:
@@ -2136,9 +2186,15 @@  st:			if (is_imm8(insn->off))
 			}
 			if (!imm32)
 				return -EINVAL;
+			if (priv_frame_ptr) {
+				push_r9(&prog);
+				ip += 2;
+			}
 			ip += x86_call_depth_emit_accounting(&prog, func, ip);
 			if (emit_call(&prog, func, ip))
 				return -EINVAL;
+			if (priv_frame_ptr)
+				pop_r9(&prog);
 			break;
 		}
 
@@ -3323,6 +3379,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	struct bpf_binary_header *rw_header = NULL;
 	struct bpf_binary_header *header = NULL;
 	struct bpf_prog *tmp, *orig_prog = prog;
+	void __percpu *priv_stack_ptr = NULL;
 	struct x64_jit_data *jit_data;
 	int proglen, oldproglen = 0;
 	struct jit_context ctx = {};
@@ -3359,6 +3416,15 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		}
 		prog->aux->jit_data = jit_data;
 	}
+	priv_stack_ptr = prog->aux->priv_stack_ptr;
+	if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+		priv_stack_ptr = __alloc_percpu_gfp(prog->aux->stack_depth, 16, GFP_KERNEL);
+		if (!priv_stack_ptr) {
+			prog = orig_prog;
+			goto out_priv_stack;
+		}
+		prog->aux->priv_stack_ptr = priv_stack_ptr;
+	}
 	addrs = jit_data->addrs;
 	if (addrs) {
 		ctx = jit_data->ctx;
@@ -3494,6 +3560,11 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 			bpf_prog_fill_jited_linfo(prog, addrs + 1);
 out_addrs:
 		kvfree(addrs);
+		if (!image && priv_stack_ptr) {
+			free_percpu(priv_stack_ptr);
+			prog->aux->priv_stack_ptr = NULL;
+		}
+out_priv_stack:
 		kfree(jit_data);
 		prog->aux->jit_data = NULL;
 	}
@@ -3547,6 +3618,7 @@  void bpf_jit_free(struct bpf_prog *prog)
 		prog->bpf_func = (void *)prog->bpf_func - cfi_get_offset();
 		hdr = bpf_jit_binary_pack_hdr(prog);
 		bpf_jit_binary_pack_free(hdr, NULL);
+		free_percpu(prog->aux->priv_stack_ptr);
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
 	}
 
@@ -3562,6 +3634,11 @@  bool bpf_jit_supports_exceptions(void)
 	return IS_ENABLED(CONFIG_UNWINDER_ORC);
 }
 
+bool bpf_jit_supports_private_stack(void)
+{
+	return true;
+}
+
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 #if defined(CONFIG_UNWINDER_ORC)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 15f20d508174..9cfb8f55d691 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1507,6 +1507,7 @@  struct bpf_prog_aux {
 	u32 max_rdwr_access;
 	struct btf *attach_btf;
 	const struct bpf_ctx_arg_aux *ctx_arg_info;
+	void __percpu *priv_stack_ptr;
 	struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */
 	struct bpf_prog *dst_prog;
 	struct bpf_trampoline *dst_trampoline;