diff mbox series

[bpf-next,v6,1/9] bpf: Allow each subprog having stack size of 512 bytes

Message ID 20241020191347.2105090-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf: Support private stack for bpf progs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 206 this patch: 206
netdev/build_tools success Errors and warnings before: 0 (+1) this patch: 0 (+1)
netdev/cc_maintainers warning 8 maintainers not CCed: song@kernel.org haoluo@google.com john.fastabend@gmail.com sdf@fomichev.me martin.lau@linux.dev kpsingh@kernel.org eddyz87@gmail.com jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 257 this patch: 257
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6962 this patch: 6962
netdev/checkpatch warning CHECK: multiple assignments should be avoided WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18

Commit Message

Yonghong Song Oct. 20, 2024, 7:13 p.m. UTC
With private stack support, each subprog can have a stack of up to 512
bytes. The limit of 512 bytes per subprog is kept to avoid increasing
verifier complexity, since allowing more than 512 bytes would require big
verifier changes and increase memory consumption and verification time.

If private stack is supported, for a bpf prog, esp. when it has
subprogs, private stack will be allocated for the main prog
and for each callback subprog. For example,
  main_prog
    subprog1
      calling helper
        subprog10 (callback func)
          subprog11
    subprog2
      calling helper
        subprog10 (callback func)
          subprog11

Separate private allocations for main_prog and callback_fn subprog10
will make things easier since the helper function uses the kernel stack.

In this patch, some tracing programs are allowed to use a private
stack, since a tracing prog may be triggered in the middle of some other
prog's run. Additional subprog info is also collected so that a private
stack can later be allocated for the main prog and each callback function.

Note that if any tail_call is called in the prog (including all subprogs),
then private stack is not used.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h          |   1 +
 include/linux/bpf_verifier.h |   3 ++
 include/linux/filter.h       |   1 +
 kernel/bpf/core.c            |   5 ++
 kernel/bpf/verifier.c        | 100 ++++++++++++++++++++++++++++++-----
 5 files changed, 97 insertions(+), 13 deletions(-)

Comments

Alexei Starovoitov Oct. 22, 2024, 1:18 a.m. UTC | #1
On Sun, Oct 20, 2024 at 12:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> With private stack support, each subprog can have stack with up to 512
> bytes. The limit of 512 bytes per subprog is kept to avoid increasing
> verifier complexity since greater than 512 bytes will cause big verifier
> change and increase memory consumption and verification time.
>
> If private stack is supported, for a bpf prog, esp. when it has
> subprogs, private stack will be allocated for the main prog
> and for each callback subprog. For example,
>   main_prog
>     subprog1
>       calling helper
>         subprog10 (callback func)
>           subprog11
>     subprog2
>       calling helper
>         subprog10 (callback func)
>           subprog11
>
> Separate private allocations for main_prog and callback_fn subprog10
> will make things easier since the helper function uses the kernel stack.
>
> In this patch, some tracing programs are allowed to use private
> stack since tracing prog may be triggered in the middle of some other
> prog runs. Additional subprog info is also collected for later to
> allocate private stack for main prog and each callback functions.
>
> Note that if any tail_call is called in the prog (including all subprogs),
> then private stack is not used.
>
> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
> ---
>  include/linux/bpf.h          |   1 +
>  include/linux/bpf_verifier.h |   3 ++
>  include/linux/filter.h       |   1 +
>  kernel/bpf/core.c            |   5 ++
>  kernel/bpf/verifier.c        | 100 ++++++++++++++++++++++++++++++-----
>  5 files changed, 97 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0c216e71cec7..6ad8ace7075a 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -1490,6 +1490,7 @@ struct bpf_prog_aux {
>         bool exception_cb;
>         bool exception_boundary;
>         bool is_extended; /* true if extended by freplace program */
> +       bool priv_stack_eligible;
>         u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>         struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>         struct bpf_arena *arena;
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 4513372c5bc8..bcfe868e3801 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>          * are used for bpf_fastcall spills and fills.
>          */
>         s16 fastcall_stack_off;
> +       u16 subtree_stack_depth;
> +       u16 subtree_top_idx;
>         bool has_tail_call: 1;
>         bool tail_call_reachable: 1;
>         bool has_ld_abs: 1;
> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>         bool args_cached: 1;
>         /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>         bool keep_fastcall_stack: 1;
> +       bool priv_stack_eligible: 1;
>
>         u8 arg_cnt;
>         struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 7d7578a8eac1..3a21947f2fd4 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>  bool bpf_jit_supports_ptr_xchg(void);
>  bool bpf_jit_supports_arena(void);
>  bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
> +bool bpf_jit_supports_private_stack(void);
>  u64 bpf_arch_uaddress_limit(void);
>  void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>  bool bpf_helper_changes_pkt_data(void *func);
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 233ea78f8f1b..14d9288441f2 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>         return false;
>  }
>
> +bool __weak bpf_jit_supports_private_stack(void)
> +{
> +       return false;
> +}
> +
>  void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>  {
>  }
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index f514247ba8ba..45bea4066272 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>
>  #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>
> +#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE        128
> +
>  static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>  static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>  static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
> @@ -5982,6 +5984,41 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>                                            strict);
>  }
>
> +static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
> +{
> +       if (!bpf_jit_supports_private_stack())
> +               return false;
> +
> +       switch (env->prog->type) {
> +       case BPF_PROG_TYPE_KPROBE:
> +       case BPF_PROG_TYPE_TRACEPOINT:
> +       case BPF_PROG_TYPE_PERF_EVENT:
> +       case BPF_PROG_TYPE_RAW_TRACEPOINT:
> +               return true;
> +       case BPF_PROG_TYPE_TRACING:
> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                       return true;
> +               fallthrough;
> +       default:
> +               return false;
> +       }
> +}
> +
> +static bool is_priv_stack_supported(struct bpf_verifier_env *env)
> +{
> +       struct bpf_subprog_info *si = env->subprog_info;
> +       bool has_tail_call = false;
> +
> +       for (int i = 0; i < env->subprog_cnt; i++) {
> +               if (si[i].has_tail_call) {
> +                       has_tail_call = true;
> +                       break;
> +               }
> +       }
> +
> +       return !has_tail_call && bpf_enable_private_stack(env);
> +}
> +
>  static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>  {
>         if (env->prog->jit_requested)
> @@ -5999,16 +6036,21 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>   * Since recursion is prevented by check_cfg() this algorithm
>   * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>   */
> -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
> +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
> +                                        bool check_priv_stack, bool priv_stack_supported)
>  {
>         struct bpf_subprog_info *subprog = env->subprog_info;
>         struct bpf_insn *insn = env->prog->insnsi;
>         int depth = 0, frame = 0, i, subprog_end;
>         bool tail_call_reachable = false;
> +       bool priv_stack_eligible = false;
>         int ret_insn[MAX_CALL_FRAMES];
>         int ret_prog[MAX_CALL_FRAMES];
> -       int j;
> +       int j, subprog_stack_depth;
> +       int orig_idx = idx;
>
> +       if (check_priv_stack)
> +               subprog[idx].subtree_top_idx = idx;
>         i = subprog[idx].start;
>  process_func:
>         /* protect against potential stack overflow that might happen when
> @@ -6030,18 +6072,33 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>          * tailcall will unwind the current stack frame but it will not get rid
>          * of caller's stack as shown on the example above.
>          */
> -       if (idx && subprog[idx].has_tail_call && depth >= 256) {
> +       if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
>                 verbose(env,
>                         "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
>                         depth);
>                 return -EACCES;
>         }
> -       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
> -       if (depth > MAX_BPF_STACK) {
> +       subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
> +       depth += subprog_stack_depth;
> +       if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
>                 verbose(env, "combined stack size of %d calls is %d. Too large\n",
>                         frame + 1, depth);
>                 return -EACCES;
>         }
> +       if (check_priv_stack) {
> +               if (subprog_stack_depth > MAX_BPF_STACK) {
> +                       verbose(env, "stack size of subprog %d is %d. Too large\n",
> +                               idx, subprog_stack_depth);
> +                       return -EACCES;
> +               }
> +
> +               if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
> +                       subprog[orig_idx].priv_stack_eligible = true;
> +                       env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
> +               }
> +               subprog[orig_idx].subtree_stack_depth =
> +                       max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
> +       }
>  continue_func:
>         subprog_end = subprog[idx + 1].start;
>         for (; i < subprog_end; i++) {
> @@ -6078,6 +6135,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                 next_insn = i + insn[i].imm + 1;
>                 sidx = find_subprog(env, next_insn);
>                 if (sidx < 0) {
> +                       /* It is possible that callback func has been removed as dead code after
> +                        * instruction rewrites, e.g. bpf_loop with cnt 0.
> +                        */
> +                       if (check_priv_stack)
> +                               continue;
> +

and this extra hack only because check_max_stack_depth() will
be called the 2nd time ?
Why call it twice at all ?
Record everything in the first pass.

>                         WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
>                                   next_insn);
>                         return -EFAULT;
> @@ -6097,8 +6160,10 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                 }
>                 i = next_insn;
>                 idx = sidx;
> +               if (check_priv_stack)
> +                       subprog[idx].subtree_top_idx = orig_idx;
>
> -               if (subprog[idx].has_tail_call)
> +               if (!check_priv_stack && subprog[idx].has_tail_call)
>                         tail_call_reachable = true;
>
>                 frame++;
> @@ -6122,7 +6187,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>                         }
>                         subprog[ret_prog[j]].tail_call_reachable = true;
>                 }
> -       if (subprog[0].tail_call_reachable)
> +       if (!check_priv_stack && subprog[0].tail_call_reachable)
>                 env->prog->aux->tail_call_reachable = true;
>
>         /* end of for() loop means the last insn of the 'subprog'
> @@ -6137,14 +6202,18 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>         goto continue_func;
>  }
>
> -static int check_max_stack_depth(struct bpf_verifier_env *env)
> +static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
> +                                bool priv_stack_supported)
>  {
>         struct bpf_subprog_info *si = env->subprog_info;
> +       bool check_subprog;
>         int ret;
>
>         for (int i = 0; i < env->subprog_cnt; i++) {
> -               if (!i || si[i].is_async_cb) {
> -                       ret = check_max_stack_depth_subprog(env, i);
> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);

why?
This looks very suspicious.

> +               if (check_subprog) {
> +                       ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
> +                                                           priv_stack_supported);
>                         if (ret < 0)
>                                 return ret;
>                 }
> @@ -22303,7 +22372,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>         struct bpf_verifier_env *env;
>         int i, len, ret = -EINVAL, err;
>         u32 log_true_size;
> -       bool is_priv;
> +       bool is_priv, priv_stack_supported = false;
>
>         /* no program is valid */
>         if (ARRAY_SIZE(bpf_verifier_ops) == 0)
> @@ -22430,8 +22499,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>         if (ret == 0)
>                 ret = remove_fastcall_spills_fills(env);
>
> -       if (ret == 0)
> -               ret = check_max_stack_depth(env);
> +       if (ret == 0) {
> +               priv_stack_supported = is_priv_stack_supported(env);
> +               ret = check_max_stack_depth(env, false, priv_stack_supported);
> +       }
>
>         /* instruction rewrites happen after this point */
>         if (ret == 0)
> @@ -22465,6 +22536,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>                                                                      : false;
>         }
>
> +       if (ret == 0 && priv_stack_supported)
> +               ret = check_max_stack_depth(env, true, true);
> +
>         if (ret == 0)
>                 ret = fixup_call_args(env);
>
> --
> 2.43.5
>
Yonghong Song Oct. 22, 2024, 3:21 a.m. UTC | #2
On 10/21/24 6:18 PM, Alexei Starovoitov wrote:
> On Sun, Oct 20, 2024 at 12:14 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> With private stack support, each subprog can have stack with up to 512
>> bytes. The limit of 512 bytes per subprog is kept to avoid increasing
>> verifier complexity since greater than 512 bytes will cause big verifier
>> change and increase memory consumption and verification time.
>>
>> If private stack is supported, for a bpf prog, esp. when it has
>> subprogs, private stack will be allocated for the main prog
>> and for each callback subprog. For example,
>>    main_prog
>>      subprog1
>>        calling helper
>>          subprog10 (callback func)
>>            subprog11
>>      subprog2
>>        calling helper
>>          subprog10 (callback func)
>>            subprog11
>>
>> Separate private allocations for main_prog and callback_fn subprog10
>> will make things easier since the helper function uses the kernel stack.
>>
>> In this patch, some tracing programs are allowed to use private
>> stack since tracing prog may be triggered in the middle of some other
>> prog runs. Additional subprog info is also collected for later to
>> allocate private stack for main prog and each callback functions.
>>
>> Note that if any tail_call is called in the prog (including all subprogs),
>> then private stack is not used.
>>
>> Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
>> ---
>>   include/linux/bpf.h          |   1 +
>>   include/linux/bpf_verifier.h |   3 ++
>>   include/linux/filter.h       |   1 +
>>   kernel/bpf/core.c            |   5 ++
>>   kernel/bpf/verifier.c        | 100 ++++++++++++++++++++++++++++++-----
>>   5 files changed, 97 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index 0c216e71cec7..6ad8ace7075a 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -1490,6 +1490,7 @@ struct bpf_prog_aux {
>>          bool exception_cb;
>>          bool exception_boundary;
>>          bool is_extended; /* true if extended by freplace program */
>> +       bool priv_stack_eligible;
>>          u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
>>          struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
>>          struct bpf_arena *arena;
>> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
>> index 4513372c5bc8..bcfe868e3801 100644
>> --- a/include/linux/bpf_verifier.h
>> +++ b/include/linux/bpf_verifier.h
>> @@ -659,6 +659,8 @@ struct bpf_subprog_info {
>>           * are used for bpf_fastcall spills and fills.
>>           */
>>          s16 fastcall_stack_off;
>> +       u16 subtree_stack_depth;
>> +       u16 subtree_top_idx;
>>          bool has_tail_call: 1;
>>          bool tail_call_reachable: 1;
>>          bool has_ld_abs: 1;
>> @@ -668,6 +670,7 @@ struct bpf_subprog_info {
>>          bool args_cached: 1;
>>          /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>>          bool keep_fastcall_stack: 1;
>> +       bool priv_stack_eligible: 1;
>>
>>          u8 arg_cnt;
>>          struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
>> diff --git a/include/linux/filter.h b/include/linux/filter.h
>> index 7d7578a8eac1..3a21947f2fd4 100644
>> --- a/include/linux/filter.h
>> +++ b/include/linux/filter.h
>> @@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
>>   bool bpf_jit_supports_ptr_xchg(void);
>>   bool bpf_jit_supports_arena(void);
>>   bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
>> +bool bpf_jit_supports_private_stack(void);
>>   u64 bpf_arch_uaddress_limit(void);
>>   void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
>>   bool bpf_helper_changes_pkt_data(void *func);
>> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
>> index 233ea78f8f1b..14d9288441f2 100644
>> --- a/kernel/bpf/core.c
>> +++ b/kernel/bpf/core.c
>> @@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
>>          return false;
>>   }
>>
>> +bool __weak bpf_jit_supports_private_stack(void)
>> +{
>> +       return false;
>> +}
>> +
>>   void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
>>   {
>>   }
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index f514247ba8ba..45bea4066272 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
>>
>>   #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
>>
>> +#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE        128
>> +
>>   static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
>>   static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
>>   static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
>> @@ -5982,6 +5984,41 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>>                                             strict);
>>   }
>>
>> +static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
>> +{
>> +       if (!bpf_jit_supports_private_stack())
>> +               return false;
>> +
>> +       switch (env->prog->type) {
>> +       case BPF_PROG_TYPE_KPROBE:
>> +       case BPF_PROG_TYPE_TRACEPOINT:
>> +       case BPF_PROG_TYPE_PERF_EVENT:
>> +       case BPF_PROG_TYPE_RAW_TRACEPOINT:
>> +               return true;
>> +       case BPF_PROG_TYPE_TRACING:
>> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> +                       return true;
>> +               fallthrough;
>> +       default:
>> +               return false;
>> +       }
>> +}
>> +
>> +static bool is_priv_stack_supported(struct bpf_verifier_env *env)
>> +{
>> +       struct bpf_subprog_info *si = env->subprog_info;
>> +       bool has_tail_call = false;
>> +
>> +       for (int i = 0; i < env->subprog_cnt; i++) {
>> +               if (si[i].has_tail_call) {
>> +                       has_tail_call = true;
>> +                       break;
>> +               }
>> +       }
>> +
>> +       return !has_tail_call && bpf_enable_private_stack(env);
>> +}
>> +
>>   static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>>   {
>>          if (env->prog->jit_requested)
>> @@ -5999,16 +6036,21 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
>>    * Since recursion is prevented by check_cfg() this algorithm
>>    * only needs a local stack of MAX_CALL_FRAMES to remember callsites
>>    */
>> -static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>> +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>> +                                        bool check_priv_stack, bool priv_stack_supported)
>>   {
>>          struct bpf_subprog_info *subprog = env->subprog_info;
>>          struct bpf_insn *insn = env->prog->insnsi;
>>          int depth = 0, frame = 0, i, subprog_end;
>>          bool tail_call_reachable = false;
>> +       bool priv_stack_eligible = false;
>>          int ret_insn[MAX_CALL_FRAMES];
>>          int ret_prog[MAX_CALL_FRAMES];
>> -       int j;
>> +       int j, subprog_stack_depth;
>> +       int orig_idx = idx;
>>
>> +       if (check_priv_stack)
>> +               subprog[idx].subtree_top_idx = idx;
>>          i = subprog[idx].start;
>>   process_func:
>>          /* protect against potential stack overflow that might happen when
>> @@ -6030,18 +6072,33 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>           * tailcall will unwind the current stack frame but it will not get rid
>>           * of caller's stack as shown on the example above.
>>           */
>> -       if (idx && subprog[idx].has_tail_call && depth >= 256) {
>> +       if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
>>                  verbose(env,
>>                          "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
>>                          depth);
>>                  return -EACCES;
>>          }
>> -       depth += round_up_stack_depth(env, subprog[idx].stack_depth);
>> -       if (depth > MAX_BPF_STACK) {
>> +       subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
>> +       depth += subprog_stack_depth;
>> +       if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
>>                  verbose(env, "combined stack size of %d calls is %d. Too large\n",
>>                          frame + 1, depth);
>>                  return -EACCES;
>>          }
>> +       if (check_priv_stack) {
>> +               if (subprog_stack_depth > MAX_BPF_STACK) {
>> +                       verbose(env, "stack size of subprog %d is %d. Too large\n",
>> +                               idx, subprog_stack_depth);
>> +                       return -EACCES;
>> +               }
>> +
>> +               if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
>> +                       subprog[orig_idx].priv_stack_eligible = true;
>> +                       env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
>> +               }
>> +               subprog[orig_idx].subtree_stack_depth =
>> +                       max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
>> +       }
>>   continue_func:
>>          subprog_end = subprog[idx + 1].start;
>>          for (; i < subprog_end; i++) {
>> @@ -6078,6 +6135,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                  next_insn = i + insn[i].imm + 1;
>>                  sidx = find_subprog(env, next_insn);
>>                  if (sidx < 0) {
>> +                       /* It is possible that callback func has been removed as dead code after
>> +                        * instruction rewrites, e.g. bpf_loop with cnt 0.
>> +                        */
>> +                       if (check_priv_stack)
>> +                               continue;
>> +
> and this extra hack only because check_max_stack_depth() will
> be called the 2nd time ?
> Why call it twice at all ?
> Record everything in the first pass.

The stack size of an individual subprog may increase between check_max_stack_depth() and the jit.
So we have to go through a second pass to compute the precise subtree (prog + subprogs)
stack size, which is needed to allocate the percpu private stack.

One thing we could do is to record the (sub)prog<->subprog relations in the first
pass and right before the jit do another pass to calculate subtree stack size.
I guess that is what you suggest?

>
>>                          WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
>>                                    next_insn);
>>                          return -EFAULT;
>> @@ -6097,8 +6160,10 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                  }
>>                  i = next_insn;
>>                  idx = sidx;
>> +               if (check_priv_stack)
>> +                       subprog[idx].subtree_top_idx = orig_idx;
>>
>> -               if (subprog[idx].has_tail_call)
>> +               if (!check_priv_stack && subprog[idx].has_tail_call)
>>                          tail_call_reachable = true;
>>
>>                  frame++;
>> @@ -6122,7 +6187,7 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>                          }
>>                          subprog[ret_prog[j]].tail_call_reachable = true;
>>                  }
>> -       if (subprog[0].tail_call_reachable)
>> +       if (!check_priv_stack && subprog[0].tail_call_reachable)
>>                  env->prog->aux->tail_call_reachable = true;
>>
>>          /* end of for() loop means the last insn of the 'subprog'
>> @@ -6137,14 +6202,18 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
>>          goto continue_func;
>>   }
>>
>> -static int check_max_stack_depth(struct bpf_verifier_env *env)
>> +static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
>> +                                bool priv_stack_supported)
>>   {
>>          struct bpf_subprog_info *si = env->subprog_info;
>> +       bool check_subprog;
>>          int ret;
>>
>>          for (int i = 0; i < env->subprog_cnt; i++) {
>> -               if (!i || si[i].is_async_cb) {
>> -                       ret = check_max_stack_depth_subprog(env, i);
>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> why?
> This looks very suspicious.

This is to simplify jit. For example,
    main_prog   <=== main_prog_priv_stack_ptr
      subprog1  <=== there is a helper which has a callback_fn
                <=== for example bpf_for_each_map_elem

        callback_fn
          subprog2

In callback_fn, we cannot simply do
    r9 += stack_size_for_callback_fn
since r9 may have been clobbered between subprog1 and callback_fn.
That is why currently I allocate private_stack separately for callback_fn.

Alternatively we could do
    callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
where off equals the stack size of the subtree (main_prog + subprog1).
I can do this approach too with a little more information in prog->aux.
WDYT?

>
>> +               if (check_subprog) {
>> +                       ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
>> +                                                           priv_stack_supported);
>>                          if (ret < 0)
>>                                  return ret;
>>                  }
>> @@ -22303,7 +22372,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>          struct bpf_verifier_env *env;
>>          int i, len, ret = -EINVAL, err;
>>          u32 log_true_size;
>> -       bool is_priv;
>> +       bool is_priv, priv_stack_supported = false;
>>
>>          /* no program is valid */
>>          if (ARRAY_SIZE(bpf_verifier_ops) == 0)
>> @@ -22430,8 +22499,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>          if (ret == 0)
>>                  ret = remove_fastcall_spills_fills(env);
>>
>> -       if (ret == 0)
>> -               ret = check_max_stack_depth(env);
>> +       if (ret == 0) {
>> +               priv_stack_supported = is_priv_stack_supported(env);
>> +               ret = check_max_stack_depth(env, false, priv_stack_supported);
>> +       }
>>
>>          /* instruction rewrites happen after this point */
>>          if (ret == 0)
>> @@ -22465,6 +22536,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
>>                                                                       : false;
>>          }
>>
>> +       if (ret == 0 && priv_stack_supported)
>> +               ret = check_max_stack_depth(env, true, true);
>> +
>>          if (ret == 0)
>>                  ret = fixup_call_args(env);
>>
>> --
>> 2.43.5
>>
Alexei Starovoitov Oct. 22, 2024, 3:43 a.m. UTC | #3
On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> >>          for (int i = 0; i < env->subprog_cnt; i++) {
> >> -               if (!i || si[i].is_async_cb) {
> >> -                       ret = check_max_stack_depth_subprog(env, i);
> >> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> > why?
> > This looks very suspicious.
>
> This is to simplify jit. For example,
>     main_prog   <=== main_prog_priv_stack_ptr
>       subprog1  <=== there is a helper which has a callback_fn
>                 <=== for example bpf_for_each_map_elem
>
>         callback_fn
>           subprog2
>
> In callback_fn, we cannot simplify do
>     r9 += stack_size_for_callback_fn
> since r9 may have been clobbered between subprog1 and callback_fn.
> That is why currently I allocate private_stack separately for callback_fn.
>
> Alternatively we could do
>     callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
> where off equals to (stack size tree main_prog+subprog1).
> I can do this approach too with a little more information in prog->aux.
> WDYT?

I see. I think we're overcomplicating the verifier just to
be able to do 'r9 += stack' in the subprog.
The cases of async vs sync and directly vs kfunc/helper
(and soon with inlining of kfuncs) are getting too hard
to reason about.

I think we need to go back to the earlier approach
where every subprog had its own private stack and was
setting up r9 = my_priv_stack in the prologue.

I suspect it's possible to construct a convoluted subprog
that calls itself a limited number of times and the verifier allows that.
I feel it will be easier to detect just that condition
in the verifier and fallback to the normal stack.
Yonghong Song Oct. 22, 2024, 4:08 a.m. UTC | #4
On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
> On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>           for (int i = 0; i < env->subprog_cnt; i++) {
>>>> -               if (!i || si[i].is_async_cb) {
>>>> -                       ret = check_max_stack_depth_subprog(env, i);
>>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
>>> why?
>>> This looks very suspicious.
>> This is to simplify jit. For example,
>>      main_prog   <=== main_prog_priv_stack_ptr
>>        subprog1  <=== there is a helper which has a callback_fn
>>                  <=== for example bpf_for_each_map_elem
>>
>>          callback_fn
>>            subprog2
>>
>> In callback_fn, we cannot simplify do
>>      r9 += stack_size_for_callback_fn
>> since r9 may have been clobbered between subprog1 and callback_fn.
>> That is why currently I allocate private_stack separately for callback_fn.
>>
>> Alternatively we could do
>>      callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
>> where off equals to (stack size tree main_prog+subprog1).
>> I can do this approach too with a little more information in prog->aux.
>> WDYT?
> I see. I think we're overcomplicating the verifier just to
> be able to do 'r9 += stack' in the subprog.
> The cases of async vs sync and directly vs kfunc/helper
> (and soon with inlining of kfuncs) are getting too hard
> to reason about.
>
> I think we need to go back to the earlier approach
> where every subprog had its own private stack and was
> setting up r9 = my_priv_stack in the prologue.

Indeed, one private stack per prog (subprog) will be much
simpler.

>
> I suspect it's possible to construct a convoluted subprog
> that calls itself a limited amount of time and the verifier allows that.
> I feel it will be easier to detect just that condition
> in the verifier and fallback to the normal stack.

Yes, I think check_max_stack_depth_subprog() should be able to detect 
subprog recursion.
Yonghong Song Oct. 22, 2024, 8:13 p.m. UTC | #5
On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
> On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>           for (int i = 0; i < env->subprog_cnt; i++) {
>>>> -               if (!i || si[i].is_async_cb) {
>>>> -                       ret = check_max_stack_depth_subprog(env, i);
>>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
>>> why?
>>> This looks very suspicious.
>> This is to simplify jit. For example,
>>      main_prog   <=== main_prog_priv_stack_ptr
>>        subprog1  <=== there is a helper which has a callback_fn
>>                  <=== for example bpf_for_each_map_elem
>>
>>          callback_fn
>>            subprog2
>>
>> In callback_fn, we cannot simplify do
>>      r9 += stack_size_for_callback_fn
>> since r9 may have been clobbered between subprog1 and callback_fn.
>> That is why currently I allocate private_stack separately for callback_fn.
>>
>> Alternatively we could do
>>      callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
>> where off equals to (stack size tree main_prog+subprog1).
>> I can do this approach too with a little more information in prog->aux.
>> WDYT?
> I see. I think we're overcomplicating the verifier just to
> be able to do 'r9 += stack' in the subprog.
> The cases of async vs sync and directly vs kfunc/helper
> (and soon with inlining of kfuncs) are getting too hard
> to reason about.
>
> I think we need to go back to the earlier approach
> where every subprog had its own private stack and was
> setting up r9 = my_priv_stack in the prologue.
>
> I suspect it's possible to construct a convoluted subprog
> that calls itself a limited amount of time and the verifier allows that.
> I feel it will be easier to detect just that condition
> in the verifier and fallback to the normal stack.

I tried a simple bpf prog below.

$ cat private_stack_subprog_recur.c
// SPDX-License-Identifier: GPL-2.0

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "../bpf_testmod/bpf_testmod.h"

char _license[] SEC("license") = "GPL";

#if defined(__TARGET_ARCH_x86)
bool skip __attribute((__section__(".data"))) = false;
#else
bool skip = true;
#endif

int i;

__noinline static void subprog1(int level)
{
         if (level > 0) {
                 subprog1(level >> 1);
                 i++;
         }
}

SEC("kprobe")
int prog1(void)
{
         subprog1(1);
         return 0;
}

In the above prog, we have a recursion of subprog1. The
callchain is:
    prog -> subprog1 -> subprog1

The insn-level verification is successful since argument
of subprog1() has precise value.

But eventually, verification failed with the following message:
   the call stack of 8 frames is too deep !

The error message is
                 if (frame >= MAX_CALL_FRAMES) {
                         verbose(env, "the call stack of %d frames is too deep !\n",
                                 frame);
                         return -E2BIG;
                 }
in function check_max_stack_depth_subprog().
Basically in function check_max_stack_depth_subprog(), tracing subprog
call is done only based on call insn. All conditionals are ignored.
In the above example, check_max_stack_depth_subprog() will have the
call graph like
     prog -> subprog1 -> subprog1 -> subprog1 -> subprog1 -> ...
and eventually hit the error.

Basically with check_max_stack_depth_subprog() self recursion is not
possible for a bpf prog.

This limitation dates back to 2017:
   commit 70a87ffea8ac  bpf: fix maximum stack depth tracking logic

So I assume people really do not write progs with self recursion inside
the main prog (including subprogs).
Alexei Starovoitov Oct. 22, 2024, 8:41 p.m. UTC | #6
On Tue, Oct 22, 2024 at 1:13 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
> > On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >>>>           for (int i = 0; i < env->subprog_cnt; i++) {
> >>>> -               if (!i || si[i].is_async_cb) {
> >>>> -                       ret = check_max_stack_depth_subprog(env, i);
> >>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> >>> why?
> >>> This looks very suspicious.
> >> This is to simplify jit. For example,
> >>      main_prog   <=== main_prog_priv_stack_ptr
> >>        subprog1  <=== there is a helper which has a callback_fn
> >>                  <=== for example bpf_for_each_map_elem
> >>
> >>          callback_fn
> >>            subprog2
> >>
> >> In callback_fn, we cannot simplify do
> >>      r9 += stack_size_for_callback_fn
> >> since r9 may have been clobbered between subprog1 and callback_fn.
> >> That is why currently I allocate private_stack separately for callback_fn.
> >>
> >> Alternatively we could do
> >>      callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
> >> where off equals to (stack size tree main_prog+subprog1).
> >> I can do this approach too with a little more information in prog->aux.
> >> WDYT?
> > I see. I think we're overcomplicating the verifier just to
> > be able to do 'r9 += stack' in the subprog.
> > The cases of async vs sync and directly vs kfunc/helper
> > (and soon with inlining of kfuncs) are getting too hard
> > to reason about.
> >
> > I think we need to go back to the earlier approach
> > where every subprog had its own private stack and was
> > setting up r9 = my_priv_stack in the prologue.
> >
> > I suspect it's possible to construct a convoluted subprog
> > that calls itself a limited amount of time and the verifier allows that.
> > I feel it will be easier to detect just that condition
> > in the verifier and fallback to the normal stack.
>
> I tried a simple bpf prog below.
>
> $ cat private_stack_subprog_recur.c
> // SPDX-License-Identifier: GPL-2.0
>
> #include <vmlinux.h>
> #include <bpf/bpf_helpers.h>
> #include <bpf/bpf_tracing.h>
> #include "../bpf_testmod/bpf_testmod.h"
>
> char _license[] SEC("license") = "GPL";
>
> #if defined(__TARGET_ARCH_x86)
> bool skip __attribute((__section__(".data"))) = false;
> #else
> bool skip = true;
> #endif
>
> int i;
>
> __noinline static void subprog1(int level)
> {
>          if (level > 0) {
>                  subprog1(level >> 1);
>                  i++;
>          }
> }
>
> SEC("kprobe")
> int prog1(void)
> {
>          subprog1(1);
>          return 0;
> }
>
> In the above prog, we have a recursion of subprog1. The
> callchain is:
>     prog -> subprog1 -> subprog1
>
> The insn-level verification is successful since argument
> of subprog1() has precise value.
>
> But eventually, verification failed with the following message:
>    the call stack of 8 frames is too deep !
>
> The error message is
>                  if (frame >= MAX_CALL_FRAMES) {
>                          verbose(env, "the call stack of %d frames is too deep !\n",
>                                  frame);
>                          return -E2BIG;
>                  }
> in function check_max_stack_depth_subprog().
> Basically in function check_max_stack_depth_subprog(), tracing subprog
> call is done only based on call insn. All conditionals are ignored.
> In the above example, check_max_stack_depth_subprog() will have the
> call graph like
>      prog -> subprog1 -> subprog1 -> subprog1 -> subprog1 -> ...
> and eventually hit the error.
>
> Basically with check_max_stack_depth_subprog() self recursion is not
> possible for a bpf prog.
>
> This limitation is back to year 2017.
>    commit 70a87ffea8ac  bpf: fix maximum stack depth tracking logic
>
> So I assume people really do not write progs with self recursion inside
> the main prog (including subprogs).

Thanks for checking this part.

What about sync and async callbacks? Can they recurse?

Since progs are preemptible is the following possible:

__noinline static void subprog(void)
{
  /* delay */
}

static int timer_cb(void *map, int *key, void *val)
{
  subprog();
}

SEC("tc")
int prog1(void)
{
    bpf_timer_set_callback(  &timer_cb);
    subprog();
    return 0;
}

timers use softirq.
I'm not sure whether it's the same stack or not.
So it may be borderline ok-ish for other reasons,
but the question remains. Will subprog recurse this way?
Kumar Kartikeya Dwivedi Oct. 22, 2024, 9:29 p.m. UTC | #7
On Tue, 22 Oct 2024 at 22:41, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Oct 22, 2024 at 1:13 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >
> >
> > On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
> > > On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > >>>>           for (int i = 0; i < env->subprog_cnt; i++) {
> > >>>> -               if (!i || si[i].is_async_cb) {
> > >>>> -                       ret = check_max_stack_depth_subprog(env, i);
> > >>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> > >>> why?
> > >>> This looks very suspicious.
> > >> This is to simplify jit. For example,
> > >>      main_prog   <=== main_prog_priv_stack_ptr
> > >>        subprog1  <=== there is a helper which has a callback_fn
> > >>                  <=== for example bpf_for_each_map_elem
> > >>
> > >>          callback_fn
> > >>            subprog2
> > >>
> > >> In callback_fn, we cannot simplify do
> > >>      r9 += stack_size_for_callback_fn
> > >> since r9 may have been clobbered between subprog1 and callback_fn.
> > >> That is why currently I allocate private_stack separately for callback_fn.
> > >>
> > >> Alternatively we could do
> > >>      callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
> > >> where off equals to (stack size tree main_prog+subprog1).
> > >> I can do this approach too with a little more information in prog->aux.
> > >> WDYT?
> > > I see. I think we're overcomplicating the verifier just to
> > > be able to do 'r9 += stack' in the subprog.
> > > The cases of async vs sync and directly vs kfunc/helper
> > > (and soon with inlining of kfuncs) are getting too hard
> > > to reason about.
> > >
> > > I think we need to go back to the earlier approach
> > > where every subprog had its own private stack and was
> > > setting up r9 = my_priv_stack in the prologue.
> > >
> > > I suspect it's possible to construct a convoluted subprog
> > > that calls itself a limited amount of time and the verifier allows that.
> > > I feel it will be easier to detect just that condition
> > > in the verifier and fallback to the normal stack.
> >
> > I tried a simple bpf prog below.
> >
> > $ cat private_stack_subprog_recur.c
> > // SPDX-License-Identifier: GPL-2.0
> >
> > #include <vmlinux.h>
> > #include <bpf/bpf_helpers.h>
> > #include <bpf/bpf_tracing.h>
> > #include "../bpf_testmod/bpf_testmod.h"
> >
> > char _license[] SEC("license") = "GPL";
> >
> > #if defined(__TARGET_ARCH_x86)
> > bool skip __attribute((__section__(".data"))) = false;
> > #else
> > bool skip = true;
> > #endif
> >
> > int i;
> >
> > __noinline static void subprog1(int level)
> > {
> >          if (level > 0) {
> >                  subprog1(level >> 1);
> >                  i++;
> >          }
> > }
> >
> > SEC("kprobe")
> > int prog1(void)
> > {
> >          subprog1(1);
> >          return 0;
> > }
> >
> > In the above prog, we have a recursion of subprog1. The
> > callchain is:
> >     prog -> subprog1 -> subprog1
> >
> > The insn-level verification is successful since argument
> > of subprog1() has precise value.
> >
> > But eventually, verification failed with the following message:
> >    the call stack of 8 frames is too deep !
> >
> > The error message is
> >                  if (frame >= MAX_CALL_FRAMES) {
> >                          verbose(env, "the call stack of %d frames is too deep !\n",
> >                                  frame);
> >                          return -E2BIG;
> >                  }
> > in function check_max_stack_depth_subprog().
> > Basically in function check_max_stack_depth_subprog(), tracing subprog
> > call is done only based on call insn. All conditionals are ignored.
> > In the above example, check_max_stack_depth_subprog() will have the
> > call graph like
> >      prog -> subprog1 -> subprog1 -> subprog1 -> subprog1 -> ...
> > and eventually hit the error.
> >
> > Basically with check_max_stack_depth_subprog() self recursion is not
> > possible for a bpf prog.
> >
> > This limitation is back to year 2017.
> >    commit 70a87ffea8ac  bpf: fix maximum stack depth tracking logic
> >
> > So I assume people really do not write progs with self recursion inside
> > the main prog (including subprogs).
>
> Thanks for checking this part.
>
> What about sync and async callbacks? Can they recurse?
>
> Since progs are preemptible is the following possible:
>
> __noinline static void subprog(void)
> {
>   /* delay */
> }
>
> static int timer_cb(void *map, int *key, void *val)
> {
>   subprog();
> }
>
> SEC("tc")
> int prog1(void)
> {
>     bpf_timer_set_callback(  &timer_cb);
>     subprog();
>     return 0;
> }
>
> timers use softirq.
> I'm not sure whether it's the same stack or not.
> So it may be borderline ok-ish for other reasons,
> but the question remains. Will subprog recurse this way?
>

Yes, but not in the normal ways.
There can be only one softirq context per-CPU (even on preemptible RT
with timers running in kthreads), but timer_cb can also be called
directly by the prog. So any other context the same prog can execute
in will allow it to call timer_cb while another invocation is
potentially preempted out on the same CPU.
It might be better to disallow direct calling such async callbacks,
because I'm not sure anyone relies on that behavior, but it is
something I've previously looked at (for exception_cb, which is
disallowed to be called directly due to the distinct way prologue is
set up).

We'll also need to remember this when/if we introduce hardirq mode for
BPF timers.
Kumar Kartikeya Dwivedi Oct. 22, 2024, 9:36 p.m. UTC | #8
On Tue, 22 Oct 2024 at 23:29, Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Tue, 22 Oct 2024 at 22:41, Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Oct 22, 2024 at 1:13 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > >
> > >
> > > On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
> > > > On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> > > >>>>           for (int i = 0; i < env->subprog_cnt; i++) {
> > > >>>> -               if (!i || si[i].is_async_cb) {
> > > >>>> -                       ret = check_max_stack_depth_subprog(env, i);
> > > >>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
> > > >>> why?
> > > >>> This looks very suspicious.
> > > >> This is to simplify jit. For example,
> > > >>      main_prog   <=== main_prog_priv_stack_ptr
> > > >>        subprog1  <=== there is a helper which has a callback_fn
> > > >>                  <=== for example bpf_for_each_map_elem
> > > >>
> > > >>          callback_fn
> > > >>            subprog2
> > > >>
> > > >> In callback_fn, we cannot simplify do
> > > >>      r9 += stack_size_for_callback_fn
> > > >> since r9 may have been clobbered between subprog1 and callback_fn.
> > > >> That is why currently I allocate private_stack separately for callback_fn.
> > > >>
> > > >> Alternatively we could do
> > > >>      callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
> > > >> where off equals to (stack size tree main_prog+subprog1).
> > > >> I can do this approach too with a little more information in prog->aux.
> > > >> WDYT?
> > > > I see. I think we're overcomplicating the verifier just to
> > > > be able to do 'r9 += stack' in the subprog.
> > > > The cases of async vs sync and directly vs kfunc/helper
> > > > (and soon with inlining of kfuncs) are getting too hard
> > > > to reason about.
> > > >
> > > > I think we need to go back to the earlier approach
> > > > where every subprog had its own private stack and was
> > > > setting up r9 = my_priv_stack in the prologue.
> > > >
> > > > I suspect it's possible to construct a convoluted subprog
> > > > that calls itself a limited amount of time and the verifier allows that.
> > > > I feel it will be easier to detect just that condition
> > > > in the verifier and fallback to the normal stack.
> > >
> > > I tried a simple bpf prog below.
> > >
> > > $ cat private_stack_subprog_recur.c
> > > // SPDX-License-Identifier: GPL-2.0
> > >
> > > #include <vmlinux.h>
> > > #include <bpf/bpf_helpers.h>
> > > #include <bpf/bpf_tracing.h>
> > > #include "../bpf_testmod/bpf_testmod.h"
> > >
> > > char _license[] SEC("license") = "GPL";
> > >
> > > #if defined(__TARGET_ARCH_x86)
> > > bool skip __attribute((__section__(".data"))) = false;
> > > #else
> > > bool skip = true;
> > > #endif
> > >
> > > int i;
> > >
> > > __noinline static void subprog1(int level)
> > > {
> > >          if (level > 0) {
> > >                  subprog1(level >> 1);
> > >                  i++;
> > >          }
> > > }
> > >
> > > SEC("kprobe")
> > > int prog1(void)
> > > {
> > >          subprog1(1);
> > >          return 0;
> > > }
> > >
> > > In the above prog, we have a recursion of subprog1. The
> > > callchain is:
> > >     prog -> subprog1 -> subprog1
> > >
> > > The insn-level verification is successful since argument
> > > of subprog1() has precise value.
> > >
> > > But eventually, verification failed with the following message:
> > >    the call stack of 8 frames is too deep !
> > >
> > > The error message is
> > >                  if (frame >= MAX_CALL_FRAMES) {
> > >                          verbose(env, "the call stack of %d frames is too deep !\n",
> > >                                  frame);
> > >                          return -E2BIG;
> > >                  }
> > > in function check_max_stack_depth_subprog().
> > > Basically in function check_max_stack_depth_subprog(), tracing subprog
> > > call is done only based on call insn. All conditionals are ignored.
> > > In the above example, check_max_stack_depth_subprog() will have the
> > > call graph like
> > >      prog -> subprog1 -> subprog1 -> subprog1 -> subprog1 -> ...
> > > and eventually hit the error.
> > >
> > > Basically with check_max_stack_depth_subprog() self recursion is not
> > > possible for a bpf prog.
> > >
> > > This limitation is back to year 2017.
> > >    commit 70a87ffea8ac  bpf: fix maximum stack depth tracking logic
> > >
> > > So I assume people really do not write progs with self recursion inside
> > > the main prog (including subprogs).
> >
> > Thanks for checking this part.
> >
> > What about sync and async callbacks? Can they recurse?
> >
> > Since progs are preemptible is the following possible:
> >
> > __noinline static void subprog(void)
> > {
> >   /* delay */
> > }
> >
> > static int timer_cb(void *map, int *key, void *val)
> > {
> >   subprog();
> > }
> >
> > SEC("tc")
> > int prog1(void)
> > {
> >     bpf_timer_set_callback(  &timer_cb);
> >     subprog();
> >     return 0;
> > }
> >
> > timers use softirq.
> > I'm not sure whether it's the same stack or not.
> > So it may be borderline ok-ish for other reasons,
> > but the question remains. Will subprog recurse this way?
> >
>
> Yes, but not in the normal ways.
> There can be only one softirq context per-CPU (even on preemptible RT
> with timers running in kthreads), but timer_cb can also be called
> directly by the prog. So any other context the same prog can execute
> in will allow it to call timer_cb while another invocation is
> potentially preempted out on the same CPU.
> It might be better to disallow direct calling such async callbacks,
> because I'm not sure anyone relies on that behavior, but it is
> something I've previously looked at (for exception_cb, which is
> disallowed to be called directly due to the distinct way prologue is
> set up).

Ah, in your example it's a subprog() called by both. Yeah, I guess we
can't really prevent that from happening.

>
> We'll also need to remember this when/if we introduce hardirq mode for
> BPF timers.
Yonghong Song Oct. 22, 2024, 9:43 p.m. UTC | #9
On 10/22/24 1:41 PM, Alexei Starovoitov wrote:
> On Tue, Oct 22, 2024 at 1:13 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> On 10/21/24 8:43 PM, Alexei Starovoitov wrote:
>>> On Mon, Oct 21, 2024 at 8:21 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>>>            for (int i = 0; i < env->subprog_cnt; i++) {
>>>>>> -               if (!i || si[i].is_async_cb) {
>>>>>> -                       ret = check_max_stack_depth_subprog(env, i);
>>>>>> +               check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
>>>>> why?
>>>>> This looks very suspicious.
>>>> This is to simplify jit. For example,
>>>>       main_prog   <=== main_prog_priv_stack_ptr
>>>>         subprog1  <=== there is a helper which has a callback_fn
>>>>                   <=== for example bpf_for_each_map_elem
>>>>
>>>>           callback_fn
>>>>             subprog2
>>>>
>>>> In callback_fn, we cannot simplify do
>>>>       r9 += stack_size_for_callback_fn
>>>> since r9 may have been clobbered between subprog1 and callback_fn.
>>>> That is why currently I allocate private_stack separately for callback_fn.
>>>>
>>>> Alternatively we could do
>>>>       callback_fn_priv_stack_ptr = main_prog_priv_stack_ptr + off
>>>> where off equals to (stack size tree main_prog+subprog1).
>>>> I can do this approach too with a little more information in prog->aux.
>>>> WDYT?
>>> I see. I think we're overcomplicating the verifier just to
>>> be able to do 'r9 += stack' in the subprog.
>>> The cases of async vs sync and directly vs kfunc/helper
>>> (and soon with inlining of kfuncs) are getting too hard
>>> to reason about.
>>>
>>> I think we need to go back to the earlier approach
>>> where every subprog had its own private stack and was
>>> setting up r9 = my_priv_stack in the prologue.
>>>
>>> I suspect it's possible to construct a convoluted subprog
>>> that calls itself a limited amount of time and the verifier allows that.
>>> I feel it will be easier to detect just that condition
>>> in the verifier and fallback to the normal stack.
>> I tried a simple bpf prog below.
>>
>> $ cat private_stack_subprog_recur.c
>> // SPDX-License-Identifier: GPL-2.0
>>
>> #include <vmlinux.h>
>> #include <bpf/bpf_helpers.h>
>> #include <bpf/bpf_tracing.h>
>> #include "../bpf_testmod/bpf_testmod.h"
>>
>> char _license[] SEC("license") = "GPL";
>>
>> #if defined(__TARGET_ARCH_x86)
>> bool skip __attribute((__section__(".data"))) = false;
>> #else
>> bool skip = true;
>> #endif
>>
>> int i;
>>
>> __noinline static void subprog1(int level)
>> {
>>           if (level > 0) {
>>                   subprog1(level >> 1);
>>                   i++;
>>           }
>> }
>>
>> SEC("kprobe")
>> int prog1(void)
>> {
>>           subprog1(1);
>>           return 0;
>> }
>>
>> In the above prog, we have a recursion of subprog1. The
>> callchain is:
>>      prog -> subprog1 -> subprog1
>>
>> The insn-level verification is successful since argument
>> of subprog1() has precise value.
>>
>> But eventually, verification failed with the following message:
>>     the call stack of 8 frames is too deep !
>>
>> The error message is
>>                   if (frame >= MAX_CALL_FRAMES) {
>>                           verbose(env, "the call stack of %d frames is too deep !\n",
>>                                   frame);
>>                           return -E2BIG;
>>                   }
>> in function check_max_stack_depth_subprog().
>> Basically in function check_max_stack_depth_subprog(), tracing subprog
>> call is done only based on call insn. All conditionals are ignored.
>> In the above example, check_max_stack_depth_subprog() will have the
>> call graph like
>>       prog -> subprog1 -> subprog1 -> subprog1 -> subprog1 -> ...
>> and eventually hit the error.
>>
>> Basically with check_max_stack_depth_subprog() self recursion is not
>> possible for a bpf prog.
>>
>> This limitation is back to year 2017.
>>     commit 70a87ffea8ac  bpf: fix maximum stack depth tracking logic
>>
>> So I assume people really do not write progs with self recursion inside
>> the main prog (including subprogs).
> Thanks for checking this part.
>
> What about sync and async callbacks? Can they recurse?

For sync callbacks, there will be no recursion between subprogs.
This is due to the following function.

/* Walk the subprog table and run the stack-depth check once per traversal
 * root. Returns 0 when every check passes, otherwise the first negative
 * value returned by check_max_stack_depth_subprog().
 */
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
         struct bpf_subprog_info *si = env->subprog_info;
         int ret;
         
         for (int i = 0; i < env->subprog_cnt; i++) {
                 /* Only subprog 0 (the main prog) and subprogs flagged
                  * is_async_cb start a traversal; every other subprog is
                  * skipped at this level.
                  */
                 if (!i || si[i].is_async_cb) {
                         ret = check_max_stack_depth_subprog(env, i);
                         if (ret < 0)
                                 return ret;
                 }
                 continue;
         }
         return 0;
}

The traversal root is only ever the main prog or an async_cb.
So a regular sync callback is treated the same way as
any other directly-called subprog.

>
> Since progs are preemptible is the following possible:
>
> __noinline static void subprog(void)
> {
>    /* delay */
> }
>
> static int timer_cb(void *map, int *key, void *val)
> {
>    subprog();
> }
>
> SEC("tc")
> int prog1(void)
> {
>      bpf_timer_set_callback(  &timer_cb);
>      subprog();
>      return 0;
> }
>
> timers use softirq.
> I'm not sure whether it's the same stack or not.
> So it may be borderline ok-ish for other reasons,
> but the question remains. Will subprog recurse this way?

But for async cb, as you mentioned, it is possible that
prog1->subprog could be called in process context
and the callback timer_cb->subprog could be called in
a nested way on top of prog1->subprog.

To handle such cases, I guess I can refactor the code
to record maximum stack_tree_depth in subprog info and
do the checking after the subprog 0 and all async
progs are processed.

To handle a subprog that may be used in more than one
subtree (the subprog 0 tree or an async tree), I need to
add a 'visited' field to bpf_subprog_info.
I think this should work.
Alexei Starovoitov Oct. 22, 2024, 9:57 p.m. UTC | #10
On Tue, Oct 22, 2024 at 2:43 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> To handle a subprog may be used in more than one
> subtree (subprog 0 tree or async tree), I need to
> add a 'visited' field to bpf_subprog_info.
> I think this should work.

This is getting quite complicated.

But it looks like we have an even bigger problem:

SEC("lsm/...")
int BPF_PROG(...)
{
  volatile char buf[..];
  buf[..] =
}

The approach to have per-prog per-cpu priv stack
doesn't work for the above.
Sleepable and non-sleepable LSM progs are preemptible.
Multiple tasks can be running the same program on the same cpu
preempting each other.
The priv stack of this prog will be corrupted.

Maybe it won't be an issue for sched-ext prog
attached to a cgroup, but it feels fragile for bpf infra
to rely on implementation detail of another subsystem.
We probably need to go back to the drawing board.
Yonghong Song Oct. 22, 2024, 10:41 p.m. UTC | #11
On 10/22/24 2:57 PM, Alexei Starovoitov wrote:
> On Tue, Oct 22, 2024 at 2:43 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> To handle a subprog may be used in more than one
>> subtree (subprog 0 tree or async tree), I need to
>> add a 'visited' field to bpf_subprog_info.
>> I think this should work.
> This is getting quite complicated.
>
> But looks like we have even bigger problem:
>
> SEC("lsm/...")
> int BPF_PROG(...)
> {
>    volatile char buf[..];
>    buf[..] =
> }

If I understand correctly, lsm/... corresponds to BPF_PROG_TYPE_LSM prog type.
The current implementation only supports the following plus struct_ops programs.

+       switch (env->prog->type) {
+       case BPF_PROG_TYPE_KPROBE:
+       case BPF_PROG_TYPE_TRACEPOINT:
+       case BPF_PROG_TYPE_PERF_EVENT:
+       case BPF_PROG_TYPE_RAW_TRACEPOINT:
+               return true;
+       case BPF_PROG_TYPE_TRACING:
+               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+                       return true;
+               fallthrough;
+       default:
+               return false;
+       }

I do agree that lsm programs will have issues if using private stack
since preemption is possible and we don't have a recursion check for
them (which is right in order to provide correct functionality).

>
> The approach to have per-prog per-cpu priv stack
> doesn't work for the above.
> Sleepable and non-sleepable LSM progs are preemptible.
> Multiple tasks can be running the same program on the same cpu
> preempting each other.
> The priv stack of this prog will be corrupted.
>
> Maybe it won't be an issue for sched-ext prog
> attached to a cgroup, but it feels fragile for bpf infra
> to rely on implementation detail of another subsystem.
> We probably need to go back to the drawing board.
Alexei Starovoitov Oct. 22, 2024, 10:59 p.m. UTC | #12
On Tue, Oct 22, 2024 at 3:41 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 10/22/24 2:57 PM, Alexei Starovoitov wrote:
> > On Tue, Oct 22, 2024 at 2:43 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> >> To handle a subprog may be used in more than one
> >> subtree (subprog 0 tree or async tree), I need to
> >> add a 'visited' field to bpf_subprog_info.
> >> I think this should work.
> > This is getting quite complicated.
> >
> > But looks like we have even bigger problem:
> >
> > SEC("lsm/...")
> > int BPF_PROG(...)
> > {
> >    volatile char buf[..];
> >    buf[..] =
> > }
>
> If I understand correctly, lsm/... corresponds to BPF_PROG_TYPE_LSM prog type.
> The current implementation only supports the following plus struct_ops programs.
>
> +       switch (env->prog->type) {
> +       case BPF_PROG_TYPE_KPROBE:
> +       case BPF_PROG_TYPE_TRACEPOINT:
> +       case BPF_PROG_TYPE_PERF_EVENT:
> +       case BPF_PROG_TYPE_RAW_TRACEPOINT:
> +               return true;
> +       case BPF_PROG_TYPE_TRACING:
> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
> +                       return true;
> +               fallthrough;
> +       default:
> +               return false;
> +       }
>
> I do agree that lsm programs will have issues if using private stack
> since preemptible is possible and we don't have recursion check for
> them (which is right in order to provide correct functionality).

/* Map the resolved program type to a boolean. Presumably "true" means the
 * recursion check applies to this prog; the caller is not shown here, so
 * confirm against its use.
 */
static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
{
        switch (resolve_prog_type(prog)) {
        case BPF_PROG_TYPE_TRACING:
                /* true for every tracing attach type except BPF_TRACE_ITER */
                return prog->expected_attach_type != BPF_TRACE_ITER;
        case BPF_PROG_TYPE_STRUCT_OPS:
        case BPF_PROG_TYPE_LSM:
                /* struct_ops and LSM progs always yield false */
                return false;
        default:
                return true;
        }
}

LSM prog is an example. The same issue exists with struct_ops progs.
But struct_ops sched-ext progs are the main motivation for adding
priv stack.

sched-ext will signal to bpf that it needs priv stack and
we would have to add "recursion no more than 1" check
and there is a chance (like above LSM prog demonstrates)
that struct_ops will be hitting this recursion check
and the prog will not be run.
The miss count will increment, of course, but the whole
priv stack feature for struct_ops becomes unreliable.
Hence the patches become questionable.
Why add a feature when the main user will struggle to use it?
Yonghong Song Oct. 22, 2024, 11:53 p.m. UTC | #13
On 10/22/24 3:59 PM, Alexei Starovoitov wrote:
> On Tue, Oct 22, 2024 at 3:41 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>> On 10/22/24 2:57 PM, Alexei Starovoitov wrote:
>>> On Tue, Oct 22, 2024 at 2:43 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>> To handle a subprog may be used in more than one
>>>> subtree (subprog 0 tree or async tree), I need to
>>>> add a 'visited' field to bpf_subprog_info.
>>>> I think this should work.
>>> This is getting quite complicated.
>>>
>>> But looks like we have even bigger problem:
>>>
>>> SEC("lsm/...")
>>> int BPF_PROG(...)
>>> {
>>>     volatile char buf[..];
>>>     buf[..] =
>>> }
>> If I understand correctly, lsm/... corresponds to BPF_PROG_TYPE_LSM prog type.
>> The current implementation only supports the following plus struct_ops programs.
>>
>> +       switch (env->prog->type) {
>> +       case BPF_PROG_TYPE_KPROBE:
>> +       case BPF_PROG_TYPE_TRACEPOINT:
>> +       case BPF_PROG_TYPE_PERF_EVENT:
>> +       case BPF_PROG_TYPE_RAW_TRACEPOINT:
>> +               return true;
>> +       case BPF_PROG_TYPE_TRACING:
>> +               if (env->prog->expected_attach_type != BPF_TRACE_ITER)
>> +                       return true;
>> +               fallthrough;
>> +       default:
>> +               return false;
>> +       }
>>
>> I do agree that lsm programs will have issues if using private stack
>> since preemptible is possible and we don't have recursion check for
>> them (which is right in order to provide correct functionality).
> static inline bool bpf_prog_check_recur(const struct bpf_prog *prog)
> {
>          switch (resolve_prog_type(prog)) {
>          case BPF_PROG_TYPE_TRACING:
>                  return prog->expected_attach_type != BPF_TRACE_ITER;
>          case BPF_PROG_TYPE_STRUCT_OPS:
>          case BPF_PROG_TYPE_LSM:
>                  return false;
>          default:
>                  return true;
>          }
> }
>
> LSM prog is an example. The same issue is with struct_ops progs.
> But struct_ops sched-ext progs is main motivation for adding
> priv stack.
>
> sched-ext will signal to bpf that it needs priv stack and
> we would have to add "recursion no more than 1" check
> and there is a chance (like above LSM prog demonstrates)
> that struct_ops will be hitting this recursion check
> and the prog will not be run.
> The miss count will increment, of course, but the whole
> priv stack feature for struct_ops becomes unreliable.
> Hence the patches become questionable.
> Why add a feature when the main user will struggle to use it.

Indeed, this is a known issue that we are already kind of aware of.
The recursion check (regardless of whether the limit is one or four) may
cause the prog not to run if the actual recursion level is beyond what
the recursion check allows.

I guess we indeed need to go back to the drawing board again,
starting from struct_ops, which is the main motivation of this
idea.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0c216e71cec7..6ad8ace7075a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1490,6 +1490,7 @@  struct bpf_prog_aux {
 	bool exception_cb;
 	bool exception_boundary;
 	bool is_extended; /* true if extended by freplace program */
+	bool priv_stack_eligible;
 	u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
 	struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
 	struct bpf_arena *arena;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 4513372c5bc8..bcfe868e3801 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -659,6 +659,8 @@  struct bpf_subprog_info {
 	 * are used for bpf_fastcall spills and fills.
 	 */
 	s16 fastcall_stack_off;
+	u16 subtree_stack_depth;
+	u16 subtree_top_idx;
 	bool has_tail_call: 1;
 	bool tail_call_reachable: 1;
 	bool has_ld_abs: 1;
@@ -668,6 +670,7 @@  struct bpf_subprog_info {
 	bool args_cached: 1;
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
+	bool priv_stack_eligible: 1;
 
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..3a21947f2fd4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1119,6 +1119,7 @@  bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
 bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
 u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..14d9288441f2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3045,6 +3045,11 @@  bool __weak bpf_jit_supports_exceptions(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_private_stack(void)
+{
+	return false;
+}
+
 void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
 {
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f514247ba8ba..45bea4066272 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -194,6 +194,8 @@  struct bpf_verifier_stack_elem {
 
 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
 
+#define BPF_PRIV_STACK_MIN_SUBTREE_SIZE	128
+
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
 static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
 static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -5982,6 +5984,41 @@  static int check_ptr_alignment(struct bpf_verifier_env *env,
 					   strict);
 }
 
+static bool bpf_enable_private_stack(struct bpf_verifier_env *env)
+{
+	if (!bpf_jit_supports_private_stack())
+		return false;
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return true;
+	case BPF_PROG_TYPE_TRACING:
+		if (env->prog->expected_attach_type != BPF_TRACE_ITER)
+			return true;
+		fallthrough;
+	default:
+		return false;
+	}
+}
+
+static bool is_priv_stack_supported(struct bpf_verifier_env *env)
+{
+	struct bpf_subprog_info *si = env->subprog_info;
+	bool has_tail_call = false;
+
+	for (int i = 0; i < env->subprog_cnt; i++) {
+		if (si[i].has_tail_call) {
+			has_tail_call = true;
+			break;
+		}
+	}
+
+	return !has_tail_call && bpf_enable_private_stack(env);
+}
+
 static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
 {
 	if (env->prog->jit_requested)
@@ -5999,16 +6036,21 @@  static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
  * Since recursion is prevented by check_cfg() this algorithm
  * only needs a local stack of MAX_CALL_FRAMES to remember callsites
  */
-static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
+static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
+					 bool check_priv_stack, bool priv_stack_supported)
 {
 	struct bpf_subprog_info *subprog = env->subprog_info;
 	struct bpf_insn *insn = env->prog->insnsi;
 	int depth = 0, frame = 0, i, subprog_end;
 	bool tail_call_reachable = false;
+	bool priv_stack_eligible = false;
 	int ret_insn[MAX_CALL_FRAMES];
 	int ret_prog[MAX_CALL_FRAMES];
-	int j;
+	int j, subprog_stack_depth;
+	int orig_idx = idx;
 
+	if (check_priv_stack)
+		subprog[idx].subtree_top_idx = idx;
 	i = subprog[idx].start;
 process_func:
 	/* protect against potential stack overflow that might happen when
@@ -6030,18 +6072,33 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 	 * tailcall will unwind the current stack frame but it will not get rid
 	 * of caller's stack as shown on the example above.
 	 */
-	if (idx && subprog[idx].has_tail_call && depth >= 256) {
+	if (!check_priv_stack && idx && subprog[idx].has_tail_call && depth >= 256) {
 		verbose(env,
 			"tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n",
 			depth);
 		return -EACCES;
 	}
-	depth += round_up_stack_depth(env, subprog[idx].stack_depth);
-	if (depth > MAX_BPF_STACK) {
+	subprog_stack_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+	depth += subprog_stack_depth;
+	if (!check_priv_stack && !priv_stack_supported && depth > MAX_BPF_STACK) {
 		verbose(env, "combined stack size of %d calls is %d. Too large\n",
 			frame + 1, depth);
 		return -EACCES;
 	}
+	if (check_priv_stack) {
+		if (subprog_stack_depth > MAX_BPF_STACK) {
+			verbose(env, "stack size of subprog %d is %d. Too large\n",
+				idx, subprog_stack_depth);
+			return -EACCES;
+		}
+
+		if (!priv_stack_eligible && depth >= BPF_PRIV_STACK_MIN_SUBTREE_SIZE) {
+			subprog[orig_idx].priv_stack_eligible = true;
+			env->prog->aux->priv_stack_eligible = priv_stack_eligible = true;
+		}
+		subprog[orig_idx].subtree_stack_depth =
+			max_t(u16, subprog[orig_idx].subtree_stack_depth, depth);
+	}
 continue_func:
 	subprog_end = subprog[idx + 1].start;
 	for (; i < subprog_end; i++) {
@@ -6078,6 +6135,12 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 		next_insn = i + insn[i].imm + 1;
 		sidx = find_subprog(env, next_insn);
 		if (sidx < 0) {
+			/* It is possible that callback func has been removed as dead code after
+			 * instruction rewrites, e.g. bpf_loop with cnt 0.
+			 */
+			if (check_priv_stack)
+				continue;
+
 			WARN_ONCE(1, "verifier bug. No program starts at insn %d\n",
 				  next_insn);
 			return -EFAULT;
@@ -6097,8 +6160,10 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 		}
 		i = next_insn;
 		idx = sidx;
+		if (check_priv_stack)
+			subprog[idx].subtree_top_idx = orig_idx;
 
-		if (subprog[idx].has_tail_call)
+		if (!check_priv_stack && subprog[idx].has_tail_call)
 			tail_call_reachable = true;
 
 		frame++;
@@ -6122,7 +6187,7 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 			}
 			subprog[ret_prog[j]].tail_call_reachable = true;
 		}
-	if (subprog[0].tail_call_reachable)
+	if (!check_priv_stack && subprog[0].tail_call_reachable)
 		env->prog->aux->tail_call_reachable = true;
 
 	/* end of for() loop means the last insn of the 'subprog'
@@ -6137,14 +6202,18 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx)
 	goto continue_func;
 }
 
-static int check_max_stack_depth(struct bpf_verifier_env *env)
+static int check_max_stack_depth(struct bpf_verifier_env *env, bool check_priv_stack,
+				 bool priv_stack_supported)
 {
 	struct bpf_subprog_info *si = env->subprog_info;
+	bool check_subprog;
 	int ret;
 
 	for (int i = 0; i < env->subprog_cnt; i++) {
-		if (!i || si[i].is_async_cb) {
-			ret = check_max_stack_depth_subprog(env, i);
+		check_subprog = !i || (check_priv_stack ? si[i].is_cb : si[i].is_async_cb);
+		if (check_subprog) {
+			ret = check_max_stack_depth_subprog(env, i, check_priv_stack,
+							    priv_stack_supported);
 			if (ret < 0)
 				return ret;
 		}
@@ -22303,7 +22372,7 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	struct bpf_verifier_env *env;
 	int i, len, ret = -EINVAL, err;
 	u32 log_true_size;
-	bool is_priv;
+	bool is_priv, priv_stack_supported = false;
 
 	/* no program is valid */
 	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
@@ -22430,8 +22499,10 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 	if (ret == 0)
 		ret = remove_fastcall_spills_fills(env);
 
-	if (ret == 0)
-		ret = check_max_stack_depth(env);
+	if (ret == 0) {
+		priv_stack_supported = is_priv_stack_supported(env);
+		ret = check_max_stack_depth(env, false, priv_stack_supported);
+	}
 
 	/* instruction rewrites happen after this point */
 	if (ret == 0)
@@ -22465,6 +22536,9 @@  int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
 								     : false;
 	}
 
+	if (ret == 0 && priv_stack_supported)
+		ret = check_max_stack_depth(env, true, true);
+
 	if (ret == 0)
 		ret = fixup_call_args(env);