[bpf-next,v9,04/10] bpf: Check potential private stack recursion for progs with async callback

Message ID 20241104193515.3243315-1-yonghong.song@linux.dev (mailing list archive)
State Superseded
Delegated to: BPF
Series bpf: Support private stack for bpf progs

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 5 this patch: 5
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org martin.lau@linux.dev eddyz87@gmail.com sdf@fomichev.me john.fastabend@gmail.com song@kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 68 this patch: 68
netdev/checkpatch warning WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0

Commit Message

Yonghong Song Nov. 4, 2024, 7:35 p.m. UTC
In the previous patch, tracing progs were enabled to use the private
stack, since recursion checking ensures that no two instances of the
same bpf prog can run nested on the same cpu.

But nested runs of the same bpf subprog on the same cpu are still
possible if that subprog is called from both the main prog and an
async callback, or from different async callbacks. For example,
  main_prog
   bpf_timer_set_callback(timer, timer_cb);
   call sub1
  sub1
   ...
  timer_cb
   call sub1

In the above case, nested runs of sub1 are possible, with one in
process context and the other in softirq context. If the verifier
detects such a case, it disables the private stack for this bpf prog.
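
A minimal reproducer of this shape (an illustrative sketch: the map,
section, and function names below are made up, not taken from this
series) could look like:

  /* sub1 is reachable from both the main prog and the timer callback,
   * so two instances of sub1 may run nested on the same cpu.
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>
  #include <bpf/bpf_tracing.h>

  struct elem {
          struct bpf_timer t;
  };

  struct {
          __uint(type, BPF_MAP_TYPE_ARRAY);
          __uint(max_entries, 1);
          __type(key, int);
          __type(value, struct elem);
  } timer_map SEC(".maps");

  static __noinline int sub1(void)
  {
          /* large frame so the verifier would consider a private stack */
          volatile char buf[256] = {0};

          return buf[100];
  }

  static int timer_cb(void *map, int *key, struct bpf_timer *timer)
  {
          return sub1();          /* second path to sub1, softirq context */
  }

  SEC("fentry/bpf_fentry_test1")
  int BPF_PROG(main_prog)
  {
          struct elem *e;
          int key = 0;

          e = bpf_map_lookup_elem(&timer_map, &key);
          if (!e)
                  return 0;
          bpf_timer_init(&e->t, &timer_map, 1 /* CLOCK_MONOTONIC */);
          bpf_timer_set_callback(&e->t, timer_cb);
          bpf_timer_start(&e->t, 0, 0);
          return sub1();          /* first path to sub1, process context */
  }

  char _license[] SEC("license") = "GPL";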

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf_verifier.h |  2 ++
 kernel/bpf/verifier.c        | 42 +++++++++++++++++++++++++++++++-----
 2 files changed, 39 insertions(+), 5 deletions(-)

Comments

Alexei Starovoitov Nov. 5, 2024, 2:51 a.m. UTC | #1
On Mon, Nov 4, 2024 at 11:38 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> [...]
>
> diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
> index 0622c11a7e19..e921589abc72 100644
> --- a/include/linux/bpf_verifier.h
> +++ b/include/linux/bpf_verifier.h
> @@ -669,6 +669,8 @@ struct bpf_subprog_info {
>         /* true if bpf_fastcall stack region is used by functions that can't be inlined */
>         bool keep_fastcall_stack: 1;
>         bool use_priv_stack: 1;
> +       bool visited_with_priv_stack_accum: 1;
> +       bool visited_with_priv_stack: 1;
>
>         u8 arg_cnt;
>         struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 406195c433ea..e01b3f0fd314 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6118,8 +6118,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>                                         idx, subprog_depth);
>                                 return -EACCES;
>                         }
> -                       if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
> +                       if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) {
>                                 subprog[idx].use_priv_stack = true;
> +                               subprog[idx].visited_with_priv_stack = true;
> +                       }
> +               } else {
> +                       subprog[idx].visited_with_priv_stack = true;

See suggestion for patch 3.
It's cleaner to rewrite with a single visited_with_priv_stack = true; statement.
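
(One possible shape of that cleanup, a sketch assuming the surrounding
branch from patch 3 rather than the exact posted code:)

  /* mark the subprog visited regardless of priv stack eligibility */
  subprog[idx].visited_with_priv_stack = true;
  if (priv_stack_supported == PRIV_STACK_ADAPTIVE &&
      subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
          subprog[idx].use_priv_stack = true;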

>                 }
>         }
>  continue_func:
> @@ -6220,10 +6224,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>  static int check_max_stack_depth(struct bpf_verifier_env *env)
>  {
>         struct bpf_subprog_info *si = env->subprog_info;
> +       enum priv_stack_mode orig_priv_stack_supported;
>         enum priv_stack_mode priv_stack_supported;
>         int ret, subtree_depth = 0, depth_frame;
>
>         priv_stack_supported = bpf_enable_priv_stack(env->prog);
> +       orig_priv_stack_supported = priv_stack_supported;
>
>         if (priv_stack_supported != NO_PRIV_STACK) {
>                 for (int i = 0; i < env->subprog_cnt; i++) {
> @@ -6240,13 +6246,39 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>                                                             priv_stack_supported);
>                         if (ret < 0)
>                                 return ret;
> +
> +                       if (priv_stack_supported != NO_PRIV_STACK) {
> +                               for (int j = 0; j < env->subprog_cnt; j++) {
> +                                       if (si[j].visited_with_priv_stack_accum &&
> +                                           si[j].visited_with_priv_stack) {
> +                                               /* si[j] is visited by both main/async subprog
> +                                                * and another async subprog.
> +                                                */
> +                                               priv_stack_supported = NO_PRIV_STACK;
> +                                               break;
> +                                       }
> +                                       if (!si[j].visited_with_priv_stack_accum)
> +                                               si[j].visited_with_priv_stack_accum =
> +                                                       si[j].visited_with_priv_stack;
> +                               }
> +                       }
> +                       if (priv_stack_supported != NO_PRIV_STACK) {
> +                               for (int j = 0; j < env->subprog_cnt; j++)
> +                                       si[j].visited_with_priv_stack = false;
> +                       }

I cannot understand what this algorithm is doing.
What is the meaning of visited_with_priv_stack_accum ?

>                 }
>         }
>
> -       if (priv_stack_supported == NO_PRIV_STACK && subtree_depth > MAX_BPF_STACK) {
> -               verbose(env, "combined stack size of %d calls is %d. Too large\n",
> -                       depth_frame, subtree_depth);
> -               return -EACCES;
> +       if (priv_stack_supported == NO_PRIV_STACK) {
> +               if (subtree_depth > MAX_BPF_STACK) {
> +                       verbose(env, "combined stack size of %d calls is %d. Too large\n",
> +                               depth_frame, subtree_depth);
> +                       return -EACCES;
> +               }
> +               if (orig_priv_stack_supported == PRIV_STACK_ADAPTIVE) {
> +                       for (int i = 0; i < env->subprog_cnt; i++)
> +                               si[i].use_priv_stack = false;
> +               }

why? This patch is supposed to clear use_priv_stack from subprogs
that are dual-called, and only from those subprogs.
All other subprogs are fine.

But it seems the algo attempts to detect one such calling scenario
and disables priv_stack everywhere?
Yonghong Song Nov. 5, 2024, 3:37 a.m. UTC | #2
On 11/4/24 6:51 PM, Alexei Starovoitov wrote:
> On Mon, Nov 4, 2024 at 11:38 AM Yonghong Song <yonghong.song@linux.dev> wrote:
>> [...]
>> @@ -6118,8 +6118,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>>                                          idx, subprog_depth);
>>                                  return -EACCES;
>>                          }
>> -                       if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
>> +                       if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) {
>>                                  subprog[idx].use_priv_stack = true;
>> +                               subprog[idx].visited_with_priv_stack = true;
>> +                       }
>> +               } else {
>> +                       subprog[idx].visited_with_priv_stack = true;
> See suggestion for patch 3.
> It's cleaner to rewrite with a single visited_with_priv_stack = true; statement.

Ack.

>
>>                  }
>>          }
>>   continue_func:
>> @@ -6220,10 +6224,12 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
>>   static int check_max_stack_depth(struct bpf_verifier_env *env)
>>   {
>>          struct bpf_subprog_info *si = env->subprog_info;
>> +       enum priv_stack_mode orig_priv_stack_supported;
>>          enum priv_stack_mode priv_stack_supported;
>>          int ret, subtree_depth = 0, depth_frame;
>>
>>          priv_stack_supported = bpf_enable_priv_stack(env->prog);
>> +       orig_priv_stack_supported = priv_stack_supported;
>>
>>          if (priv_stack_supported != NO_PRIV_STACK) {
>>                  for (int i = 0; i < env->subprog_cnt; i++) {
>> @@ -6240,13 +6246,39 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>>                                                              priv_stack_supported);
>>                          if (ret < 0)
>>                                  return ret;
>> +
>> +                       if (priv_stack_supported != NO_PRIV_STACK) {
>> +                               for (int j = 0; j < env->subprog_cnt; j++) {
>> +                                       if (si[j].visited_with_priv_stack_accum &&
>> +                                           si[j].visited_with_priv_stack) {
>> +                                               /* si[j] is visited by both main/async subprog
>> +                                                * and another async subprog.
>> +                                                */
>> +                                               priv_stack_supported = NO_PRIV_STACK;
>> +                                               break;
>> +                                       }
>> +                                       if (!si[j].visited_with_priv_stack_accum)
>> +                                               si[j].visited_with_priv_stack_accum =
>> +                                                       si[j].visited_with_priv_stack;
>> +                               }
>> +                       }
>> +                       if (priv_stack_supported != NO_PRIV_STACK) {
>> +                               for (int j = 0; j < env->subprog_cnt; j++)
>> +                                       si[j].visited_with_priv_stack = false;
>> +                       }
> I cannot understand what this algorithm is doing.
> What is the meaning of visited_with_priv_stack_accum ?

The following is an example to show how the algorithm works.
Let us say we have prog like
    main_prog0  si[0]
      sub1      si[1]
      sub2      si[2]
    async1      si[3]
      sub4      si[4]
      sub2      si[2]
    async2      si[5]
      sub4      si[4]
      sub5      si[6]
      

Seven distinct subprograms in total, si[0] through si[6]; sub2 and sub4
each appear in two trees.

after iteration 1 (main_prog0)
    visited_with_priv_stack_accum: si[i] = false for i = 0 ... 6
    visited_with_priv_stack: si[0] = si[1] = si[2] = true, others false

    for all i, visited_with_priv_stack_accum[i] && visited_with_priv_stack[i]
    is false, so main_prog0 can use the priv stack.

    visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
    visited_with_priv_stack cleared with false.

after iteration 2 (async1)
    visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
    visited_with_priv_stack: si[2] = si[3] = si[4] = true, others false

    Here, si[2] appears in both visited_with_priv_stack_accum and
    visited_with_priv_stack, so async1 cannot have priv stack.

    In my algorithm, I flipped the whole thing to no_priv_stack, which is
    too conservative. We should just skip async1 and continue.

    So let us say async1 does not get a priv stack while main_prog0 keeps it.

    /* the same as end of iteration 1 */
    visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
    visited_with_priv_stack cleared with false.

after iteration 3 (async2)
    visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
    visited_with_priv_stack: si[4] = si[5] = si[6] = true;

    there is no conflict, so async2 can use the private stack.


If we only have one bit in bpf_subprog_info, then after marking subprogs
true while walking an async tree, if we later find a conflict in that tree
and need to make all of its subprogs ineligible for the priv stack, it
will be hard to undo the previous markings.

So visited_with_priv_stack_accum accumulates the "true" results from
the main_prog and async trees.

Maybe we should rename the two bits to
   visited_with_priv_stack
   visited_with_priv_stack_tmp
?
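
(As a side note, the walk above is easy to check mechanically. Below is
a tiny userspace model of the two-bit scheme, with a conflicting tree
skipped instead of failing the whole prog; the tree contents encode the
example above, and nothing here is actual verifier code.)

  #include <stdbool.h>
  #include <stdio.h>

  #define NSUB 7

  int main(void)
  {
          /* subprog indices reached by each call tree, from the example */
          const int trees[3][3] = { {0, 1, 2}, {3, 4, 2}, {5, 4, 6} };
          const char *names[3] = { "main_prog0", "async1", "async2" };
          bool accum[NSUB] = {false}, cur[NSUB];

          for (int t = 0; t < 3; t++) {
                  bool conflict = false;

                  for (int i = 0; i < NSUB; i++)
                          cur[i] = false;
                  for (int j = 0; j < 3; j++)
                          cur[trees[t][j]] = true;
                  /* conflict: a subprog already claimed by an accepted tree */
                  for (int i = 0; i < NSUB; i++)
                          if (accum[i] && cur[i])
                                  conflict = true;
                  /* only accepted trees fold into the accumulated set */
                  if (!conflict)
                          for (int i = 0; i < NSUB; i++)
                                  accum[i] |= cur[i];
                  printf("%s: %s\n", names[t],
                         conflict ? "no priv stack" : "priv stack ok");
          }
          return 0;
  }

(It prints "priv stack ok" for main_prog0 and async2 and "no priv stack"
for async1, matching the three iterations above.)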

>
>>                  }
>>          }
>>
>> -       if (priv_stack_supported == NO_PRIV_STACK && subtree_depth > MAX_BPF_STACK) {
>> -               verbose(env, "combined stack size of %d calls is %d. Too large\n",
>> -                       depth_frame, subtree_depth);
>> -               return -EACCES;
>> +       if (priv_stack_supported == NO_PRIV_STACK) {
>> +               if (subtree_depth > MAX_BPF_STACK) {
>> +                       verbose(env, "combined stack size of %d calls is %d. Too large\n",
>> +                               depth_frame, subtree_depth);
>> +                       return -EACCES;
>> +               }
>> +               if (orig_priv_stack_supported == PRIV_STACK_ADAPTIVE) {
>> +                       for (int i = 0; i < env->subprog_cnt; i++)
>> +                               si[i].use_priv_stack = false;
>> +               }
> why? This patch is supposed to clear use_priv_stack from subprogs
> that are dual-called, and only from those subprogs.
> All other subprogs are fine.
>
> But it seems the algo attempts to detect one such calling scenario
> and disables priv_stack everywhere?

Sorry about this. Will fix in the next revision.
Alexei Starovoitov Nov. 5, 2024, 8:26 p.m. UTC | #3
On Mon, Nov 4, 2024 at 7:37 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
> On 11/4/24 6:51 PM, Alexei Starovoitov wrote:
> > [...]
> > I cannot understand what this algorithm is doing.
> > What is the meaning of visited_with_priv_stack_accum ?
>
> The following is an example to show how the algorithm works.
> Let us say we have prog like
>     main_prog0  si[0]
>       sub1      si[1]
>       sub2      si[2]
>     async1      si[3]
>       sub4      si[4]
>       sub2      si[2]
>     async2      si[5]
>       sub4      si[4]
>       sub5      si[6]
>
>
> Seven distinct subprograms in total, si[0] through si[6]; sub2 and sub4
> each appear in two trees.
>
> after iteration 1 (main_prog0)
>     visited_with_priv_stack_accum: si[i] = false for i = 0 ... 6
>     visited_with_priv_stack: si[0] = si[1] = si[2] = true, others false
>
>     for all i, visited_with_priv_stack_accum[i] && visited_with_priv_stack[i]
>     is false, so main_prog0 can use the priv stack.
>
>     visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
>     visited_with_priv_stack cleared with false.
>
> after iteration 2 (async1)
>     visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
>     visited_with_priv_stack: si[2] = si[3] = si[4] = true, others false
>
>     Here, si[2] appears in both visited_with_priv_stack_accum and
>     visited_with_priv_stack, so async1 cannot have priv stack.
>
>     In my algorithm, I flipped the whole thing to no_priv_stack, which is
>     too conservative. We should just skip async1 and continue.
>
>     So let us say async1 does not get a priv stack while main_prog0 keeps it.
>
>     /* the same as end of iteration 1 */
>     visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
>     visited_with_priv_stack cleared with false.
>
> after iteration 3 (async2)
>     visited_with_priv_stack_accum: si[0] = si[1] = si[2] = true; others false
>     visited_with_priv_stack: si[4] = si[5] = si[6] = true;
>
>      there is no conflict, so async2 can use the private stack.
>
>
> If we only have one bit in bpf_subprog_info, then after marking subprogs
> true while walking an async tree, if we later find a conflict in that tree
> and need to make all of its subprogs ineligible for the priv stack, it
> will be hard to undo the previous markings.
>
> So visited_with_priv_stack_accum accumulates the "true" results from
> the main_prog and async trees.

I see. I think it works, but feels complicated.
It feels it should be possible to do without extra flags. Like
check_max_stack_depth_subprog() will know whether it was called
to verify async_cb or not.
So it's just a matter of adding single 'if' to it:
if (subprog[idx].use_priv_stack && checking_async_cb)
   /* reset to false due to potential recursion */
   subprog[idx].use_priv_stack = false;

check_max_stack_depth() starts with i==0,
so reachable and eligible subprogs will be marked with use_priv_stack.
Then check_max_stack_depth_subprog() will be called again
to verify async. If it sees the mark it's a bad case.
what am I missing?
Yonghong Song Nov. 5, 2024, 9:26 p.m. UTC | #4
On 11/5/24 12:26 PM, Alexei Starovoitov wrote:
> [...]
> I see. I think it works, but feels complicated.
> It feels it should be possible to do without extra flags. Like
> check_max_stack_depth_subprog() will know whether it was called
> to verify async_cb or not.
> So it's just a matter of adding single 'if' to it:
> if (subprog[idx].use_priv_stack && checking_async_cb)
>     /* reset to false due to potential recursion */
>     subprog[idx].use_priv_stack = false;
>
> check_max_stack_depth() starts with i==0,
> so reachable and eligible subprogs will be marked with use_priv_stack.
> Then check_max_stack_depth_subprog() will be called again
> to verify async. If it sees the mark it's a bad case.
> what am I missing?

First I think we still want to mark some subprogs in async tree
to use private stack, right? If this is the case, then let us see
the following example:

main_prog:
    sub1: use_priv_stack = true
    sub2: use_priv_stack = true

async: /* calling sub1 twice */
    sub1
      <=== we do
             if (subprog[idx].use_priv_stack && checking_async_cb)
                 subprog[idx].use_priv_stack = false;
    sub1
      <=== here we have subprog[idx].use_priv_stack = false;
           we could mark use_priv_stack = true again here
           since logic didn't keep track of sub1 has been
           visited before.

To solve the above issue, we need one visited bit in bpf_subprog_info.
After finishing async tree, if for any subprog,
   visited_bit && subprog[idx].use_priv_stack
is true, we can mark subprog[idx].use_priv_stack = false

So one visited bit is enough.

More complicated case is two asyncs. For example:

main_prog:
   sub1
   sub2

async1:
   sub3

async2:
   sub3

If async1/sub3 and async2/sub3 can be nested, then we will
need two visited bits as I have above.
If async1/sub3 and async2/sub3 cannot be nested, then one
visited bit should be enough, since we can traverse
async1/async2 with 'visited' marking and then compare against
main prog.

So the question would be:
   1. Is it possible that two async call backs may nest with
      each other? I actually do not know the answer.
   2. Do we want to allow subprogs in async tree to use private
      stacks?
Alexei Starovoitov Nov. 5, 2024, 9:52 p.m. UTC | #5
On Tue, Nov 5, 2024 at 1:26 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
> > I see. I think it works, but feels complicated.
> > It feels it should be possible to do without extra flags. Like
> > check_max_stack_depth_subprog() will know whether it was called
> > to verify async_cb or not.
> > So it's just a matter of adding single 'if' to it:
> > if (subprog[idx].use_priv_stack && checking_async_cb)
> >     /* reset to false due to potential recursion */
> >     subprog[idx].use_priv_stack = false;
> >
> > check_max_stack_depth() starts with i==0,
> > so reachable and eligible subprogs will be marked with use_priv_stack.
> > Then check_max_stack_depth_subprog() will be called again
> > to verify async. If it sees the mark it's a bad case.
> > what am I missing?
>
> First I think we still want to mark some subprogs in async tree
> to use private stack, right? If this is the case, then let us see
> the following example:
>
> main_prog:
>     sub1: use_priv_stack = true
>     sub2: use_priv_stack = true
>
> async: /* calling sub1 twice */
>     sub1
>       <=== we do
>              if (subprog[idx].use_priv_stack && checking_async_cb)
>                  subprog[idx].use_priv_stack = false;
>     sub1
>       <=== here we have subprog[idx].use_priv_stack = false;
>            we could mark use_priv_stack = true again here
>            since logic didn't keep track of sub1 has been
>            visited before.

This case needs a sticky state to solve.
Instead of bool use_priv_stack it can be tri-state:
no_priv_stack
priv_stack_unknown <- start state
priv_stack_maybe

main_prog pass will set it to priv_stack_maybe
while async pass will clear it to no_priv_stack
and it cannot be bumped up.
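
(One reading of that protocol as a transition function; this is a sketch
with assumed names, and the follow-ups below poke holes in it:)

  enum priv_stack_mode {
          PRIV_STACK_UNKNOWN,     /* start state */
          NO_PRIV_STACK,          /* sticky: never bumped back up */
          PRIV_STACK_MAYBE,
  };

  /* applied to every subprog a call-tree walk reaches */
  static void visit(enum priv_stack_mode *mode)
  {
          switch (*mode) {
          case PRIV_STACK_UNKNOWN:
                  *mode = PRIV_STACK_MAYBE;       /* first visit */
                  break;
          case PRIV_STACK_MAYBE:
                  *mode = NO_PRIV_STACK;          /* revisit: may recurse */
                  break;
          case NO_PRIV_STACK:
                  break;                          /* sticky */
          }
  }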

> To solve the above issue, we need one visited bit in bpf_subprog_info.
> After finishing async tree, if for any subprog,
>    visited_bit && subprog[idx].use_priv_stack
> is true, we can mark subprog[idx].use_priv_stack = false
>
> So one visited bit is enough.
>
> More complicated case is two asyncs. For example:
>
> main_prog:
>    sub1
>    sub2
>
> async1:
>    sub3
>
> async2:
>    sub3
>
> If async1/sub3 and async2/sub3 can be nested, then we will
> need two visited bits as I have above.
> If async1/sub3 and async2/sub3 cannot be nested, then one
> visited bit should be enough, since we can traverse
> async1/async2 with 'visited' marking and then compare against
> main prog.
>
> So the question would be:
>    1. Is it possible that two async call backs may nest with
>       each other? I actually do not know the answer.

I think we have to assume that they can. Doing otherwise
would subject us to implementation details.
I think the above tri-state approach works for the two-callbacks case too:
async1 will bump sub3 to priv_stack_maybe
while async2 will clear it to sticky no_priv_stack.

Ideally we reuse the same enum for this algorithm and for earlier
patches.

>    2. Do we want to allow subprogs in async tree to use private
>       stacks?

Yes. When sched-ext requests a priv stack, it would want it everywhere.
I think request_priv_stack should be treated as PRIV_STACK_ADAPTIVE,
meaning that subprogs with stack_depth < 64 don't need to use it.
In other words, a struct_ops prog with request_priv_stack == true
tells the verifier: add a run-time recursion check at main prog entry,
otherwise treat it like fentry and pick priv stack vs normal stack,
whichever is best for performance.

Then for both fentry and struct_ops w/ request_priv_stack,
the async callbacks will be considered for the priv stack too and
will be cleared to the normal stack when potential recursion via async
is detected.
I don't think it's an error for either prog type.
Overall we shouldn't treat struct_ops as too special.
fentry progs with large stacks are automatically candidates for the priv
stack; struct_ops progs w/ request_priv_stack are in the same category.
Yonghong Song Nov. 6, 2024, 12:19 a.m. UTC | #6
On 11/5/24 1:52 PM, Alexei Starovoitov wrote:
> [...]
> This case needs a sticky state to solve.
> Instead of bool use_priv_stack it can be tri-state:
> no_priv_stack
> priv_stack_unknown <- start state
> priv_stack_maybe
>
> main_prog pass will set it to priv_stack_maybe
> while async pass will clear it to no_priv_stack
> and it cannot be bumped up.

The tri-state may not work. For example,

main_prog:
    call sub1
    call sub2
    call sub1
    call sub3

async:
    call sub4 ==> UNKNOWN -> MAYBE
    call sub3
    call sub4 ==> MAYBE -> NO_PRIV_STACK?

For sub4 in async, which is called twice, at the second sub4 call
it is not clear whether the UNKNOWN->MAYBE transition happened in
main_prog or in async. So based on the transition protocol, the
second sub4 call will transition to NO_PRIV_STACK, which is not
what we want.

So I think we still need a separate bit in bpf_subprog_info to
accumulate information for main_prog tree or any async tree.

>
>> To solve the above issue, we need one visited bit in bpf_subprog_info.
>> After finishing async tree, if for any subprog,
>>     visited_bit && subprog[idx].use_priv_stack
>> is true, we can mark subprog[idx].use_priv_stack = false
>>
>> So one visited bit is enough.
>>
>> More complicated case is two asyncs. For example:
>>
>> main_prog:
>>     sub1
>>     sub2
>>
>> async1:
>>     sub3
>>
>> async2:
>>     sub3
>>
>> If async1/sub3 and async2/sub3 can be nested, then we will
>> need two visited bits as I have above.
>> If async1/sub3 and async2/sub3 cannot be nested, then one
>> visited bit should be enough, since we can traverse
>> async1/async2 with 'visited' marking and then compare against
>> main prog.
>>
>> So the question would be:
>>     1. Is it possible that two async call backs may nest with
>>        each other? I actually do not know the answer.
> I think we have to assume that they can. Doing otherwise
> would subject us to implementation details.
> I think the above tri-state approach works for the two-callbacks case too:
> async1 will bump sub3 to priv_stack_maybe
> while async2 will clear it to sticky no_priv_stack.
>
> Ideally we reuse the same enum for this algorithm and for earlier
> patches.
>
>>     2. Do we want to allow subprogs in async tree to use private
>>        stacks?
> Yes. When sched-ext requests a priv stack, it would want it everywhere.
> I think request_priv_stack should be treated as PRIV_STACK_ADAPTIVE,
> meaning that subprogs with stack_depth < 64 don't need to use it.
> In other words, a struct_ops prog with request_priv_stack == true
> tells the verifier: add a run-time recursion check at main prog entry,
> otherwise treat it like fentry and pick priv stack vs normal stack,
> whichever is best for performance.
>
> Then for both fentry and struct_ops w/ request_priv_stack,
> the async callbacks will be considered for the priv stack too and
> will be cleared to the normal stack when potential recursion via async
> is detected.
> I don't think it's an error for either prog type.
> Overall we shouldn't treat struct_ops as too special.
> fentry progs with large stacks are automatically candidates for the priv
> stack; struct_ops progs w/ request_priv_stack are in the same category.

Okay, indeed we can treat struct_ops the same as other tracing programs.
They are adaptive too, subject to various conditions where the priv stack
may not be used, e.g. small stack size, tailcall, and potentially nested
subprogs.
Alexei Starovoitov Nov. 6, 2024, 1:07 a.m. UTC | #7
On Tue, Nov 5, 2024 at 4:19 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>
>
>
>
> On 11/5/24 1:52 PM, Alexei Starovoitov wrote:
> > [...]
> > This case needs a sticky state to solve.
> > Instead of bool use_priv_stack it can be tri-state:
> > no_priv_stack
> > priv_stack_unknown <- start state
> > priv_stack_maybe
> >
> > main_prog pass will set it to priv_stack_maybe
> > while async pass will clear it to no_priv_stack
> > and it cannot be bumped up.
>
> The tri-state may not work. For example,
>
> main_prog:
>     call sub1
>     call sub2
>     call sub1

sub1 cannot be called nested like this.
I think we discussed it already.

>     call sub3
>
> async:
>     call sub4 ==> UNKNOWN -> MAYBE
>     call sub3
>     call sub4 ==> MAYBE -> NO_PRIV_STACK?
>
> For sub4 in async, which is called twice, at the second sub4 call
> it is not clear whether the UNKNOWN->MAYBE transition happened in
> main_prog or in async. So based on the transition protocol, the
> second sub4 call will transition to NO_PRIV_STACK, which is not
> what we want.

I see. Good point.

> So I think we still need a separate bit in bpf_subprog_info to
> accumulate information for main_prog tree or any async tree.

This is getting quite convoluted. To support priv stack
in multiple async cb-s we may need to remember async cb id or something.
I say let's force all subprogs in async cb to use normal stack.
Yonghong Song Nov. 6, 2024, 2:33 a.m. UTC | #8
On 11/5/24 5:07 PM, Alexei Starovoitov wrote:
> On Tue, Nov 5, 2024 at 4:19 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> [...]
>>> This case needs a sticky state to solve.
>>> Instead of bool use_priv_stack it can be tri-state:
>>> no_priv_stack
>>> priv_stack_unknown <- start state
>>> priv_stack_maybe
>>>
>>> main_prog pass will set it to priv_stack_maybe
>>> while async pass will clear it to no_priv_stack
>>> and it cannot be bumped up.
>> The tri-state may not work. For example,
>>
>> main_prog:
>>      call sub1
>>      call sub2
>>      call sub1
> sub1 cannot be called nested like this.
> I think we discussed it already.
>
>>      call sub3
>>
>> async:
>>      call sub4 ==> UNKNOWN -> MAYBE
>>      call sub3
>>      call sub4 ==> MAYBE -> NO_PRIV_STACK?
>>
>> For sub4 in async, which is called twice, at the second sub4 call
>> it is not clear whether the UNKNOWN->MAYBE transition happened in
>> main_prog or in async. So based on the transition protocol, the
>> second sub4 call will transition to NO_PRIV_STACK, which is not
>> what we want.
> I see. Good point.
>
>> So I think we still need a separate bit in bpf_subprog_info to
>> accumulate information for main_prog tree or any async tree.
> This is getting quite convoluted. To support priv stack
> in multiple async cb-s we may need to remember async cb id or something.
> I say let's force all subprogs in async cb to use normal stack.

Okay. Let's do this. We only have a few helpers/kfuncs with an async cb:
    bpf_timer_set_callback
    bpf_wq_set_callback
    exception handling
Yonghong Song Nov. 6, 2024, 6:55 a.m. UTC | #9
On 11/5/24 5:07 PM, Alexei Starovoitov wrote:
> On Tue, Nov 5, 2024 at 4:19 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>> [...]
>>> This case needs a sticky state to solve.
>>> Instead of bool use_priv_stack it can be tri-state:
>>> no_priv_stack
>>> priv_stack_unknown <- start state
>>> priv_stack_maybe
>>>
>>> main_prog pass will set it to priv_stack_maybe
>>> while async pass will clear it to no_priv_stack
>>> and it cannot be bumped up.
>> The tri-state may not work. For example,
>>
>> main_prog:
>>      call sub1
>>      call sub2
>>      call sub1
> sub1 cannot be called nested like this.
> I think we discussed it already.
>
>>      call sub3
>>
>> async:
>>      call sub4 ==> UNKNOWN -> MAYBE
>>      call sub3
>>      call sub4 ==> MAYBE -> NO_PRIV_STACK?
>>
>> For sub4 in async, which is called twice, at the second sub4 call
>> it is not clear whether the UNKNOWN->MAYBE transition happened in
>> main_prog or in async. So based on the transition protocol, the
>> second sub4 call will transition to NO_PRIV_STACK, which is not
>> what we want.
> I see. Good point.
>
>> So I think we still need a separate bit in bpf_subprog_info to
>> accumulate information for main_prog tree or any async tree.
> This is getting quite convoluted. To support priv stack
> in multiple async cb-s we may need to remember async cb id or something.
> I say let's force all subprogs in async cb to use normal stack.

I did a quick prototype. Among others, it uses a tri-state (UNKNOWN, NO,
ADAPTIVE) and traverses subprogs in reverse, like below:

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cb82254484ff..1084432dbe83 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6192,7 +6192,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
 	struct bpf_subprog_info *si = env->subprog_info;
 	int ret;
 
-	for (int i = 0; i < env->subprog_cnt; i++) {
+	for (int i = env->subprog_cnt - 1; i >= 0; i--) {
 		if (i && !si[i].is_async_cb)
 			continue;

This works correctly. Basically, all subprogs touched by an async_cb are
marked as 'NO'. Finally, for the main prog tree, UNKNOWN subprogs are
converted to ADAPTIVE if >= the stack size threshold. Stack size checking
can also be done properly for the async_cb trees and the main prog tree.
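
(A condensed model of that prototype's marking rule; the three states
come from this thread, but visit_subprog() and its exact conditions are
assumptions, not the posted code:)

  enum priv_stack_mode {
          PRIV_STACK_UNKNOWN,     /* start state */
          NO_PRIV_STACK,          /* sticky */
          PRIV_STACK_ADAPTIVE,
  };

  /* Visit one subprog while walking a call tree. Async trees are walked
   * first (hence the reverse subprog loop above), pinning everything
   * they reach to NO; the main prog tree runs last and can only promote
   * subprogs that no async tree touched.
   */
  static void visit_subprog(enum priv_stack_mode *mode, bool in_async_tree,
                            int stack_depth)
  {
          if (in_async_tree)
                  *mode = NO_PRIV_STACK;
          else if (*mode == PRIV_STACK_UNKNOWN &&
                   stack_depth >= BPF_PRIV_STACK_MIN_SIZE)
                  *mode = PRIV_STACK_ADAPTIVE;
  }
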
Alexei Starovoitov Nov. 6, 2024, 3:26 p.m. UTC | #10
On Tue, Nov 5, 2024 at 10:55 PM Yonghong Song <yonghong.song@linux.dev> wrote:
> [...]
>
> I did a quick prototype. Among other changes, a tri-state (UNKNOWN, NO,
> ADAPTIVE) and reverse traversal of subprogs like below
>
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index cb82254484ff..1084432dbe83 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -6192,7 +6192,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>  	struct bpf_subprog_info *si = env->subprog_info;
>  	int ret;
>  
> -	for (int i = 0; i < env->subprog_cnt; i++) {
> +	for (int i = env->subprog_cnt - 1; i >= 0; i--) {
>  		if (i && !si[i].is_async_cb)
>  			continue;
>
> work correctly. Basically, all async_cb-touched subprogs are marked as 'NO'.
> Finally, for the main prog tree, an UNKNOWN subprog will convert to ADAPTIVE
> if >= the stack size threshold. Stack size checking can also be done properly
> for the async_cb trees and the main prog tree.

Your emailer still spits out garbage :(
but I think I got the idea. Will wait for respin.
Yonghong Song Nov. 6, 2024, 3:44 p.m. UTC | #11
On 11/6/24 7:26 AM, Alexei Starovoitov wrote:
> On Tue, Nov 5, 2024 at 10:55 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>
>>
>>
>> On 11/5/24 5:07 PM, Alexei Starovoitov wrote:
>>> On Tue, Nov 5, 2024 at 4:19 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>
>>>>
>>>> On 11/5/24 1:52 PM, Alexei Starovoitov wrote:
>>>>> On Tue, Nov 5, 2024 at 1:26 PM Yonghong Song <yonghong.song@linux.dev> wrote:
>>>>>>> I see. I think it works, but feels complicated.
>>>>>>> It feels like it should be possible to do without extra flags. Like
>>>>>>> check_max_stack_depth_subprog() will know whether it was called
>>>>>>> to verify async_cb or not.
>>>>>>> So it's just a matter of adding a single 'if' to it:
>>>>>>> if (subprog[idx].use_priv_stack && checking_async_cb)
>>>>>>>        /* reset to false due to potential recursion */
>>>>>>>        subprog[idx].use_priv_stack = false;
>>>>>>>
>>>>>>> check_max_stack_depth() starts with i==0,
>>>>>>> so reachable and eligible subprogs will be marked with use_priv_stack.
>>>>>>> Then check_max_stack_depth_subprog() will be called again
>>>>>>> to verify async. If it sees the mark, it's a bad case.
>>>>>>> What am I missing?
>>>>>> First I think we still want to mark some subprogs in async tree
>>>>>> to use private stack, right? If this is the case, then let us see
>>>>>> the following example:
>>>>>>
>>>>>> main_prog:
>>>>>>        sub1: use_priv_stack = true
>>>>>>        sub2" use_priv_stack = true
>>>>>>
>>>>>> async: /* calling sub1 twice */
>>>>>>        sub1
>>>>>>          <=== we do
>>>>>>                 if (subprog[idx].use_priv_stack && checking_async_cb)
>>>>>>                     subprog[idx].use_priv_stack = false;
>>>>>>        sub1
>>>>>>          <=== here we have subprog[idx].use_priv_stack = false;
>>>>>>               we could mark use_priv_stack = true again here
>>>>>>               since the logic didn't keep track of whether sub1
>>>>>>               had been visited before.
>>>>> This case needs a sticky state to solve.
>>>>> Instead of bool use_priv_stack it can be tri-state:
>>>>> no_priv_stack
>>>>> priv_stack_unknown <- start state
>>>>> priv_stack_maybe
>>>>>
>>>>> main_prog pass will set it to priv_stack_maybe
>>>>> while async pass will clear it to no_priv_stack
>>>>> and it cannot be bumped up.
>>>> The tri-state may not work. For example,
>>>>
>>>> main_prog:
>>>>       call sub1
>>>>       call sub2
>>>>       call sub1
>>> sub1 cannot be called nested like this.
>>> I think we discussed it already.
>>>
>>>>       call sub3
>>>>
>>>> async:
>>>>       call sub4 ==> UNKNOWN -> MAYBE
>>>>       call sub3
>>>>       call sub4 ==> MAYBE -> NO_PRIV_STACK?
>>>>
>>>> For sub4 in async, which is called twice: for the second sub4 call,
>>>> it is not clear whether the UNKNOWN->MAYBE transition happened in
>>>> main_prog or async. So, based on the transition protocol, the
>>>> second sub4 call will transition to NO_PRIV_STACK, which is not
>>>> what we want.
>>> I see. Good point.
>>>
>>>> So I think we still need a separate bit in bpf_subprog_info to
>>>> accumulate information for main_prog tree or any async tree.
>>> This is getting quite convoluted. To support priv stack
>>> in multiple async cb-s we may need to remember async cb id or something.
>>> I say let's force all subprogs in async cb to use normal stack.
>> I did a quick prototype. Among other changes, a tri-state (UNKNOWN, NO,
>> ADAPTIVE) and reverse traversal of subprogs like below
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index cb82254484ff..1084432dbe83 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -6192,7 +6192,7 @@ static int check_max_stack_depth(struct bpf_verifier_env *env)
>>  	struct bpf_subprog_info *si = env->subprog_info;
>>  	int ret;
>>  
>> -	for (int i = 0; i < env->subprog_cnt; i++) {
>> +	for (int i = env->subprog_cnt - 1; i >= 0; i--) {
>>  		if (i && !si[i].is_async_cb)
>>  			continue;
>>
>> work correctly. Basically, all async_cb-touched subprogs are marked as 'NO'.
>> Finally, for the main prog tree, an UNKNOWN subprog will convert to ADAPTIVE
>> if >= the stack size threshold. Stack size checking can also be done properly
>> for the async_cb trees and the main prog tree.
> Your emailer still spits out garbage :(

Somehow, recently my Thunderbird mailer may change the text/width setting
depending on the original format, unless I explicitly set the format again.
Sometimes it will use 'Variable width', which glues everything into one
paragraph. I am trying to find a solution now to fix the mailer.

> but I think I got the idea. Will wait for respin.

Will do.

Patch

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 0622c11a7e19..e921589abc72 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -669,6 +669,8 @@  struct bpf_subprog_info {
 	/* true if bpf_fastcall stack region is used by functions that can't be inlined */
 	bool keep_fastcall_stack: 1;
 	bool use_priv_stack: 1;
+	bool visited_with_priv_stack_accum: 1;
+	bool visited_with_priv_stack: 1;
 
 	u8 arg_cnt;
 	struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 406195c433ea..e01b3f0fd314 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6118,8 +6118,12 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
 					idx, subprog_depth);
 				return -EACCES;
 			}
-			if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+			if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE) {
 				subprog[idx].use_priv_stack = true;
+				subprog[idx].visited_with_priv_stack = true;
+			}
+		} else {
+			subprog[idx].visited_with_priv_stack = true;
 		}
 	}
 continue_func:
@@ -6220,10 +6224,12 @@  static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
 static int check_max_stack_depth(struct bpf_verifier_env *env)
 {
 	struct bpf_subprog_info *si = env->subprog_info;
+	enum priv_stack_mode orig_priv_stack_supported;
 	enum priv_stack_mode priv_stack_supported;
 	int ret, subtree_depth = 0, depth_frame;
 
 	priv_stack_supported = bpf_enable_priv_stack(env->prog);
+	orig_priv_stack_supported = priv_stack_supported;
 
 	if (priv_stack_supported != NO_PRIV_STACK) {
 		for (int i = 0; i < env->subprog_cnt; i++) {
@@ -6240,13 +6246,39 @@  static int check_max_stack_depth(struct bpf_verifier_env *env)
 							    priv_stack_supported);
 			if (ret < 0)
 				return ret;
+
+			if (priv_stack_supported != NO_PRIV_STACK) {
+				for (int j = 0; j < env->subprog_cnt; j++) {
+					if (si[j].visited_with_priv_stack_accum &&
+					    si[j].visited_with_priv_stack) {
+						/* si[j] is visited by both main/async subprog
+						 * and another async subprog.
+						 */
+						priv_stack_supported = NO_PRIV_STACK;
+						break;
+					}
+					if (!si[j].visited_with_priv_stack_accum)
+						si[j].visited_with_priv_stack_accum =
+							si[j].visited_with_priv_stack;
+				}
+			}
+			if (priv_stack_supported != NO_PRIV_STACK) {
+				for (int j = 0; j < env->subprog_cnt; j++)
+					si[j].visited_with_priv_stack = false;
+			}
 		}
 	}
 
-	if (priv_stack_supported == NO_PRIV_STACK && subtree_depth > MAX_BPF_STACK) {
-		verbose(env, "combined stack size of %d calls is %d. Too large\n",
-			depth_frame, subtree_depth);
-		return -EACCES;
+	if (priv_stack_supported == NO_PRIV_STACK) {
+		if (subtree_depth > MAX_BPF_STACK) {
+			verbose(env, "combined stack size of %d calls is %d. Too large\n",
+				depth_frame, subtree_depth);
+			return -EACCES;
+		}
+		if (orig_priv_stack_supported == PRIV_STACK_ADAPTIVE) {
+			for (int i = 0; i < env->subprog_cnt; i++)
+				si[i].use_priv_stack = false;
+		}
 	}
 	if (si[0].use_priv_stack)
 		env->prog->aux->use_priv_stack = true;