@@ -1523,6 +1523,7 @@ struct bpf_prog_aux {
bool exception_cb;
bool exception_boundary;
bool is_extended; /* true if extended by freplace program */
+ bool use_priv_stack;
u64 prog_array_member_cnt; /* counts how many times as member of prog_array */
struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */
struct bpf_arena *arena;
@@ -668,6 +668,7 @@ struct bpf_subprog_info {
bool args_cached: 1;
/* true if bpf_fastcall stack region is used by functions that can't be inlined */
bool keep_fastcall_stack: 1;
+ bool use_priv_stack: 1;
u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
@@ -1119,6 +1119,7 @@ bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
+bool bpf_jit_supports_private_stack(void);
u64 bpf_arch_uaddress_limit(void);
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
bool bpf_helper_changes_pkt_data(void *func);
@@ -3045,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
+bool __weak bpf_jit_supports_private_stack(void)
+{
+ return false;
+}
+
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}
@@ -194,6 +194,8 @@ struct bpf_verifier_stack_elem {
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
+#define BPF_PRIV_STACK_MIN_SIZE 64
+
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
@@ -6015,6 +6017,37 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
strict);
}
+enum priv_stack_mode {
+ NO_PRIV_STACK,
+ PRIV_STACK_ADAPTIVE,
+ PRIV_STACK_ALWAYS,
+};
+
+static enum priv_stack_mode bpf_enable_priv_stack(struct bpf_prog *prog)
+{
+ if (!bpf_jit_supports_private_stack())
+ return NO_PRIV_STACK;
+
+ /* bpf_prog_check_recur() checks all prog types that use bpf trampoline
+ * while kprobe/tp/perf_event/raw_tp don't use trampoline hence checked
+ * explicitly.
+ */
+ switch (prog->type) {
+ case BPF_PROG_TYPE_KPROBE:
+ case BPF_PROG_TYPE_TRACEPOINT:
+ case BPF_PROG_TYPE_PERF_EVENT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ return PRIV_STACK_ADAPTIVE;
+ default:
+ break;
+ }
+
+ if (!bpf_prog_check_recur(prog))
+ return NO_PRIV_STACK;
+
+ return PRIV_STACK_ADAPTIVE;
+}
+
static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
{
if (env->prog->jit_requested)
@@ -6033,11 +6067,12 @@ static int round_up_stack_depth(struct bpf_verifier_env *env, int stack_depth)
* only needs a local stack of MAX_CALL_FRAMES to remember callsites
*/
static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
- int *subtree_depth, int *depth_frame)
+ int *subtree_depth, int *depth_frame,
+ int priv_stack_supported)
{
struct bpf_subprog_info *subprog = env->subprog_info;
struct bpf_insn *insn = env->prog->insnsi;
- int depth = 0, frame = 0, i, subprog_end;
+ int depth = 0, frame = 0, i, subprog_end, subprog_depth;
bool tail_call_reachable = false;
int ret_insn[MAX_CALL_FRAMES];
int ret_prog[MAX_CALL_FRAMES];
@@ -6070,11 +6105,23 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
depth);
return -EACCES;
}
- depth += round_up_stack_depth(env, subprog[idx].stack_depth);
+ subprog_depth = round_up_stack_depth(env, subprog[idx].stack_depth);
+ depth += subprog_depth;
if (depth > MAX_BPF_STACK && !*subtree_depth) {
*subtree_depth = depth;
*depth_frame = frame + 1;
}
+ if (priv_stack_supported != NO_PRIV_STACK) {
+ if (!subprog[idx].use_priv_stack) {
+ if (subprog_depth > MAX_BPF_STACK) {
+ verbose(env, "stack size of subprog %d is %d. Too large\n",
+ idx, subprog_depth);
+ return -EACCES;
+ }
+ if (subprog_depth >= BPF_PRIV_STACK_MIN_SIZE)
+ subprog[idx].use_priv_stack = true;
+ }
+ }
continue_func:
subprog_end = subprog[idx + 1].start;
for (; i < subprog_end; i++) {
@@ -6173,20 +6220,36 @@ static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx,
static int check_max_stack_depth(struct bpf_verifier_env *env)
{
struct bpf_subprog_info *si = env->subprog_info;
+ enum priv_stack_mode priv_stack_supported;
int ret, subtree_depth = 0, depth_frame;
+ priv_stack_supported = bpf_enable_priv_stack(env->prog);
+
+ if (priv_stack_supported != NO_PRIV_STACK) {
+ for (int i = 0; i < env->subprog_cnt; i++) {
+ if (!si[i].has_tail_call)
+ continue;
+ priv_stack_supported = NO_PRIV_STACK;
+ break;
+ }
+ }
+
for (int i = 0; i < env->subprog_cnt; i++) {
if (!i || si[i].is_async_cb) {
- ret = check_max_stack_depth_subprog(env, i, &subtree_depth, &depth_frame);
+ ret = check_max_stack_depth_subprog(env, i, &subtree_depth, &depth_frame,
+ priv_stack_supported);
if (ret < 0)
return ret;
}
}
- if (subtree_depth > MAX_BPF_STACK) {
+
+ if (priv_stack_supported == NO_PRIV_STACK && subtree_depth > MAX_BPF_STACK) {
verbose(env, "combined stack size of %d calls is %d. Too large\n",
depth_frame, subtree_depth);
return -EACCES;
}
+ if (si[0].use_priv_stack)
+ env->prog->aux->use_priv_stack = true;
return 0;
}

With private stack support, each subprog can still have a stack of up to 512 bytes. The 512-byte per-subprog limit is kept to avoid increasing verifier complexity: going beyond 512 bytes would require a big verifier change and would increase memory consumption and verification time. If private stack is supported and a subprog reaches a certain stack size threshold, that subprog will have its own private stack allocated.

In this patch, some tracing program types are allowed to use a private stack, since a tracing prog may be triggered in the middle of another prog's run. The supported tracing program types already have a recursion check such that if the same prog runs again on the same cpu, the nested run is skipped. This ensures the bpf prog private stack is not overwritten. Note that if any tail_call is used in the prog (including in any of its subprogs), the private stack is not used.

Function bpf_enable_priv_stack() returns one of NO_PRIV_STACK, PRIV_STACK_ADAPTIVE or PRIV_STACK_ALWAYS. NO_PRIV_STACK means the private stack is not enabled, PRIV_STACK_ADAPTIVE means it is enabled under certain conditions (e.g. the stack size threshold), and PRIV_STACK_ALWAYS means it is always enabled. PRIV_STACK_ALWAYS will be used by later struct_ops progs.

Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
---
 include/linux/bpf.h          |  1 +
 include/linux/bpf_verifier.h |  1 +
 include/linux/filter.h       |  1 +
 kernel/bpf/core.c            |  5 +++
 kernel/bpf/verifier.c        | 73 +++++++++++++++++++++++++++++++++---
 5 files changed, 76 insertions(+), 5 deletions(-)
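
Since bpf_jit_supports_private_stack() is added above as a __weak hook returning false, an arch JIT opts in by overriding it. A minimal sketch of such an override (illustrative only, not taken from any arch patch; a real backend should only return true once it actually emits the private-stack switching code):

#include <linux/filter.h>

/* Sketch only: advertise private stack support from an arch JIT. */
bool bpf_jit_supports_private_stack(void)
{
	return true;
}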
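
For the adaptive threshold, a rough sketch of a tracing program whose subprog would be tagged to use a private stack follows. It is illustrative only: the function and attach-point names are hypothetical, a JIT with private stack support is assumed, and the program is not part of this patch.

/* Illustrative only; all names are hypothetical. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

/* Kept as a separate subprog; its roughly 128-byte frame is above
 * BPF_PRIV_STACK_MIN_SIZE (64) and below MAX_BPF_STACK (512), so under
 * PRIV_STACK_ADAPTIVE the verifier would set use_priv_stack for it.
 */
static __noinline int fill_scratch(void)
{
	volatile char scratch[128];
	int i;

	for (i = 0; i < (int)sizeof(scratch); i++)
		scratch[i] = i;
	return scratch[0];
}

SEC("kprobe/do_nanosleep")
int probe_nanosleep(void *ctx)
{
	return fill_scratch();
}

This patch only marks such subprogs and the containing prog; the flags set here are what later patches in the series can key off when actually allocating and switching to the private stack.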