diff mbox series

[RFC,v9,01/11] bpf: Support getting referenced kptr from struct_ops argument

Message ID 20240714175130.4051012-2-amery.hung@bytedance.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series bpf qdisc | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-18 / veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1052 this patch: 1052
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 8 maintainers not CCed: kpsingh@kernel.org haoluo@google.com john.fastabend@gmail.com jolsa@kernel.org yonghong.song@linux.dev martin.lau@linux.dev song@kernel.org eddyz87@gmail.com
netdev/build_clang success Errors and warnings before: 1128 this patch: 1128
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 7780 this patch: 7780
netdev/checkpatch warning WARNING: From:/Signed-off-by: email address mismatch: 'From: Amery Hung <ameryhung@gmail.com>' != 'Signed-off-by: Amery Hung <amery.hung@bytedance.com>' WARNING: line length of 101 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Amery Hung July 14, 2024, 5:51 p.m. UTC
Allows struct_ops programs to acquire referenced kptrs from arguments
by directly reading the argument.

The verifier will automatically acquire a reference for struct_ops
argument tagged with "__ref" in the stub function. The user will be
able to access the referenced kptr directly by reading the context
as long as it has not been released by the program.

This new mechanism to acquire referenced kptr (compared to the existing
"kfunc with KF_ACQUIRE") is introduced for ergonomic and semantic reasons.

In the first use case, Qdisc_ops, an skb is passed to .enqueue in the
first argument. The qdisc becomes the sole owner of the skb and must
enqueue or drop the skb. Representing skbs in bpf qdisc as referenced
kptrs makes sure 1) qdisc will always enqueue or drop the skb in .enqueue,
and 2) qdisc cannot make up invalid skb pointers in .dequeue. The new
mechanism provides a natural way for users to get a referenced kptr in
struct_ops programs.

More importantly, we would also like to make sure that there is only a
single reference to the same skb in qdisc. Since in the future,
skb->rbnode will be utilized to support adding skb to bpf list and rbtree,
allowing multiple references may lead to racy accesses to this field when
the user adds references of the skb to different bpf graphs. The new
mechanism provides a better way to enforce such unique ptr semantic than
forbidding users to call a KF_ACQUIRE kfunc multiple times.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 include/linux/bpf.h         |  3 +++
 kernel/bpf/bpf_struct_ops.c | 26 ++++++++++++++++++++------
 kernel/bpf/btf.c            |  1 +
 kernel/bpf/verifier.c       | 34 +++++++++++++++++++++++++++++++---
 4 files changed, 55 insertions(+), 9 deletions(-)

Comments

Martin KaFai Lau July 24, 2024, 12:32 a.m. UTC | #1
On 7/14/24 10:51 AM, Amery Hung wrote:
> @@ -21004,6 +21025,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
>   		mark_reg_known_zero(env, regs, BPF_REG_1);
>   	}
>   
> +	if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
> +		ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
> +		for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
> +			if (ctx_arg_info[i].refcounted)
> +				ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);
> +	}
> +

I think this will miss a case when passing the struct_ops prog ctx (i.e. "__u64 
*ctx") to a global subprog. Something like this:

__noinline int subprog_release(__u64 *ctx __arg_ctx)
{
	struct task_struct *task = (struct task_struct *)ctx[1];
	int dummy = (int)ctx[0];

	bpf_task_release(task);

	return dummy + 1;
}

SEC("struct_ops/subprog_ref")
__failure
int test_subprog_ref(__u64 *ctx)
{
	struct task_struct *task = (struct task_struct *)ctx[1];

	bpf_task_release(task);

	return subprog_release(ctx);;
}

SEC(".struct_ops.link")
struct bpf_testmod_ops subprog_ref = {
	.test_refcounted = (void *)test_subprog_ref,
};

A quick thought is, I think tracking the ctx's ref id in the env->cur_state may 
not be the correct place.

[ Just want to bring up what I have noticed so far. I will stop at here for 
today and will continue. ]
Amery Hung July 24, 2024, 5 p.m. UTC | #2
On Tue, Jul 23, 2024 at 5:32 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>
> On 7/14/24 10:51 AM, Amery Hung wrote:
> > @@ -21004,6 +21025,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
> >               mark_reg_known_zero(env, regs, BPF_REG_1);
> >       }
> >
> > +     if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
> > +             ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
> > +             for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
> > +                     if (ctx_arg_info[i].refcounted)
> > +                             ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);
> > +     }
> > +
>
> I think this will miss a case when passing the struct_ops prog ctx (i.e. "__u64
> *ctx") to a global subprog. Something like this:
>
> __noinline int subprog_release(__u64 *ctx __arg_ctx)
> {
>         struct task_struct *task = (struct task_struct *)ctx[1];
>         int dummy = (int)ctx[0];
>
>         bpf_task_release(task);
>
>         return dummy + 1;
> }
>
> SEC("struct_ops/subprog_ref")
> __failure
> int test_subprog_ref(__u64 *ctx)
> {
>         struct task_struct *task = (struct task_struct *)ctx[1];
>
>         bpf_task_release(task);
>
>         return subprog_release(ctx);;
> }
>
> SEC(".struct_ops.link")
> struct bpf_testmod_ops subprog_ref = {
>         .test_refcounted = (void *)test_subprog_ref,
> };
>

Thanks for pointing this out. The test did fail.

> A quick thought is, I think tracking the ctx's ref id in the env->cur_state may
> not be the correct place.

I think it is a bit tricky because subprogs are checked independently
and their state is folded (i.e., there can be multiple edges from the
main program to a subprog).

Maybe the verifier can rewrite the program: set the refcounted ctx to
NULL when releasing reference. Then, in do_check_common(), if it is a
global subprog, we mark refcounted ctx as PTR_MAYBE_NULL to force a
runtime check. How does it sound?

>
> [ Just want to bring up what I have noticed so far. I will stop at here for
> today and will continue. ]
Martin KaFai Lau July 25, 2024, 1:28 a.m. UTC | #3
On 7/24/24 10:00 AM, Amery Hung wrote:
> On Tue, Jul 23, 2024 at 5:32 PM Martin KaFai Lau <martin.lau@linux.dev> wrote:
>>
>> On 7/14/24 10:51 AM, Amery Hung wrote:
>>> @@ -21004,6 +21025,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
>>>                mark_reg_known_zero(env, regs, BPF_REG_1);
>>>        }
>>>
>>> +     if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
>>> +             ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
>>> +             for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
>>> +                     if (ctx_arg_info[i].refcounted)
>>> +                             ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);
>>> +     }
>>> +
>>
>> I think this will miss a case when passing the struct_ops prog ctx (i.e. "__u64
>> *ctx") to a global subprog. Something like this:
>>
>> __noinline int subprog_release(__u64 *ctx __arg_ctx)
>> {
>>          struct task_struct *task = (struct task_struct *)ctx[1];
>>          int dummy = (int)ctx[0];
>>
>>          bpf_task_release(task);
>>
>>          return dummy + 1;
>> }
>>
>> SEC("struct_ops/subprog_ref")
>> __failure
>> int test_subprog_ref(__u64 *ctx)
>> {
>>          struct task_struct *task = (struct task_struct *)ctx[1];
>>
>>          bpf_task_release(task);
>>
>>          return subprog_release(ctx);;
>> }
>>
>> SEC(".struct_ops.link")
>> struct bpf_testmod_ops subprog_ref = {
>>          .test_refcounted = (void *)test_subprog_ref,
>> };
>>
> 
> Thanks for pointing this out. The test did fail.
> 
>> A quick thought is, I think tracking the ctx's ref id in the env->cur_state may
>> not be the correct place.
> 
> I think it is a bit tricky because subprogs are checked independently
> and their state is folded (i.e., there can be multiple edges from the
> main program to a subprog).
> 
> Maybe the verifier can rewrite the program: set the refcounted ctx to
> NULL when releasing reference. Then, in do_check_common(), if it is a
> global subprog, we mark refcounted ctx as PTR_MAYBE_NULL to force a
> runtime check. How does it sound?

I don't know how to get the ctx pointer to patch the code. It is not always in r1.

A case like this should still break even with the PTR_MAYBE_NULL marking in all 
main and subprog (I haven't tried this one myself):

SEC("struct_ops/subprog_ref")
int test_subprog_ref(__u64 *ctx)
{
	struct task_struct *task = (struct task_struct *)ctx[1];

	if (task) {
		subprog_release(ctx);
		bpf_task_release(task);
	}

	return;
}

afaik, the global subprog is checked independently from the main prog and it 
does not know the state of the main prog. Take a look at the subprog_is_global() 
case in the check_func_call().

How about only acquire_reference_state() for the main prog? Yes, the global 
subprog cannot do the bpf_kptr_xchg() and bpf_qdisc_skb_drop() but it can still 
read the skb. The non-global subprog (static) should work though (please test).

I don't have other better idea. May be Alexei can provide some guidance here?
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cc460786da9b..3891e45ded4e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -924,6 +924,7 @@  struct bpf_insn_access_aux {
 		struct {
 			struct btf *btf;
 			u32 btf_id;
+			u32 ref_obj_id;
 		};
 	};
 	struct bpf_verifier_log *log; /* for verbose logs */
@@ -1427,6 +1428,8 @@  struct bpf_ctx_arg_aux {
 	enum bpf_reg_type reg_type;
 	struct btf *btf;
 	u32 btf_id;
+	u32 ref_obj_id;
+	bool refcounted;
 };
 
 struct btf_mod_pair {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 0d515ec57aa5..05f16f21981e 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -145,6 +145,7 @@  void bpf_struct_ops_image_free(void *image)
 }
 
 #define MAYBE_NULL_SUFFIX "__nullable"
+#define REFCOUNTED_SUFFIX "__ref"
 #define MAX_STUB_NAME 128
 
 /* Return the type info of a stub function, if it exists.
@@ -206,9 +207,11 @@  static int prepare_arg_info(struct btf *btf,
 			    struct bpf_struct_ops_arg_info *arg_info)
 {
 	const struct btf_type *stub_func_proto, *pointed_type;
+	bool is_nullable = false, is_refcounted = false;
 	const struct btf_param *stub_args, *args;
 	struct bpf_ctx_arg_aux *info, *info_buf;
 	u32 nargs, arg_no, info_cnt = 0;
+	const char *suffix;
 	u32 arg_btf_id;
 	int offset;
 
@@ -240,12 +243,19 @@  static int prepare_arg_info(struct btf *btf,
 	info = info_buf;
 	for (arg_no = 0; arg_no < nargs; arg_no++) {
 		/* Skip arguments that is not suffixed with
-		 * "__nullable".
+		 * "__nullable or __ref".
 		 */
-		if (!btf_param_match_suffix(btf, &stub_args[arg_no],
-					    MAYBE_NULL_SUFFIX))
+		is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no],
+						     MAYBE_NULL_SUFFIX);
+		is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no],
+						       REFCOUNTED_SUFFIX);
+		if (!is_nullable && !is_refcounted)
 			continue;
 
+		if (is_nullable)
+			suffix = MAYBE_NULL_SUFFIX;
+		else if (is_refcounted)
+			suffix = REFCOUNTED_SUFFIX;
 		/* Should be a pointer to struct */
 		pointed_type = btf_type_resolve_ptr(btf,
 						    args[arg_no].type,
@@ -253,7 +263,7 @@  static int prepare_arg_info(struct btf *btf,
 		if (!pointed_type ||
 		    !btf_type_is_struct(pointed_type)) {
 			pr_warn("stub function %s__%s has %s tagging to an unsupported type\n",
-				st_ops_name, member_name, MAYBE_NULL_SUFFIX);
+				st_ops_name, member_name, suffix);
 			goto err_out;
 		}
 
@@ -271,11 +281,15 @@  static int prepare_arg_info(struct btf *btf,
 		}
 
 		/* Fill the information of the new argument */
-		info->reg_type =
-			PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
 		info->btf_id = arg_btf_id;
 		info->btf = btf;
 		info->offset = offset;
+		if (is_nullable) {
+			info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL;
+		} else if (is_refcounted) {
+			info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID;
+			info->refcounted = true;
+		}
 
 		info++;
 		info_cnt++;
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index de15e8b12fae..52be35b30308 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -6516,6 +6516,7 @@  bool btf_ctx_access(int off, int size, enum bpf_access_type type,
 			info->reg_type = ctx_arg_info->reg_type;
 			info->btf = ctx_arg_info->btf ? : btf_vmlinux;
 			info->btf_id = ctx_arg_info->btf_id;
+			info->ref_obj_id = ctx_arg_info->ref_obj_id;
 			return true;
 		}
 	}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 37053cc4defe..f614ab283c37 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1367,6 +1367,17 @@  static int release_reference_state(struct bpf_func_state *state, int ptr_id)
 	return -EINVAL;
 }
 
+static bool find_reference_state(struct bpf_func_state *state, int ptr_id)
+{
+	int i;
+
+	for (i = 0; i < state->acquired_refs; i++)
+		if (state->refs[i].id == ptr_id)
+			return true;
+
+	return false;
+}
+
 static void free_func_state(struct bpf_func_state *state)
 {
 	if (!state)
@@ -5587,7 +5598,7 @@  static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type,
-			    struct btf **btf, u32 *btf_id)
+			    struct btf **btf, u32 *btf_id, u32 *ref_obj_id)
 {
 	struct bpf_insn_access_aux info = {
 		.reg_type = *reg_type,
@@ -5606,8 +5617,16 @@  static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		*reg_type = info.reg_type;
 
 		if (base_type(*reg_type) == PTR_TO_BTF_ID) {
+			if (info.ref_obj_id &&
+			    !find_reference_state(cur_func(env), info.ref_obj_id)) {
+				verbose(env, "bpf_context off=%d ref_obj_id=%d is no longer valid\n",
+					off, info.ref_obj_id);
+				return -EACCES;
+			}
+
 			*btf = info.btf;
 			*btf_id = info.btf_id;
+			*ref_obj_id = info.ref_obj_id;
 		} else {
 			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		}
@@ -6878,7 +6897,7 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
 		struct btf *btf = NULL;
-		u32 btf_id = 0;
+		u32 btf_id = 0, ref_obj_id = 0;
 
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
@@ -6891,7 +6910,7 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			return err;
 
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
-				       &btf_id);
+				       &btf_id, &ref_obj_id);
 		if (err)
 			verbose_linfo(env, insn_idx, "; ");
 		if (!err && t == BPF_READ && value_regno >= 0) {
@@ -6915,6 +6934,7 @@  static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				if (base_type(reg_type) == PTR_TO_BTF_ID) {
 					regs[value_regno].btf = btf;
 					regs[value_regno].btf_id = btf_id;
+					regs[value_regno].ref_obj_id = ref_obj_id;
 				}
 			}
 			regs[value_regno].type = reg_type;
@@ -20897,6 +20917,7 @@  static int do_check_common(struct bpf_verifier_env *env, int subprog)
 {
 	bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
 	struct bpf_subprog_info *sub = subprog_info(env, subprog);
+	struct bpf_ctx_arg_aux *ctx_arg_info;
 	struct bpf_verifier_state *state;
 	struct bpf_reg_state *regs;
 	int ret, i;
@@ -21004,6 +21025,13 @@  static int do_check_common(struct bpf_verifier_env *env, int subprog)
 		mark_reg_known_zero(env, regs, BPF_REG_1);
 	}
 
+	if (env->prog->type == BPF_PROG_TYPE_STRUCT_OPS) {
+		ctx_arg_info = (struct bpf_ctx_arg_aux *)env->prog->aux->ctx_arg_info;
+		for (i = 0; i < env->prog->aux->ctx_arg_info_size; i++)
+			if (ctx_arg_info[i].refcounted)
+				ctx_arg_info[i].ref_obj_id = acquire_reference_state(env, 0);
+	}
+
 	ret = do_check(env);
 out:
 	/* check for NULL is necessary, since cur_state can be freed inside