@@ -871,6 +871,8 @@ struct sk_buff {
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
struct list_head list;
struct llist_node ll_node;
+ struct bpf_list_excl_node bpf_list;
+ struct bpf_rb_excl_node bpf_rbnode;
};
struct sock *sk;
@@ -5495,6 +5495,7 @@ static const char *alloc_obj_fields[] = {
/* kernel structures with special BTF fields*/
static const char *kstructs_with_special_btf[] = {
"unused",
+ "sk_buff",
};
static struct btf_struct_metas *
@@ -1988,6 +1988,9 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
bool tail, struct btf_record *rec, u64 off)
{
struct list_head *n = &node->list_head, *h = (void *)head;
+ bool exclusive;
+
+ exclusive = btf_record_has_field(rec, BPF_LIST_EXCL_NODE);
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
@@ -1998,14 +2001,15 @@ static int __bpf_list_add(struct bpf_list_node_kern *node,
/* node->owner != NULL implies !list_empty(n), no need to separately
* check the latter
*/
- if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ if (!exclusive && cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
__bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
}
tail ? list_add_tail(n, h) : list_add(n, h);
- WRITE_ONCE(node->owner, head);
+ if (!exclusive)
+ WRITE_ONCE(node->owner, head);
return 0;
}
@@ -2030,10 +2034,14 @@ __bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
return __bpf_list_add(n, head, true, meta ? meta->record : NULL, off);
}
-static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head,
+ struct btf_record *rec, bool tail)
{
struct list_head *n, *h = (void *)head;
struct bpf_list_node_kern *node;
+ bool exclusive;
+
+ exclusive = btf_record_has_field(rec, BPF_LIST_EXCL_NODE);
/* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
* called on its fields, so init here
@@ -2045,40 +2053,55 @@ static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tai
n = tail ? h->prev : h->next;
node = container_of(n, struct bpf_list_node_kern, list_head);
- if (WARN_ON_ONCE(READ_ONCE(node->owner) != head))
+ if (!exclusive && WARN_ON_ONCE(READ_ONCE(node->owner) != head))
return NULL;
list_del_init(n);
- WRITE_ONCE(node->owner, NULL);
+ if (!exclusive)
+ WRITE_ONCE(node->owner, NULL);
return (struct bpf_list_node *)n;
}
-__bpf_kfunc struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_front_impl(struct bpf_list_head *head,
+ void *meta__ign)
{
- return __bpf_list_del(head, false);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_del(head, meta ? meta->record : NULL, false);
}
-__bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+__bpf_kfunc struct bpf_list_node *bpf_list_pop_back_impl(struct bpf_list_head *head,
+ void *meta__ign)
{
- return __bpf_list_del(head, true);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_del(head, meta ? meta->record : NULL, true);
}
-__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
- struct bpf_rb_node *node)
+__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove_impl(struct bpf_rb_root *root,
+ struct bpf_rb_node *node,
+ void *meta__ign)
{
struct bpf_rb_node_kern *node_internal = (struct bpf_rb_node_kern *)node;
struct rb_root_cached *r = (struct rb_root_cached *)root;
struct rb_node *n = &node_internal->rb_node;
+ struct btf_struct_meta *meta = meta__ign;
+ struct btf_record *rec;
+ bool exclusive;
+
+ rec = meta ? meta->record : NULL;
+ exclusive = btf_record_has_field(rec, BPF_RB_EXCL_NODE);
/* node_internal->owner != root implies either RB_EMPTY_NODE(n) or
* n is owned by some other tree. No need to check RB_EMPTY_NODE(n)
*/
- if (READ_ONCE(node_internal->owner) != root)
+ if (!exclusive && READ_ONCE(node_internal->owner) != root)
return NULL;
rb_erase_cached(n, r);
RB_CLEAR_NODE(n);
- WRITE_ONCE(node_internal->owner, NULL);
+ if (!exclusive)
+ WRITE_ONCE(node_internal->owner, NULL);
return (struct bpf_rb_node *)n;
}
@@ -2093,11 +2116,14 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
struct rb_node *parent = NULL, *n = &node->rb_node;
bpf_callback_t cb = (bpf_callback_t)less;
bool leftmost = true;
+ bool exclusive;
+
+ exclusive = btf_record_has_field(rec, BPF_RB_EXCL_NODE);
/* node->owner != NULL implies !RB_EMPTY_NODE(n), no need to separately
* check the latter
*/
- if (cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
+ if (!exclusive && cmpxchg(&node->owner, NULL, BPF_PTR_POISON)) {
/* Only called from BPF prog, no need to migrate_disable */
__bpf_obj_drop_impl((void *)n - off, rec, false);
return -EINVAL;
@@ -2115,7 +2141,8 @@ static int __bpf_rbtree_add(struct bpf_rb_root *root,
rb_link_node(n, parent, link);
rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
- WRITE_ONCE(node->owner, root);
+ if (!exclusive)
+ WRITE_ONCE(node->owner, root);
return 0;
}
@@ -2562,11 +2589,11 @@ BTF_ID_FLAGS(func, bpf_percpu_obj_drop_impl, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE | KF_RET_NULL | KF_RCU)
BTF_ID_FLAGS(func, bpf_list_push_front_impl)
BTF_ID_FLAGS(func, bpf_list_push_back_impl)
-BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_front_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_remove_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
@@ -11005,13 +11005,13 @@ enum special_kfunc_type {
KF_bpf_refcount_acquire_impl,
KF_bpf_list_push_front_impl,
KF_bpf_list_push_back_impl,
- KF_bpf_list_pop_front,
- KF_bpf_list_pop_back,
+ KF_bpf_list_pop_front_impl,
+ KF_bpf_list_pop_back_impl,
KF_bpf_cast_to_kern_ctx,
KF_bpf_rdonly_cast,
KF_bpf_rcu_read_lock,
KF_bpf_rcu_read_unlock,
- KF_bpf_rbtree_remove,
+ KF_bpf_rbtree_remove_impl,
KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
KF_bpf_dynptr_from_skb,
@@ -11031,11 +11031,11 @@ BTF_ID(func, bpf_obj_drop_impl)
BTF_ID(func, bpf_refcount_acquire_impl)
BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
-BTF_ID(func, bpf_list_pop_front)
-BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_pop_front_impl)
+BTF_ID(func, bpf_list_pop_back_impl)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
-BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_remove_impl)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_dynptr_from_skb)
@@ -11057,13 +11057,13 @@ BTF_ID(func, bpf_obj_drop_impl)
BTF_ID(func, bpf_refcount_acquire_impl)
BTF_ID(func, bpf_list_push_front_impl)
BTF_ID(func, bpf_list_push_back_impl)
-BTF_ID(func, bpf_list_pop_front)
-BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_list_pop_front_impl)
+BTF_ID(func, bpf_list_pop_back_impl)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
-BTF_ID(func, bpf_rbtree_remove)
+BTF_ID(func, bpf_rbtree_remove_impl)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
BTF_ID(func, bpf_dynptr_from_skb)
@@ -11382,14 +11382,14 @@ static bool is_bpf_list_api_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
- btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back_impl];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
- btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ btf_id == special_kfunc_list[KF_bpf_rbtree_remove_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}
@@ -11448,11 +11448,13 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
+ case BPF_LIST_EXCL_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
break;
case BPF_RB_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ case BPF_RB_EXCL_NODE:
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove_impl] ||
kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
break;
default:
@@ -11515,6 +11517,9 @@ __process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env,
return -EFAULT;
}
*head_field = field;
+ meta->arg_btf = field->graph_root.btf;
+ meta->arg_btf_id = field->graph_root.value_btf_id;
+
return 0;
}
@@ -11603,18 +11608,30 @@ static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
- BPF_LIST_HEAD, BPF_LIST_NODE,
- &meta->arg_list_head.field);
+ int err;
+
+ err = __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_NODE,
+ &meta->arg_list_head.field);
+
+ return err ? __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_LIST_HEAD, BPF_LIST_EXCL_NODE,
+ &meta->arg_list_head.field) : 0;
}
static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno,
struct bpf_kfunc_call_arg_meta *meta)
{
- return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
- BPF_RB_ROOT, BPF_RB_NODE,
- &meta->arg_rbtree_root.field);
+ int err;
+
+ err = __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_NODE,
+ &meta->arg_rbtree_root.field);
+
+ return err ? __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta,
+ BPF_RB_ROOT, BPF_RB_EXCL_NODE,
+ &meta->arg_rbtree_root.field) : 0;
}
/*
@@ -11948,7 +11965,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_RB_NODE:
- if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) {
+ if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove_impl]) {
if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) {
verbose(env, "rbtree_remove node input must be non-owning ref\n");
return -EINVAL;
@@ -12255,6 +12272,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove_impl])
+ insn_aux->kptr_struct_meta = btf_find_struct_meta(meta.arg_btf, meta.arg_btf_id);
+
if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
if (!bpf_jit_supports_exceptions()) {
verbose(env, "JIT does not support calling kfunc %s#%d\n",
@@ -12386,12 +12408,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_aux->kptr_struct_meta =
btf_find_struct_meta(meta.arg_btf,
meta.arg_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back_impl]) {
struct btf_field *field = meta.arg_list_head.field;
mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove_impl] ||
meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) {
struct btf_field *field = meta.arg_rbtree_root.field;
@@ -19526,6 +19548,21 @@ static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
*cnt = 4;
}
+static void __fixup_collection_remove_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+}
+
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
@@ -19614,6 +19651,24 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
__fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
node_offset_reg, insn, insn_buf, cnt);
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_pop_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_pop_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_remove_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ int struct_meta_reg = BPF_REG_2;
+
+ /* rbtree_remove has extra 'node' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_remove_impl])
+ struct_meta_reg = BPF_REG_3;
+
+ if (!kptr_struct_meta) {
+ verbose(env, "verifier internal error: kptr_struct_meta expected at insn_idx %d\n",
+ insn_idx);
+ return -EFAULT;
+ }
+
+ __fixup_collection_remove_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
@@ -91,22 +91,34 @@ extern int bpf_list_push_back_impl(struct bpf_list_head *head,
* Returns
* Pointer to bpf_list_node of deleted entry, or NULL if list is empty.
*/
-extern struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
+extern struct bpf_list_node *bpf_list_pop_front_impl(struct bpf_list_head *head,
+ void *meta) __ksym;
+
+/* Convenience macro to wrap over bpf_list_pop_front_impl */
+#define bpf_list_pop_front(head) bpf_list_pop_front_impl(head, NULL)
/* Description
* Remove the entry at the end of the BPF linked list.
* Returns
* Pointer to bpf_list_node of deleted entry, or NULL if list is empty.
*/
-extern struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
+extern struct bpf_list_node *bpf_list_pop_back_impl(struct bpf_list_head *head,
+ void *meta) __ksym;
+
+/* Convenience macro to wrap over bpf_list_pop_back_impl */
+#define bpf_list_pop_back(head) bpf_list_pop_back_impl(head, NULL)
/* Description
* Remove 'node' from rbtree with root 'root'
* Returns
* Pointer to the removed node, or NULL if 'root' didn't contain 'node'
*/
-extern struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
- struct bpf_rb_node *node) __ksym;
+extern struct bpf_rb_node *bpf_rbtree_remove_impl(struct bpf_rb_root *root,
+ struct bpf_rb_node *node,
+ void *meta) __ksym;
+
+/* Convenience macro to wrap over bpf_rbtree_remove_impl */
+#define bpf_rbtree_remove(head, node) bpf_rbtree_remove_impl(head, node, NULL)
/* Description
* Add 'node' to rbtree with root 'root' using comparator 'less'
@@ -132,6 +144,44 @@ extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *nod
*/
extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+/* Convenience single-ownership graph functions */
+int bpf_list_excl_push_front(struct bpf_list_head *head, struct bpf_list_excl_node *node)
+{
+ return bpf_list_push_front(head, (struct bpf_list_node *)node);
+}
+
+int bpf_list_excl_push_back(struct bpf_list_head *head, struct bpf_list_excl_node *node)
+{
+ return bpf_list_push_back(head, (struct bpf_list_node *)node);
+}
+
+struct bpf_list_excl_node *bpf_list_excl_pop_front(struct bpf_list_head *head)
+{
+ return (struct bpf_list_excl_node *)bpf_list_pop_front(head);
+}
+
+struct bpf_list_excl_node *bpf_list_excl_pop_back(struct bpf_list_head *head)
+{
+ return (struct bpf_list_excl_node *)bpf_list_pop_back(head);
+}
+
+struct bpf_rb_excl_node *bpf_rbtree_excl_remove(struct bpf_rb_root *root,
+ struct bpf_rb_excl_node *node)
+{
+ return (struct bpf_rb_excl_node *)bpf_rbtree_remove(root, (struct bpf_rb_node *)node);
+}
+
+int bpf_rbtree_excl_add(struct bpf_rb_root *root, struct bpf_rb_excl_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+{
+ return bpf_rbtree_add(root, (struct bpf_rb_node *)node, less);
+}
+
+struct bpf_rb_excl_node *bpf_rbtree_excl_first(struct bpf_rb_root *root)
+{
+ return (struct bpf_rb_excl_node *)bpf_rbtree_first(root);
+}
+
/* Description
* Allocates a percpu object of the type represented by 'local_type_id' in
* program BTF. User may use the bpf_core_type_id_local macro to pass the
This patch first teaches the verifier to accept exclusive nodes (bpf_list_excl_node and bpf_rb_excl_node) as valid graph nodes. Graph kfuncs can now skip ownership tracking and checks for graphs containing exclusive nodes, since we already make sure that an exclusive node cannot be owned by more than one collection at the same time. Graph kfuncs will use struct_meta to tell whether a node is exclusive or not. Therefore, we pass struct_meta as an additional argument to the graph remove kfuncs and let the verifier fix up the instruction.

The first user of exclusive-ownership nodes is sk_buff. In bpf qdisc, an sk_buff can then be enqueued into either a bpf_list or a bpf_rbtree. This significantly simplifies how users write the code and improves qdisc performance, as we no longer need to allocate local objects to store skb kptrs.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 include/linux/skbuff.h                        |   2 +
 kernel/bpf/btf.c                              |   1 +
 kernel/bpf/helpers.c                          |  63 +++++++----
 kernel/bpf/verifier.c                         | 101 ++++++++++++++----
 .../testing/selftests/bpf/bpf_experimental.h  |  58 +++++++++-
 5 files changed, 180 insertions(+), 45 deletions(-)
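
For illustration only, not part of the patch: a minimal sketch of how a BPF program could use the exclusive-ownership API on sk_buff, assuming a bpf qdisc struct_ops hook (not included in this series) hands the program a trusted skb pointer. The hook wiring, the private()/__contains() declarations for a kernel struct, and the names q_lock, q_fifo, fifo_enqueue and fifo_dequeue are assumptions of the sketch; only the bpf_list_excl_* wrappers and the skb->bpf_list field come from this patch.

#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"

#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)((void *)(ptr) - __builtin_offsetof(type, member)))
#endif

/* Same idiom as existing graph selftests: a global list head plus spin lock
 * in a private data section.
 */
#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))

/* A single FIFO holding kernel sk_buffs linked directly through their
 * embedded bpf_list_excl_node; no local wrapper object is allocated.
 * Whether __contains() is spelled exactly like this for a kernel struct
 * is an assumption of this sketch.
 */
private(FIFO) struct bpf_spin_lock q_lock;
private(FIFO) struct bpf_list_head q_fifo __contains(sk_buff, bpf_list);

/* Hypothetical enqueue path: link the skb itself into the list. */
static int fifo_enqueue(struct sk_buff *skb)
{
	int err;

	bpf_spin_lock(&q_lock);
	err = bpf_list_excl_push_back(&q_fifo, &skb->bpf_list);
	bpf_spin_unlock(&q_lock);
	return err;
}

/* Hypothetical dequeue path: pop the oldest node and recover the sk_buff. */
static struct sk_buff *fifo_dequeue(void)
{
	struct bpf_list_excl_node *node;

	bpf_spin_lock(&q_lock);
	node = bpf_list_excl_pop_front(&q_fifo);
	bpf_spin_unlock(&q_lock);
	if (!node)
		return NULL;
	return container_of(node, struct sk_buff, bpf_list);
}

char _license[] SEC("license") = "GPL";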