
[bpf-next,v3,02/16] bpf: Parse bpf_dynptr in map key

Message ID 20250327083455.848708-3-houtao@huaweicloud.com (mailing list archive)
State New
Delegated to: BPF
Series Support dynptr key for hash map

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next-0
bpf/vmtest-bpf-net-PR fail merge-conflict
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18

Commit Message

Hou Tao March 27, 2025, 8:34 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

To support variable-length keys or strings in a map key, use bpf_dynptr
to represent these variable-length objects and save the bpf_dynptr
fields in the map key. As shown in the example below, a map key with an
integer and a string is defined:

	struct pid_name {
		int pid;
		struct bpf_dynptr name;
	};

The bpf_dynptr in the map key can also be contained indirectly through a
nested struct, as shown below:

	struct pid_name_time {
		struct pid_name process;
		unsigned long long time;
	};

It is also fine to have multiple bpf_dynptrs in the map key, as shown
below. The maximum number of bpf_dynptrs in a map key is currently
limited to 2; the limit can be lifted if necessary.

	struct pid_name_tag {
		struct pid_name process;
		struct bpf_dynptr tag;
	};

If the whole map key is a bpf_dynptr, the key can be defined either as a
struct wrapping a single bpf_dynptr or by using struct bpf_dynptr
directly as the map key type:

	struct map_key {
		struct bpf_dynptr name;
	};

A bpf program uses bpf_dynptr_init() to initialize the dynptr part of
the map key, and a userspace application uses bpf_dynptr_user_init() or
a similar API to initialize the dynptr. Just like kptrs in the map
value, the bpf_dynptr field in the map key can also be defined in a
nested struct which is contained in the map key struct.
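
For illustration only (not part of this patch), such a map could be
declared in a bpf program with libbpf's BTF map-definition macros,
using the pid_name struct above as the key type; the map type, size
and map name below are arbitrary:

	/* Sketch: a hash map keyed by struct pid_name. Requires
	 * <bpf/bpf_helpers.h>. The key's name field would be initialized
	 * with bpf_dynptr_init() (per this series) before lookup.
	 */
	struct {
		__uint(type, BPF_MAP_TYPE_HASH);
		__uint(max_entries, 1024);
		__type(key, struct pid_name);
		__type(value, __u64);
	} pid_names SEC(".maps");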

The patch updates map_create() accordingly to parse these bpf_dynptr
fields in the map key, just as it does for the other special fields in
the map value. The parsed fields are saved in the newly-added key_record
field of bpf_map. Since both key_record and key_size are used during the
lookup procedure, key_record is placed in the same cacheline as key_size
and the cold map_extra field is moved to the next cacheline. At present,
only the BPF_MAP_TYPE_HASH map will support bpf_dynptr in the map key,
and the support will be enabled later when its implementation is ready.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 include/linux/bpf.h     | 12 +++++++++++-
 kernel/bpf/btf.c        |  4 ++++
 kernel/bpf/map_in_map.c | 21 +++++++++++++++++----
 kernel/bpf/syscall.c    | 40 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 5 deletions(-)

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0b65c98d8b7d5..e25ff78f1fabf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -271,10 +271,13 @@  struct bpf_map {
 	u32 key_size;
 	u32 value_size;
 	u32 max_entries;
-	u64 map_extra; /* any per-map-type extra fields */
 	u32 map_flags;
 	u32 id;
+	/* BTF record for special fields in map Key. Only allow bpf_dynptr */
+	struct btf_record *key_record;
+	/* BTF record for special fields in map Value. Disallow bpf_dynptr. */
 	struct btf_record *record;
+	u64 map_extra; /* any per-map-type extra fields */
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
@@ -341,6 +344,8 @@  static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_rb_node";
 	case BPF_REFCOUNT:
 		return "bpf_refcount";
+	case BPF_DYNPTR:
+		return "bpf_dynptr";
 	default:
 		WARN_ON_ONCE(1);
 		return "unknown";
@@ -373,6 +378,8 @@  static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return sizeof(struct bpf_refcount);
+	case BPF_DYNPTR:
+		return sizeof(struct bpf_dynptr);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -405,6 +412,8 @@  static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_rb_node);
 	case BPF_REFCOUNT:
 		return __alignof__(struct bpf_refcount);
+	case BPF_DYNPTR:
+		return __alignof__(struct bpf_dynptr);
 	default:
 		WARN_ON_ONCE(1);
 		return 0;
@@ -436,6 +445,7 @@  static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
 	case BPF_UPTR:
+	case BPF_DYNPTR:
 		break;
 	default:
 		WARN_ON_ONCE(1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 1054a1e27e9d3..c3c28ecf6bf09 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3513,6 +3513,7 @@  static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
 	field_mask_test_name(BPF_RB_NODE,   "bpf_rb_node");
 	field_mask_test_name(BPF_REFCOUNT,  "bpf_refcount");
+	field_mask_test_name(BPF_DYNPTR,    "bpf_dynptr");
 
 	/* Only return BPF_KPTR when all other types with matchable names fail */
 	if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
@@ -3551,6 +3552,7 @@  static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
 		case BPF_UPTR:
 		case BPF_LIST_HEAD:
 		case BPF_RB_ROOT:
+		case BPF_DYNPTR:
 			break;
 		default:
 			return -EINVAL;
@@ -3674,6 +3676,7 @@  static int btf_find_field_one(const struct btf *btf,
 	case BPF_LIST_NODE:
 	case BPF_RB_NODE:
 	case BPF_REFCOUNT:
+	case BPF_DYNPTR:
 		ret = btf_find_struct(btf, var_type, off, sz, field_type,
 				      info_cnt ? &info[0] : &tmp);
 		if (ret < 0)
@@ -4037,6 +4040,7 @@  struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			break;
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
+		case BPF_DYNPTR:
 			break;
 		default:
 			ret = -EFAULT;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 645bd30bc9a9d..564ebcc857564 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 	struct bpf_map *inner_map, *inner_map_meta;
 	u32 inner_map_meta_size;
 	CLASS(fd, f)(inner_map_ufd);
+	int ret;
 
 	inner_map = __bpf_map_get(f);
 	if (IS_ERR(inner_map))
@@ -45,10 +46,15 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		 * invalid/empty/valid, but ERR_PTR in case of errors. During
 		 * equality NULL or IS_ERR is equivalent.
 		 */
-		struct bpf_map *ret = ERR_CAST(inner_map_meta->record);
-		kfree(inner_map_meta);
-		return ret;
+		ret = PTR_ERR(inner_map_meta->record);
+		goto free_meta;
 	}
+	inner_map_meta->key_record = btf_record_dup(inner_map->key_record);
+	if (IS_ERR(inner_map_meta->key_record)) {
+		ret = PTR_ERR(inner_map_meta->key_record);
+		goto free_record;
+	}
+
 	/* Note: We must use the same BTF, as we also used btf_record_dup above
 	 * which relies on BTF being same for both maps, as some members like
 	 * record->fields.list_head have pointers like value_rec pointing into
@@ -71,6 +77,12 @@  struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1;
 	}
 	return inner_map_meta;
+
+free_record:
+	btf_record_free(inner_map_meta->record);
+free_meta:
+	kfree(inner_map_meta);
+	return ERR_PTR(ret);
 }
 
 void bpf_map_meta_free(struct bpf_map *map_meta)
@@ -88,7 +100,8 @@  bool bpf_map_meta_equal(const struct bpf_map *meta0,
 		meta0->key_size == meta1->key_size &&
 		meta0->value_size == meta1->value_size &&
 		meta0->map_flags == meta1->map_flags &&
-		btf_record_equal(meta0->record, meta1->record);
+		btf_record_equal(meta0->record, meta1->record) &&
+		btf_record_equal(meta0->key_record, meta1->key_record);
 }
 
 void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 9794446bc8c6c..9ded3ba82d356 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -669,6 +669,7 @@  void btf_record_free(struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_DYNPTR:
 			/* Nothing to release */
 			break;
 		default:
@@ -682,7 +683,9 @@  void btf_record_free(struct btf_record *rec)
 void bpf_map_free_record(struct bpf_map *map)
 {
 	btf_record_free(map->record);
+	btf_record_free(map->key_record);
 	map->record = NULL;
+	map->key_record = NULL;
 }
 
 struct btf_record *btf_record_dup(const struct btf_record *rec)
@@ -722,6 +725,7 @@  struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
 		case BPF_WORKQUEUE:
+		case BPF_DYNPTR:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -841,6 +845,8 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		case BPF_RB_NODE:
 		case BPF_REFCOUNT:
 			break;
+		case BPF_DYNPTR:
+			break;
 		default:
 			WARN_ON_ONCE(1);
 			continue;
@@ -850,6 +856,7 @@  void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 
 static void bpf_map_free(struct bpf_map *map)
 {
+	struct btf_record *key_rec = map->key_record;
 	struct btf_record *rec = map->record;
 	struct btf *btf = map->btf;
 
@@ -870,6 +877,7 @@  static void bpf_map_free(struct bpf_map *map)
 	 * eventually calls bpf_map_free_meta, since inner_map_meta is only a
 	 * template bpf_map struct used during verification.
 	 */
+	btf_record_free(key_rec);
 	btf_record_free(rec);
 	/* Delay freeing of btf for maps, as map_free callback may need
 	 * struct_meta info which will be freed with btf_put().
@@ -1209,6 +1217,8 @@  int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
+#define MAX_DYNPTR_CNT_IN_MAP_KEY 2
+
 static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 			 const struct btf *btf, u32 btf_key_id, u32 btf_value_id)
 {
@@ -1231,6 +1241,36 @@  static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	/* Key BTF type can't be data section */
+	if (btf_type_is_dynptr(btf, key_type))
+		map->key_record = btf_new_bpf_dynptr_record();
+	else if (__btf_type_is_struct(key_type))
+		map->key_record = btf_parse_fields(btf, key_type, BPF_DYNPTR, map->key_size);
+	else
+		map->key_record = NULL;
+	if (!IS_ERR_OR_NULL(map->key_record)) {
+		if (map->key_record->cnt > MAX_DYNPTR_CNT_IN_MAP_KEY) {
+			ret = -E2BIG;
+			goto free_map_tab;
+		}
+		if (!bpf_token_capable(token, CAP_BPF)) {
+			ret = -EPERM;
+			goto free_map_tab;
+		}
+		/* Disallow key with dynptr for special map */
+		if (map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) {
+			ret = -EACCES;
+			goto free_map_tab;
+		}
+		/* Enable for BPF_MAP_TYPE_HASH later */
+		ret = -EOPNOTSUPP;
+		goto free_map_tab;
+	} else if (IS_ERR(map->key_record)) {
+		/* Return an error early even the bpf program doesn't use it */
+		ret = PTR_ERR(map->key_record);
+		goto free_map_tab;
+	}
+
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_RES_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
 				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,