@@ -663,12 +663,15 @@ enum bpf_type_flag {
/* DYNPTR points to xdp_buff */
DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+ /* DYNPTR points to optval buffer of bpf_sockopt */
+ DYNPTR_TYPE_CGROUP_SOCKOPT = BIT(17 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
- | DYNPTR_TYPE_XDP)
+ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_CGROUP_SOCKOPT)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -1206,6 +1209,8 @@ enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_SKB,
/* Underlying data is a xdp_buff */
BPF_DYNPTR_TYPE_XDP,
+ /* Underlying data is for the optval of a cgroup sock */
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT,
};
int bpf_dynptr_check_size(u32 size);
@@ -7145,6 +7145,10 @@ struct bpf_sockopt {
enum bpf_sockopt_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* optval may be replaced through bpf_so_optval_install() */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is currently referenced by a dynptr from bpf_so_optval_from() */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
struct bpf_pidns_info {
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_CGROUP_SOCKOPT,
BTF_KFUNC_HOOK_MAX,
};
@@ -7846,6 +7847,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ return BTF_KFUNC_HOOK_CGROUP_SOCKOPT;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -1877,6 +1877,8 @@ static int filter_setsockopt_progs_cb(void *arg,
if (max_optlen < 0)
return max_optlen;
+ ctx->flags = BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
+
if (copy_from_user(ctx->optval, optval,
min(ctx->optlen, max_optlen)) != 0)
return -EFAULT;
@@ -1905,7 +1907,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ctx.optlen = *optlen;
ctx.optval = optval;
ctx.optval_end = optval + *optlen;
- ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
lock_sock(sk);
ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
@@ -1557,6 +1557,7 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
/* Source and destination may possibly overlap, hence use memmove to
* copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
* pointing to overlapping PTR_TO_MAP_VALUE regions.
@@ -1602,6 +1603,7 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
if (flags)
return -EINVAL;
/* Source and destination may possibly overlap, hence use memmove to
@@ -1654,6 +1656,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
@@ -2281,6 +2284,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return ptr->data + ptr->offset + offset;
case BPF_DYNPTR_TYPE_SKB:
if (buffer__opt)
@@ -2449,6 +2453,198 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+/* Allocate a zero-filled kernel buffer of @size bytes for a {set,get}sockopt
+ * BPF filter and expose it through @ptr__uninit as a
+ * BPF_DYNPTR_TYPE_CGROUP_SOCKOPT dynptr.
+ *
+ * This kfunc is only available in sleepable contexts. The dynptr should be
+ * released by bpf_so_optval_install() or bpf_so_optval_release().
+ *
+ * Returns @size on success, the error reported by bpf_dynptr_check_size()
+ * when @size is rejected, or -ENOMEM when the allocation fails. On every
+ * failure @ptr__uninit is left as a null dynptr.
+ */
+__bpf_kfunc int bpf_so_optval_alloc(struct bpf_sockopt *sopt, int size,
+				    struct bpf_dynptr_kern *ptr__uninit)
+{
+	void *buf;
+	int ret;
+
+	/* Start from a null dynptr so each error path leaves it invalid. */
+	bpf_dynptr_set_null(ptr__uninit);
+
+	ret = bpf_dynptr_check_size(size);
+	if (ret)
+		return ret;
+
+	buf = kzalloc(size, GFP_KERNEL);
+	if (buf) {
+		bpf_dynptr_init(ptr__uninit, buf,
+				BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0, size);
+		return size;
+	}
+
+	return -ENOMEM;
+}
+
+/* Make the buffer behind @ptr the optval of the sockopt context @sopt.
+ *
+ * This kfunc is only available in sleepable contexts. The dynptr should
+ * come from bpf_so_optval_alloc() (or from bpf_so_optval_from(), in which
+ * case the buffer already is the optval and only the dynptr is dropped).
+ * After a successful return the dynptr is invalid.
+ *
+ * Returns 0 on success, or -EINVAL when the context does not allow optval
+ * replacement, @ptr is not a cgroup sockopt dynptr, or @ptr has no data.
+ */
+__bpf_kfunc int bpf_so_optval_install(struct bpf_sockopt *sopt,
+				      struct bpf_dynptr_kern *ptr)
+{
+	struct bpf_sockopt_kern *ctx = (struct bpf_sockopt_kern *)sopt;
+	u32 len;
+
+	if (!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_REPLACE))
+		return -EINVAL;
+	if (bpf_dynptr_get_type(ptr) != BPF_DYNPTR_TYPE_CGROUP_SOCKOPT)
+		return -EINVAL;
+	if (!ptr->data)
+		return -EINVAL;
+
+	if (!(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) &&
+	    ctx->optval == ptr->data) {
+		/* The dynptr was set up by bpf_so_optval_from() and no
+		 * other buffer has been installed since, so optval already
+		 * is this buffer: just drop the dynptr reference.
+		 */
+		bpf_dynptr_set_null(ptr);
+		ctx->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+		return 0;
+	}
+
+	/* Free the previous optval only when it is a kernel buffer that no
+	 * dynptr still refers to; a still-referenced buffer is freed later
+	 * by bpf_so_optval_release() on that dynptr.
+	 */
+	if (ctx->optval &&
+	    !(ctx->flags & (BPF_SOCKOPT_FLAG_OPTVAL_USER |
+			    BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR)))
+		kfree(ctx->optval);
+
+	len = __bpf_dynptr_size(ptr);
+	ctx->optval = ptr->data;
+	ctx->optval_end = ptr->data + len;
+	ctx->optlen = len;
+	ctx->flags &= ~(BPF_SOCKOPT_FLAG_OPTVAL_USER |
+			BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR);
+
+	bpf_dynptr_set_null(ptr);
+
+	return 0;
+}
+
+/* Drop a cgroup sockopt dynptr without installing its buffer.
+ *
+ * When the dynptr still aliases the kernel optval of @sopt, only the
+ * "referenced by a dynptr" flag is cleared; otherwise the buffer behind
+ * the dynptr is freed. The dynptr is invalid afterwards.
+ *
+ * Returns 0 on success, or -EINVAL when @ptr is not a cgroup sockopt
+ * dynptr or has no data.
+ */
+__bpf_kfunc int bpf_so_optval_release(struct bpf_sockopt *sopt,
+				      struct bpf_dynptr_kern *ptr)
+{
+	struct bpf_sockopt_kern *ctx = (struct bpf_sockopt_kern *)sopt;
+	bool aliases_optval;
+
+	if (bpf_dynptr_get_type(ptr) != BPF_DYNPTR_TYPE_CGROUP_SOCKOPT)
+		return -EINVAL;
+	if (!ptr->data)
+		return -EINVAL;
+
+	/* True when the dynptr was set up by bpf_so_optval_from() and the
+	 * optval has not been overwritten by bpf_so_optval_install() yet.
+	 */
+	aliases_optval = ctx->optval == ptr->data &&
+			 !(ctx->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER);
+
+	if (aliases_optval) {
+		ctx->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+	} else {
+		kfree(ptr->data);
+	}
+
+	bpf_dynptr_set_null(ptr);
+
+	return 0;
+}
+
+/* Initialize a sockopt dynptr from a user or installed optval pointer.
+ *
+ * sopt->optval can be a user pointer or a kernel pointer. A kernel pointer
+ * can be a buffer allocated by the caller of the BPF program or a buffer
+ * installed by other BPF programs through bpf_so_optval_install().
+ *
+ * At most one dynptr shall be created by this function at any moment, or
+ * it will return -EINVAL. You can create another dynptr with this function
+ * after releasing the previous one with bpf_so_optval_release().
+ *
+ * A dynptr that is initialized when optval is a user pointer is an
+ * exception. In this case, the dynptr will point to a kernel buffer with
+ * the same content as the user buffer. To simplify the code, users should
+ * always make sure having only one dynptr initialized by this function at
+ * any moment.
+ *
+ * @size is the number of optval bytes to expose; 0 selects the smaller of
+ * optlen and the length of the optval range.
+ *
+ * Returns the number of bytes made available on success. On failure
+ * @ptr__uninit is a null dynptr and the return value is -EINVAL (size out
+ * of range, or a dynptr already references optval), the error reported by
+ * bpf_dynptr_check_size(), -ENOMEM, or -EFAULT when the user buffer cannot
+ * be read.
+ */
+__bpf_kfunc int bpf_so_optval_from(struct bpf_sockopt *sopt,
+				   struct bpf_dynptr_kern *ptr__uninit,
+				   unsigned int size)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+	int alloc_len;
+	int err;
+
+	bpf_dynptr_set_null(ptr__uninit);
+
+	if (size > (sopt_kern->optval_end - sopt_kern->optval))
+		return -EINVAL;
+
+	if (size == 0)
+		size = min(sopt_kern->optlen,
+			   (int)(sopt_kern->optval_end - sopt_kern->optval));
+
+	if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR)
+		return -EINVAL;
+
+	/* A negative optlen turns into a huge unsigned size above; refuse
+	 * it here instead of handing it to bpf_dynptr_init().
+	 */
+	err = bpf_dynptr_check_size(size);
+	if (err)
+		return err;
+
+	if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		/* The buffer is optlen bytes long, but never shorter than
+		 * the number of bytes copied into it: the program may have
+		 * lowered optlen below the size it asks for.
+		 */
+		alloc_len = sopt_kern->optlen;
+		if (alloc_len < (int)size)
+			alloc_len = size;
+
+		err = bpf_so_optval_alloc(sopt, alloc_len, ptr__uninit);
+		if (err < 0)
+			return err;
+
+		/* copy_from_user() returns the number of bytes left
+		 * uncopied, not an errno; on failure hand back a null
+		 * dynptr rather than a partially filled buffer.
+		 */
+		if (copy_from_user(ptr__uninit->data, sopt_kern->optval,
+				   size)) {
+			kfree(ptr__uninit->data);
+			bpf_dynptr_set_null(ptr__uninit);
+			return -EFAULT;
+		}
+
+		return size;
+	}
+
+	bpf_dynptr_init(ptr__uninit, sopt_kern->optval,
+			BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0,
+			size);
+	sopt_kern->flags |= BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR;
+
+	return size;
+}
+
+/**
+ * int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+ *                             void *ptr, u32 ptr__sz)
+ *     Description
+ *             Copy *ptr__sz* bytes from *ptr* to *sopt->optval*, which may
+ *             be a user space or a kernel buffer.
+ *     Return
+ *             The number of bytes copied on success, **-EINVAL** if
+ *             *ptr__sz* exceeds the optval buffer, or **-EFAULT** if the
+ *             user space buffer cannot be written.
+ */
+__bpf_kfunc int bpf_so_optval_copy_to_r(struct bpf_sockopt *sopt,
+					void *ptr, u32 ptr__sz)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+
+	if (ptr__sz > (sopt_kern->optval_end - sopt_kern->optval))
+		return -EINVAL;
+
+	if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		if (unlikely(copy_to_user(sopt_kern->optval, ptr, ptr__sz)))
+			return -EFAULT;
+	} else {
+		/* Use memmove() in case of optval & ptr overlap. */
+		memmove(sopt_kern->optval, ptr, ptr__sz);
+	}
+
+	/* Same success value whether optval is a user or kernel buffer. */
+	return ptr__sz;
+}
+
+/**
+ * int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+ *                           struct bpf_dynptr_kern *ptr)
+ *     Description
+ *             Copy the data visible through the dynptr *ptr* to
+ *             *sopt->optval*.
+ *     Return
+ *             >= 0 on success, or a negative error in case of failure
+ *             (**-EINVAL** for a null dynptr, otherwise the result of
+ *             bpf_so_optval_copy_to_r()).
+ */
+__bpf_kfunc int bpf_so_optval_copy_to(struct bpf_sockopt *sopt,
+				      struct bpf_dynptr_kern *ptr)
+{
+	if (!ptr->data)
+		return -EINVAL;
+
+	/* The visible window of a dynptr starts at data + offset, so
+	 * copying from data alone would read the wrong bytes once the
+	 * offset is non-zero.
+	 *
+	 * NOTE(review): dynptr types whose data is not plain memory
+	 * (skb/xdp) are not filtered here - confirm they cannot reach
+	 * this kfunc.
+	 */
+	return bpf_so_optval_copy_to_r(sopt, ptr->data + ptr->offset,
+				       __bpf_dynptr_size(ptr));
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2517,6 +2713,12 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
BTF_SET8_START(cgroup_common_btf_ids)
BTF_ID_FLAGS(func, bpf_copy_to_user, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_copy_to_r, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_copy_to, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_alloc, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_install, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_release, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_so_optval_from, KF_SLEEPABLE)
BTF_SET8_END(cgroup_common_btf_ids)
static const struct btf_kfunc_id_set cgroup_kfunc_set = {
@@ -745,6 +745,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return "cgroup_sockopt";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -826,6 +828,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -842,6 +846,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return 0;
}
@@ -849,7 +855,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF ||
+ type == BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -10271,6 +10278,10 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_sockopt_alloc_optval,
+ KF_bpf_so_optval_install,
+ KF_bpf_so_optval_release,
+ KF_bpf_so_optval_from,
};
BTF_SET_START(special_kfunc_set)
@@ -10291,6 +10302,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_so_optval_alloc)
+BTF_ID(func, bpf_so_optval_install)
+BTF_ID(func, bpf_so_optval_release)
+BTF_ID(func, bpf_so_optval_from)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10313,6 +10328,10 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_so_optval_alloc)
+BTF_ID(func, bpf_so_optval_install)
+BTF_ID(func, bpf_so_optval_release)
+BTF_ID(func, bpf_so_optval_from)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10966,6 +10985,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
arg_type |= OBJ_RELEASE;
break;
case KF_ARG_PTR_TO_DYNPTR:
+ if (meta->func_id == special_kfunc_list[KF_bpf_so_optval_install] ||
+ meta->func_id == special_kfunc_list[KF_bpf_so_optval_release]) {
+ int ref_obj_id = dynptr_ref_obj_id(env, reg);
+
+ if (ref_obj_id < 0) {
+ verbose(env, "R%d is not a valid dynptr\n", regno);
+ return -EINVAL;
+ }
+
+ /* Required by check_func_arg_reg_off() */
+ arg_type |= ARG_PTR_TO_DYNPTR | OBJ_RELEASE;
+ meta->release_regno = regno;
+ }
+ break;
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
@@ -11053,6 +11086,10 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
return -EFAULT;
}
+ } else if ((meta->func_id == special_kfunc_list[KF_bpf_sockopt_alloc_optval] ||
+ meta->func_id == special_kfunc_list[KF_bpf_so_optval_from]) &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ dynptr_arg_type |= DYNPTR_TYPE_CGROUP_SOCKOPT;
}
ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
@@ -11361,7 +11398,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ verbose(env, "release refcounted PTR_TO_BTF_ID %s\n",
+ meta.func_name);
+ if (meta.func_id == special_kfunc_list[KF_bpf_so_optval_install] ||
+ meta.func_id == special_kfunc_list[KF_bpf_so_optval_release])
+ err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]);
+ else
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id);
@@ -7145,6 +7145,10 @@ struct bpf_sockopt {
enum bpf_sockopt_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* optval may be replaced through bpf_so_optval_install() */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is currently referenced by a dynptr from bpf_so_optval_from() */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
struct bpf_pidns_info {