@@ -663,12 +663,15 @@ enum bpf_type_flag {
/* DYNPTR points to xdp_buff */
DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS),
+ /* DYNPTR points to optval buffer of bpf_sockopt */
+ DYNPTR_TYPE_CGROUP_SOCKOPT = BIT(17 + BPF_BASE_TYPE_BITS),
+
__BPF_TYPE_FLAG_MAX,
__BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1,
};
#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \
- | DYNPTR_TYPE_XDP)
+ | DYNPTR_TYPE_XDP | DYNPTR_TYPE_CGROUP_SOCKOPT)
/* Max number of base types. */
#define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS)
@@ -1208,6 +1211,8 @@ enum bpf_dynptr_type {
BPF_DYNPTR_TYPE_SKB,
/* Underlying data is a xdp_buff */
BPF_DYNPTR_TYPE_XDP,
+ /* Underlying data is for the optval of a cgroup sock */
+ BPF_DYNPTR_TYPE_CGROUP_SOCKOPT,
};
int bpf_dynptr_check_size(u32 size);
@@ -1347,6 +1347,10 @@ struct bpf_sockopt_kern {
enum bpf_sockopt_kern_flags {
/* optval is a pointer to user space memory */
BPF_SOCKOPT_FLAG_OPTVAL_USER = (1U << 0),
+ /* optval may be replaced: a write to a user optval first installs a kernel copy */
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE = (1U << 1),
+ /* optval is referenced by a dynptr; NOTE(review): confirm where this is set/tested */
+ BPF_SOCKOPT_FLAG_OPTVAL_DYNPTR = (1U << 2),
};
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);
@@ -217,6 +217,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_SOCKET_FILTER,
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
+ BTF_KFUNC_HOOK_CGROUP_SOCKOPT,
BTF_KFUNC_HOOK_MAX,
};
@@ -7846,6 +7847,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_LWT;
case BPF_PROG_TYPE_NETFILTER:
return BTF_KFUNC_HOOK_NETFILTER;
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ return BTF_KFUNC_HOOK_CGROUP_SOCKOPT;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -1866,6 +1866,8 @@ static int filter_setsockopt_progs_cb(void *arg,
if (max_optlen < 0)
return max_optlen;
+ ctx->flags = BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
+
if (copy_from_user(ctx->optval, optval,
min(ctx->optlen, max_optlen)) != 0)
return -EFAULT;
@@ -1894,7 +1896,8 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
ctx.optlen = *optlen;
ctx.optval = optval;
ctx.optval_end = optval + *optlen;
- ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER;
+ ctx.flags = BPF_SOCKOPT_FLAG_OPTVAL_USER |
+ BPF_SOCKOPT_FLAG_OPTVAL_REPLACE;
lock_sock(sk);
ret = bpf_prog_run_array_cg_cb(&cgrp->bpf, CGROUP_SETSOCKOPT,
@@ -1519,6 +1519,51 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
+/* Write @len bytes from @src into the sockopt optval at @offset.
+ *
+ * - Kernel optval: plain memcpy().
+ * - User optval without BPF_SOCKOPT_FLAG_OPTVAL_REPLACE: written through
+ *   with copy_to_user().
+ * - User optval with BPF_SOCKOPT_FLAG_OPTVAL_REPLACE: the whole user
+ *   buffer is first copied into a kmalloc()ed kernel buffer of the same
+ *   size, which then replaces sopt->optval (BPF_SOCKOPT_FLAG_OPTVAL_USER
+ *   is cleared) before the write is applied.
+ *   NOTE(review): freeing the installed buffer is left to code not
+ *   shown here - confirm the owner.
+ *
+ * Returns 0 on success (also when @src is NULL), -E2BIG if the range
+ * [@offset, @offset + @len) does not fit in the optval buffer, -ENOMEM
+ * if the kernel copy cannot be allocated, or -EFAULT if a user copy
+ * faults.
+ */
+static int __bpf_sockopt_store_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+				     void *src, u32 len)
+{
+	u32 buf_len = sopt->optval_end - sopt->optval;
+	void *buf;
+
+	if (!src)
+		return 0;
+
+	/* Never write past the end of the current optval buffer. */
+	if (offset > buf_len || len > buf_len - offset)
+		return -E2BIG;
+
+	if (sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		if (!(sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_REPLACE))
+			return copy_to_user(sopt->optval + offset, src, len) ?
+				-EFAULT : 0;
+
+		buf = kmalloc(buf_len, GFP_KERNEL);
+		if (!buf)
+			return -ENOMEM;
+		if (copy_from_user(buf, sopt->optval, buf_len)) {
+			kfree(buf);
+			return -EFAULT;
+		}
+
+		/* The kernel copy keeps the full size of the user buffer,
+		 * not just the length of this write.
+		 */
+		sopt->optval = buf;
+		sopt->optval_end = buf + buf_len;
+		sopt->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	}
+
+	memcpy(sopt->optval + offset, src, len);
+
+	return 0;
+}
+
+/* Copy @len bytes starting at @offset of the sockopt optval into @dst.
+ *
+ * A user-space optval (BPF_SOCKOPT_FLAG_OPTVAL_USER set) is read with
+ * copy_from_user(); otherwise optval is read with memcpy().
+ * Returns 0 on success or -EFAULT if the user copy faults. The range is
+ * not checked against optval_end here.
+ */
+static int __bpf_sockopt_load_bytes(struct bpf_sockopt_kern *sopt, u32 offset,
+				    void *dst, u32 len)
+{
+	bool from_user = sopt->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER;
+
+	if (!from_user) {
+		memcpy(dst, sopt->optval + offset, len);
+		return 0;
+	}
+
+	if (copy_from_user(dst, sopt->optval + offset, len))
+		return -EFAULT;
+
+	return 0;
+}
+
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
@@ -1547,6 +1592,8 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
case BPF_DYNPTR_TYPE_XDP:
return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return __bpf_sockopt_load_bytes(src->data, src->offset + offset, dst, len);
default:
WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1597,6 +1644,10 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v
if (flags)
return -EINVAL;
return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return __bpf_sockopt_store_bytes(dst->data,
+ dst->offset + offset,
+ src, len);
default:
WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
return -EFAULT;
@@ -1634,6 +1685,7 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
switch (type) {
case BPF_DYNPTR_TYPE_LOCAL:
case BPF_DYNPTR_TYPE_RINGBUF:
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
return (unsigned long)(ptr->data + ptr->offset + offset);
case BPF_DYNPTR_TYPE_SKB:
case BPF_DYNPTR_TYPE_XDP:
@@ -2278,6 +2330,8 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer__opt, len, false);
return buffer__opt;
}
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return NULL;
default:
WARN_ONCE(true, "unknown dynptr type %d\n", type);
return NULL;
@@ -2429,6 +2483,80 @@ __bpf_kfunc void bpf_rcu_read_unlock(void)
rcu_read_unlock();
}
+/* Release a sockopt dynptr.
+ *
+ * @ptr is reset with bpf_dynptr_set_null(); @sopt is not accessed.
+ * Always returns 0.
+ */
+__bpf_kfunc int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt,
+					   struct bpf_dynptr_kern *ptr)
+{
+	bpf_dynptr_set_null(ptr);
+
+	return 0;
+}
+
+/* Initialize a sockopt dynptr covering the current optval buffer.
+ *
+ * sopt->optval can be a user pointer or a kernel pointer. A kernel
+ * pointer can be a buffer allocated by the caller of the BPF program or
+ * a kernel buffer installed by an earlier dynptr write or by
+ * bpf_sockopt_grow_to().
+ *
+ * The dynptr stores @sopt itself as its data pointer; its size is the
+ * current optval length (optval_end - optval).
+ *
+ * At most one dynptr should be created by this function at any moment.
+ * Release the previous one with bpf_sockopt_dynptr_release() before
+ * creating another.
+ * NOTE(review): this function does not itself reject a second dynptr;
+ * confirm where that restriction is enforced.
+ *
+ * Return: the dynptr size on success. If the optval length is larger
+ * than a dynptr can describe, @ptr__uninit is set to the null dynptr and
+ * the error from bpf_dynptr_check_size() is returned.
+ */
+__bpf_kfunc int bpf_dynptr_from_sockopt(struct bpf_sockopt *sopt,
+					struct bpf_dynptr_kern *ptr__uninit)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+	unsigned int size;
+	int err;
+
+	size = sopt_kern->optval_end - sopt_kern->optval;
+
+	/* optlen can come straight from user space, so validate the size
+	 * before handing it to bpf_dynptr_init().
+	 */
+	err = bpf_dynptr_check_size(size);
+	if (err) {
+		bpf_dynptr_set_null(ptr__uninit);
+		return err;
+	}
+
+	bpf_dynptr_init(ptr__uninit, sopt,
+			BPF_DYNPTR_TYPE_CGROUP_SOCKOPT, 0,
+			size);
+
+	return size;
+}
+
+/* Grow the optval buffer of @sopt to @newsize bytes.
+ *
+ * Nothing is done when @newsize does not exceed sopt->optlen. Otherwise:
+ * - user optval: a new kernel buffer is allocated, the existing user
+ *   content is copied into it and BPF_SOCKOPT_FLAG_OPTVAL_USER is
+ *   cleared;
+ * - kernel optval: the buffer is resized with krealloc().
+ *   NOTE(review): this assumes the current kernel optval was obtained
+ *   from kmalloc() - confirm for buffers supplied by the caller.
+ *
+ * Bytes beyond the previous content are zeroed so that the program
+ * cannot read uninitialized kernel memory through the new buffer.
+ *
+ * Return: 0 on success, -EINVAL if @newsize exceeds DYNPTR_MAX_SIZE,
+ * -ENOMEM on allocation failure, -EFAULT if the user copy faults.
+ */
+__bpf_kfunc int bpf_sockopt_grow_to(struct bpf_sockopt *sopt,
+				    u32 newsize)
+{
+	struct bpf_sockopt_kern *sopt_kern = (struct bpf_sockopt_kern *)sopt;
+	u32 oldsize = sopt_kern->optval_end - sopt_kern->optval;
+	void *newoptval;
+	u32 keep;
+
+	if (newsize > DYNPTR_MAX_SIZE)
+		return -EINVAL;
+
+	if (newsize <= sopt_kern->optlen)
+		return 0;
+
+	/* optlen may be smaller than the buffer, so the old content can be
+	 * larger than @newsize; never copy more than the new buffer holds.
+	 */
+	keep = oldsize < newsize ? oldsize : newsize;
+
+	if (sopt_kern->flags & BPF_SOCKOPT_FLAG_OPTVAL_USER) {
+		newoptval = kmalloc(newsize, GFP_KERNEL);
+		if (!newoptval)
+			return -ENOMEM;
+		/* copy_from_user() returns the number of bytes NOT copied */
+		if (copy_from_user(newoptval, sopt_kern->optval, keep)) {
+			kfree(newoptval);
+			return -EFAULT;
+		}
+		sopt_kern->flags &= ~BPF_SOCKOPT_FLAG_OPTVAL_USER;
+	} else {
+		newoptval = krealloc(sopt_kern->optval, newsize, GFP_KERNEL);
+		if (!newoptval)
+			return -ENOMEM;
+	}
+
+	if (newsize > keep)
+		memset(newoptval + keep, 0, newsize - keep);
+
+	sopt_kern->optval = newoptval;
+	sopt_kern->optval_end = newoptval + newsize;
+
+	return 0;
+}
+
__diag_pop();
BTF_SET8_START(generic_btf_ids)
@@ -2494,6 +2622,17 @@ static const struct btf_kfunc_id_set common_kfunc_set = {
.set = &common_btf_ids,
};
+/* kfuncs operating on sockopt dynptrs; each is flagged KF_SLEEPABLE. */
+BTF_SET8_START(cgroup_common_btf_ids)
+BTF_ID_FLAGS(func, bpf_sockopt_dynptr_release, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_dynptr_from_sockopt, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_sockopt_grow_to, KF_SLEEPABLE)
+BTF_SET8_END(cgroup_common_btf_ids)
+
+/* Registered for BPF_PROG_TYPE_CGROUP_SOCKOPT programs in kfunc_init(). */
+static const struct btf_kfunc_id_set cgroup_kfunc_set = {
+	.set   = &cgroup_common_btf_ids,
+	.owner = THIS_MODULE,
+};
+
static int __init kfunc_init(void)
{
int ret;
@@ -2513,6 +2652,7 @@ static int __init kfunc_init(void)
ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCKOPT, &cgroup_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
@@ -755,6 +755,8 @@ static const char *dynptr_type_str(enum bpf_dynptr_type type)
return "skb";
case BPF_DYNPTR_TYPE_XDP:
return "xdp";
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return "cgroup_sockopt";
case BPF_DYNPTR_TYPE_INVALID:
return "<invalid>";
default:
@@ -836,6 +838,8 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_SKB;
case DYNPTR_TYPE_XDP:
return BPF_DYNPTR_TYPE_XDP;
+ case DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
@@ -852,6 +856,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
return DYNPTR_TYPE_SKB;
case BPF_DYNPTR_TYPE_XDP:
return DYNPTR_TYPE_XDP;
+ case BPF_DYNPTR_TYPE_CGROUP_SOCKOPT:
+ return DYNPTR_TYPE_CGROUP_SOCKOPT;
default:
return 0;
}
@@ -859,7 +865,8 @@ static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
- return type == BPF_DYNPTR_TYPE_RINGBUF;
+ return type == BPF_DYNPTR_TYPE_RINGBUF ||
+ type == BPF_DYNPTR_TYPE_CGROUP_SOCKOPT;
}
static void __mark_dynptr_reg(struct bpf_reg_state *reg,
@@ -10300,6 +10307,8 @@ enum special_kfunc_type {
KF_bpf_dynptr_slice,
KF_bpf_dynptr_slice_rdwr,
KF_bpf_dynptr_clone,
+ KF_bpf_sockopt_dynptr_release,
+ KF_bpf_dynptr_from_sockopt,
};
BTF_SET_START(special_kfunc_set)
@@ -10320,6 +10329,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
@@ -10342,6 +10353,8 @@ BTF_ID(func, bpf_dynptr_from_xdp)
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
+BTF_ID(func, bpf_sockopt_dynptr_release)
+BTF_ID(func, bpf_dynptr_from_sockopt)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -10995,6 +11008,19 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
arg_type |= OBJ_RELEASE;
break;
case KF_ARG_PTR_TO_DYNPTR:
+ if (meta->func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release]) {
+ int ref_obj_id = dynptr_ref_obj_id(env, reg);
+
+ if (ref_obj_id < 0) {
+ verbose(env, "R%d is not a valid dynptr\n", regno);
+ return -EINVAL;
+ }
+
+ /* Required by check_func_arg_reg_off() */
+ arg_type |= ARG_PTR_TO_DYNPTR | OBJ_RELEASE;
+ meta->release_regno = regno;
+ }
+ break;
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
@@ -11082,6 +11108,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
verbose(env, "verifier internal error: missing ref obj id for parent of clone\n");
return -EFAULT;
}
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_sockopt] &&
+ (dynptr_arg_type & MEM_UNINIT)) {
+ dynptr_arg_type |= DYNPTR_TYPE_CGROUP_SOCKOPT;
}
ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type, clone_ref_obj_id);
@@ -11390,7 +11419,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
if (meta.release_regno) {
- err = release_reference(env, regs[meta.release_regno].ref_obj_id);
+ verbose(env, "release refcounted PTR_TO_BTF_ID %s\n",
+ meta.func_name);
+ if (meta.func_id == special_kfunc_list[KF_bpf_sockopt_dynptr_release])
+ err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
+ else
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, meta.func_id);