Message ID | 20230815174712.660956-6-thinker.li@gmail.com (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Series | Sleepable BPF programs on cgroup {get,set}sockopt |
On 08/15, thinker.li@gmail.com wrote: > From: Kui-Feng Lee <thinker.li@gmail.com> > > Do the same test as non-sleepable ones. > > Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com> > --- > .../testing/selftests/bpf/bpf_experimental.h | 36 +++ > tools/testing/selftests/bpf/bpf_kfuncs.h | 41 +++ > .../selftests/bpf/prog_tests/sockopt_sk.c | 112 +++++++- > .../testing/selftests/bpf/progs/sockopt_sk.c | 257 ++++++++++++++++++ > .../selftests/bpf/verifier/sleepable.c | 2 +- > 5 files changed, 445 insertions(+), 3 deletions(-) > > diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h > index 209811b1993a..9b5dfefe65dc 100644 > --- a/tools/testing/selftests/bpf/bpf_experimental.h > +++ b/tools/testing/selftests/bpf/bpf_experimental.h > @@ -131,4 +131,40 @@ extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *nod > */ > extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; > > +/* > + * Description > + * Copy data from *ptr* to *sopt->optval*. > + * Return > + * >= 0 on success, or a negative error in case of failure. > + */ > +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > +/* Description > + * Allocate a buffer of 'size' bytes for being installed as optval. > + * Returns > + * > 0 on success, the size of the allocated buffer > + * -ENOMEM or -EINVAL on failure > + */ > +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, > + struct bpf_dynptr *ptr__uninit) __ksym; > + > +/* Description > + * Install the buffer pointed to by 'ptr' as optval. > + * Returns > + * 0 on success > + * -EINVAL if the buffer is too small > + */ > +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > +/* Description > + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. > + * Returns > + * 0 on success > + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc > + */ > +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > #endif > diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h > index 642dda0e758a..772040225257 100644 > --- a/tools/testing/selftests/bpf/bpf_kfuncs.h > +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h > @@ -41,4 +41,45 @@ extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym; > extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym; > extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym; > > +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > +/* Description > + * Allocate a buffer of 'size' bytes for being installed as optval. > + * Returns > + * > 0 on success, the size of the allocated buffer > + * -ENOMEM or -EINVAL on failure > + */ > +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, > + struct bpf_dynptr *ptr__uninit) __ksym; > + > +/* Description > + * Install the buffer pointed to by 'ptr' as optval. > + * Returns > + * 0 on success > + * -EINVAL if the buffer is too small > + */ > +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > +/* Description > + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. 
> + * Returns > + * 0 on success > + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc > + */ > +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr) __ksym; > + > +/* Description > + * Initialize a dynptr to access the content of optval passing > + * to {get,set}sockopt()s. > + * Returns > + * > 0 on success, the size of the allocated buffer > + * -ENOMEM or -EINVAL on failure > + */ > +extern int bpf_sockopt_dynptr_from(struct bpf_sockopt *sopt, > + struct bpf_dynptr *ptr__uninit, > + unsigned int size) __ksym; > + > #endif > diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c > index 05d0e07da394..85255648747f 100644 > --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c > +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c > @@ -92,6 +92,7 @@ static int getsetsockopt(void) > } > if (buf.u8[0] != 0x01) { > log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]); > + log_err("optlen %d", optlen); > goto err; > } > > @@ -220,7 +221,7 @@ static int getsetsockopt(void) > return -1; > } > > -static void run_test(int cgroup_fd) > +static void run_test_nonsleepable(int cgroup_fd) > { > struct sockopt_sk *skel; > > @@ -246,6 +247,106 @@ static void run_test(int cgroup_fd) > sockopt_sk__destroy(skel); > } > > +static void run_test_nonsleepable_mixed(int cgroup_fd) > +{ > + struct sockopt_sk *skel; > + > + skel = sockopt_sk__open_and_load(); > + if (!ASSERT_OK_PTR(skel, "skel_load")) > + goto cleanup; > + > + skel->bss->page_size = getpagesize(); > + skel->bss->skip_sleepable = 1; > + > + skel->links._setsockopt_s = > + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link (sleepable)")) > + goto cleanup; > + > + skel->links._getsockopt_s = > + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link (sleepable)")) > + goto cleanup; > + > + skel->links._setsockopt = > + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) > + goto cleanup; > + > + skel->links._getsockopt = > + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) > + goto cleanup; > + > + ASSERT_OK(getsetsockopt(), "getsetsockopt"); > + > +cleanup: > + sockopt_sk__destroy(skel); > +} > + > +static void run_test_sleepable(int cgroup_fd) > +{ > + struct sockopt_sk *skel; > + > + skel = sockopt_sk__open_and_load(); > + if (!ASSERT_OK_PTR(skel, "skel_load")) > + goto cleanup; > + > + skel->bss->page_size = getpagesize(); > + > + skel->links._setsockopt_s = > + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) > + goto cleanup; > + > + skel->links._getsockopt_s = > + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) > + goto cleanup; > + > + ASSERT_OK(getsetsockopt(), "getsetsockopt"); > + > +cleanup: > + sockopt_sk__destroy(skel); > +} > + > +static void run_test_sleepable_mixed(int cgroup_fd) > +{ > + struct sockopt_sk *skel; > + > + skel = sockopt_sk__open_and_load(); > + if (!ASSERT_OK_PTR(skel, "skel_load")) > + goto cleanup; > + > + skel->bss->page_size = getpagesize(); > + skel->bss->skip_nonsleepable = 1; > + > + 
skel->links._setsockopt = > + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link (nonsleepable)")) > + goto cleanup; > + > + skel->links._getsockopt = > + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link (nonsleepable)")) > + goto cleanup; > + > + skel->links._setsockopt_s = > + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) > + goto cleanup; > + > + skel->links._getsockopt_s = > + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); > + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) > + goto cleanup; > + > + ASSERT_OK(getsetsockopt(), "getsetsockopt"); > + > +cleanup: > + sockopt_sk__destroy(skel); > +} > + > void test_sockopt_sk(void) > { > int cgroup_fd; > @@ -254,6 +355,13 @@ void test_sockopt_sk(void) > if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk")) > return; > > - run_test(cgroup_fd); > + if (test__start_subtest("nonsleepable")) > + run_test_nonsleepable(cgroup_fd); > + if (test__start_subtest("sleepable")) > + run_test_sleepable(cgroup_fd); > + if (test__start_subtest("nonsleepable_mixed")) > + run_test_nonsleepable_mixed(cgroup_fd); > + if (test__start_subtest("sleepable_mixed")) > + run_test_sleepable_mixed(cgroup_fd); > close(cgroup_fd); > } > diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c > index cb990a7d3d45..efacd3b88c40 100644 > --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c > +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c > @@ -5,10 +5,16 @@ > #include <netinet/in.h> > #include <bpf/bpf_helpers.h> > > +typedef int bool; > +#include "bpf_kfuncs.h" > + > char _license[] SEC("license") = "GPL"; > > int page_size = 0; /* userspace should set it */ > > +int skip_sleepable = 0; > +int skip_nonsleepable = 0; > + > #ifndef SOL_TCP > #define SOL_TCP IPPROTO_TCP > #endif > @@ -34,6 +40,9 @@ int _getsockopt(struct bpf_sockopt *ctx) > struct sockopt_sk *storage; > struct bpf_sock *sk; > > + if (skip_nonsleepable) > + return 1; > + > /* Bypass AF_NETLINK. */ > sk = ctx->sk; > if (sk && sk->family == AF_NETLINK) > @@ -136,6 +145,134 @@ int _getsockopt(struct bpf_sockopt *ctx) > return 1; > } > > +SEC("cgroup/getsockopt.s") > +int _getsockopt_s(struct bpf_sockopt *ctx) > +{ > + struct tcp_zerocopy_receive *zcvr; > + struct bpf_dynptr optval_dynptr; > + struct sockopt_sk *storage; > + __u8 *optval, *optval_end; > + struct bpf_sock *sk; > + char buf[1]; > + __u64 addr; > + int ret; > + > + if (skip_sleepable) > + return 1; > + > + /* Bypass AF_NETLINK. */ > + sk = ctx->sk; > + if (sk && sk->family == AF_NETLINK) > + return 1; > + > + optval = ctx->optval; > + optval_end = ctx->optval_end; > + > + /* Make sure bpf_get_netns_cookie is callable. > + */ > + if (bpf_get_netns_cookie(NULL) == 0) > + return 0; > + > + if (bpf_get_netns_cookie(ctx) == 0) > + return 0; > + > + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { > + /* Not interested in SOL_IP:IP_TOS; > + * let next BPF program in the cgroup chain or kernel > + * handle it. > + */ > + return 1; > + } > + > + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { > + /* Not interested in SOL_SOCKET:SO_SNDBUF; > + * let next BPF program in the cgroup chain or kernel > + * handle it. 
> + */ > + return 1; > + } > + > + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { > + /* Not interested in SOL_TCP:TCP_CONGESTION; > + * let next BPF program in the cgroup chain or kernel > + * handle it. > + */ > + return 1; > + } > + > + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { > + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. > + * It has a custom implementation for performance > + * reasons. > + */ > + > + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); > + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); > + addr = zcvr ? zcvr->address : 0; > + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); This starts to look more usable, thank you for the changes! Let me poke the api a bit more, I'm not super familiar with the dynptrs. here: bpf_sockopt_dynptr_from should probably be called bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem? > + > + return addr != 0 ? 0 : 1; > + } > + > + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { > + if (optval + 1 > optval_end) > + return 0; /* bounds check */ > + > + ctx->retval = 0; /* Reset system call return value to zero */ > + > + /* Always export 0x55 */ > + buf[0] = 0x55; > + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); > + if (ret >= 0) { > + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); > + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); > + } > + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be sockopt specific? Seems like we should provide, instead, some generic bpf_dynptr_alloc/bpf_dynptr_release and make bpf_sockopt_dynptr_copy_to/install work with them? WDYT? > + if (ret < 0) > + return 0; > + ctx->optlen = 1; > + > + /* Userspace buffer is PAGE_SIZE * 2, but BPF > + * program can only see the first PAGE_SIZE > + * bytes of data. > + */ > + if (optval_end - optval != page_size && 0) > + return 0; /* unexpected data size */ > + > + return 1; > + } > + > + if (ctx->level != SOL_CUSTOM) > + return 0; /* deny everything except custom level */ > + > + if (optval + 1 > optval_end) > + return 0; /* bounds check */ > + > + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, > + BPF_SK_STORAGE_GET_F_CREATE); > + if (!storage) > + return 0; /* couldn't get sk storage */ > + > + if (!ctx->retval) > + return 0; /* kernel should not have handled > + * SOL_CUSTOM, something is wrong! > + */ > + ctx->retval = 0; /* Reset system call return value to zero */ > + > + buf[0] = storage->val; > + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); > + if (ret >= 0) { > + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); > + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); > + } > + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); > + if (ret < 0) > + return 0; > + ctx->optlen = 1; > + > + return 1; > +} > + > SEC("cgroup/setsockopt") > int _setsockopt(struct bpf_sockopt *ctx) > { > @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) > struct sockopt_sk *storage; > struct bpf_sock *sk; > > + if (skip_nonsleepable) > + return 1; > + > /* Bypass AF_NETLINK. 
*/ > sk = ctx->sk; > if (sk && sk->family == AF_NETLINK) > @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) > ctx->optlen = 0; > return 1; > } > + > +SEC("cgroup/setsockopt.s") > +int _setsockopt_s(struct bpf_sockopt *ctx) > +{ > + struct bpf_dynptr optval_buf; > + struct sockopt_sk *storage; > + __u8 *optval, *optval_end; > + struct bpf_sock *sk; > + __u8 tmp_u8; > + __u32 tmp; > + int ret; > + > + if (skip_sleepable) > + return 1; > + > + optval = ctx->optval; > + optval_end = ctx->optval_end; > + > + /* Bypass AF_NETLINK. */ > + sk = ctx->sk; > + if (sk && sk->family == AF_NETLINK) > + return -1; > + > + /* Make sure bpf_get_netns_cookie is callable. > + */ > + if (bpf_get_netns_cookie(NULL) == 0) > + return 0; > + > + if (bpf_get_netns_cookie(ctx) == 0) > + return 0; > + > + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { > + /* Not interested in SOL_IP:IP_TOS; > + * let next BPF program in the cgroup chain or kernel > + * handle it. > + */ > + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ > + return 1; > + } > + > + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { > + /* Overwrite SO_SNDBUF value */ > + > + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), > + &optval_buf); > + if (ret < 0) > + bpf_sockopt_dynptr_release(ctx, &optval_buf); > + else { > + tmp = 0x55AA; > + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); > + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); One thing I'm still slightly confused about is bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. I do understand that it comes from getsockopt vs setsockopt (and the ability, in setsockopt, to allocate larger buffers), but I wonder whether we can hide everything under single bpf_sockopt_dynptr_copy_to call? For getsockopt, it stays as is. For setsockopt, it would work like _install currently does. Would that work? From user perspective, if we provide a simple call that does the right thing, seems a bit more usable? The only problem is probably the fact that _install explicitly moves the ownership, but I don't see why copy_to can't have the same "consume" semantics?
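For reference, the two flows this question refers to, condensed from the selftest quoted above. The kfunc names and call order are exactly those posted in the patch; the wrapper function names and the include list are only assumed for illustration (bpf_helpers.h plus the bpf_kfuncs.h additions from this series).

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include "bpf_kfuncs.h"

/* getsockopt path: the program keeps ownership of the dynptr.
 * copy_to copies its contents out to optval; release frees it.
 */
SEC("cgroup/getsockopt.s")
int get_example(struct bpf_sockopt *ctx)
{
	struct bpf_dynptr dp;
	char val = 0x55;
	int ret;

	ret = bpf_sockopt_dynptr_alloc(ctx, 1, &dp);
	if (ret >= 0) {
		bpf_dynptr_write(&dp, 0, &val, 1, 0);
		ret = bpf_sockopt_dynptr_copy_to(ctx, &dp);
	}
	bpf_sockopt_dynptr_release(ctx, &dp);
	if (ret < 0)
		return 0;
	ctx->optlen = 1;
	return 1;
}

/* setsockopt path: install hands the allocated buffer over as the new
 * optval, so ownership moves and no copy_to/release pair is needed.
 */
SEC("cgroup/setsockopt.s")
int set_example(struct bpf_sockopt *ctx)
{
	struct bpf_dynptr dp;
	__u32 tmp = 0x55AA;
	int ret;

	ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(tmp), &dp);
	if (ret < 0) {
		bpf_sockopt_dynptr_release(ctx, &dp);
		return 0;
	}
	bpf_dynptr_write(&dp, 0, &tmp, sizeof(tmp), 0);
	ret = bpf_sockopt_dynptr_install(ctx, &dp);
	return ret < 0 ? 0 : 1;
}

char _license[] SEC("license") = "GPL";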
On 8/15/23 13:57, Stanislav Fomichev wrote: > On 08/15, thinker.li@gmail.com wrote: >> From: Kui-Feng Lee <thinker.li@gmail.com> >> >> Do the same test as non-sleepable ones. >> >> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com> >> --- >> .../testing/selftests/bpf/bpf_experimental.h | 36 +++ >> tools/testing/selftests/bpf/bpf_kfuncs.h | 41 +++ >> .../selftests/bpf/prog_tests/sockopt_sk.c | 112 +++++++- >> .../testing/selftests/bpf/progs/sockopt_sk.c | 257 ++++++++++++++++++ >> .../selftests/bpf/verifier/sleepable.c | 2 +- >> 5 files changed, 445 insertions(+), 3 deletions(-) >> >> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h >> index 209811b1993a..9b5dfefe65dc 100644 >> --- a/tools/testing/selftests/bpf/bpf_experimental.h >> +++ b/tools/testing/selftests/bpf/bpf_experimental.h >> @@ -131,4 +131,40 @@ extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *nod >> */ >> extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; >> >> +/* >> + * Description >> + * Copy data from *ptr* to *sopt->optval*. >> + * Return >> + * >= 0 on success, or a negative error in case of failure. >> + */ >> +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> +/* Description >> + * Allocate a buffer of 'size' bytes for being installed as optval. >> + * Returns >> + * > 0 on success, the size of the allocated buffer >> + * -ENOMEM or -EINVAL on failure >> + */ >> +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, >> + struct bpf_dynptr *ptr__uninit) __ksym; >> + >> +/* Description >> + * Install the buffer pointed to by 'ptr' as optval. >> + * Returns >> + * 0 on success >> + * -EINVAL if the buffer is too small >> + */ >> +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> +/* Description >> + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. >> + * Returns >> + * 0 on success >> + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc >> + */ >> +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> #endif >> diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h >> index 642dda0e758a..772040225257 100644 >> --- a/tools/testing/selftests/bpf/bpf_kfuncs.h >> +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h >> @@ -41,4 +41,45 @@ extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym; >> extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym; >> extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym; >> >> +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> +/* Description >> + * Allocate a buffer of 'size' bytes for being installed as optval. >> + * Returns >> + * > 0 on success, the size of the allocated buffer >> + * -ENOMEM or -EINVAL on failure >> + */ >> +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, >> + struct bpf_dynptr *ptr__uninit) __ksym; >> + >> +/* Description >> + * Install the buffer pointed to by 'ptr' as optval. >> + * Returns >> + * 0 on success >> + * -EINVAL if the buffer is too small >> + */ >> +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> +/* Description >> + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. 
>> + * Returns >> + * 0 on success >> + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc >> + */ >> +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr) __ksym; >> + >> +/* Description >> + * Initialize a dynptr to access the content of optval passing >> + * to {get,set}sockopt()s. >> + * Returns >> + * > 0 on success, the size of the allocated buffer >> + * -ENOMEM or -EINVAL on failure >> + */ >> +extern int bpf_sockopt_dynptr_from(struct bpf_sockopt *sopt, >> + struct bpf_dynptr *ptr__uninit, >> + unsigned int size) __ksym; >> + >> #endif >> diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >> index 05d0e07da394..85255648747f 100644 >> --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >> +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >> @@ -92,6 +92,7 @@ static int getsetsockopt(void) >> } >> if (buf.u8[0] != 0x01) { >> log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]); >> + log_err("optlen %d", optlen); >> goto err; >> } >> >> @@ -220,7 +221,7 @@ static int getsetsockopt(void) >> return -1; >> } >> >> -static void run_test(int cgroup_fd) >> +static void run_test_nonsleepable(int cgroup_fd) >> { >> struct sockopt_sk *skel; >> >> @@ -246,6 +247,106 @@ static void run_test(int cgroup_fd) >> sockopt_sk__destroy(skel); >> } >> >> +static void run_test_nonsleepable_mixed(int cgroup_fd) >> +{ >> + struct sockopt_sk *skel; >> + >> + skel = sockopt_sk__open_and_load(); >> + if (!ASSERT_OK_PTR(skel, "skel_load")) >> + goto cleanup; >> + >> + skel->bss->page_size = getpagesize(); >> + skel->bss->skip_sleepable = 1; >> + >> + skel->links._setsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link (sleepable)")) >> + goto cleanup; >> + >> + skel->links._getsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link (sleepable)")) >> + goto cleanup; >> + >> + skel->links._setsockopt = >> + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) >> + goto cleanup; >> + >> + skel->links._getsockopt = >> + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) >> + goto cleanup; >> + >> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >> + >> +cleanup: >> + sockopt_sk__destroy(skel); >> +} >> + >> +static void run_test_sleepable(int cgroup_fd) >> +{ >> + struct sockopt_sk *skel; >> + >> + skel = sockopt_sk__open_and_load(); >> + if (!ASSERT_OK_PTR(skel, "skel_load")) >> + goto cleanup; >> + >> + skel->bss->page_size = getpagesize(); >> + >> + skel->links._setsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) >> + goto cleanup; >> + >> + skel->links._getsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) >> + goto cleanup; >> + >> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >> + >> +cleanup: >> + sockopt_sk__destroy(skel); >> +} >> + >> +static void run_test_sleepable_mixed(int cgroup_fd) >> +{ >> + struct sockopt_sk *skel; >> + >> + skel = sockopt_sk__open_and_load(); >> + if (!ASSERT_OK_PTR(skel, "skel_load")) >> 
+ goto cleanup; >> + >> + skel->bss->page_size = getpagesize(); >> + skel->bss->skip_nonsleepable = 1; >> + >> + skel->links._setsockopt = >> + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link (nonsleepable)")) >> + goto cleanup; >> + >> + skel->links._getsockopt = >> + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link (nonsleepable)")) >> + goto cleanup; >> + >> + skel->links._setsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) >> + goto cleanup; >> + >> + skel->links._getsockopt_s = >> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); >> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) >> + goto cleanup; >> + >> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >> + >> +cleanup: >> + sockopt_sk__destroy(skel); >> +} >> + >> void test_sockopt_sk(void) >> { >> int cgroup_fd; >> @@ -254,6 +355,13 @@ void test_sockopt_sk(void) >> if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk")) >> return; >> >> - run_test(cgroup_fd); >> + if (test__start_subtest("nonsleepable")) >> + run_test_nonsleepable(cgroup_fd); >> + if (test__start_subtest("sleepable")) >> + run_test_sleepable(cgroup_fd); >> + if (test__start_subtest("nonsleepable_mixed")) >> + run_test_nonsleepable_mixed(cgroup_fd); >> + if (test__start_subtest("sleepable_mixed")) >> + run_test_sleepable_mixed(cgroup_fd); >> close(cgroup_fd); >> } >> diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c >> index cb990a7d3d45..efacd3b88c40 100644 >> --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c >> +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c >> @@ -5,10 +5,16 @@ >> #include <netinet/in.h> >> #include <bpf/bpf_helpers.h> >> >> +typedef int bool; >> +#include "bpf_kfuncs.h" >> + >> char _license[] SEC("license") = "GPL"; >> >> int page_size = 0; /* userspace should set it */ >> >> +int skip_sleepable = 0; >> +int skip_nonsleepable = 0; >> + >> #ifndef SOL_TCP >> #define SOL_TCP IPPROTO_TCP >> #endif >> @@ -34,6 +40,9 @@ int _getsockopt(struct bpf_sockopt *ctx) >> struct sockopt_sk *storage; >> struct bpf_sock *sk; >> >> + if (skip_nonsleepable) >> + return 1; >> + >> /* Bypass AF_NETLINK. */ >> sk = ctx->sk; >> if (sk && sk->family == AF_NETLINK) >> @@ -136,6 +145,134 @@ int _getsockopt(struct bpf_sockopt *ctx) >> return 1; >> } >> >> +SEC("cgroup/getsockopt.s") >> +int _getsockopt_s(struct bpf_sockopt *ctx) >> +{ >> + struct tcp_zerocopy_receive *zcvr; >> + struct bpf_dynptr optval_dynptr; >> + struct sockopt_sk *storage; >> + __u8 *optval, *optval_end; >> + struct bpf_sock *sk; >> + char buf[1]; >> + __u64 addr; >> + int ret; >> + >> + if (skip_sleepable) >> + return 1; >> + >> + /* Bypass AF_NETLINK. */ >> + sk = ctx->sk; >> + if (sk && sk->family == AF_NETLINK) >> + return 1; >> + >> + optval = ctx->optval; >> + optval_end = ctx->optval_end; >> + >> + /* Make sure bpf_get_netns_cookie is callable. >> + */ >> + if (bpf_get_netns_cookie(NULL) == 0) >> + return 0; >> + >> + if (bpf_get_netns_cookie(ctx) == 0) >> + return 0; >> + >> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >> + /* Not interested in SOL_IP:IP_TOS; >> + * let next BPF program in the cgroup chain or kernel >> + * handle it. 
>> + */ >> + return 1; >> + } >> + >> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >> + /* Not interested in SOL_SOCKET:SO_SNDBUF; >> + * let next BPF program in the cgroup chain or kernel >> + * handle it. >> + */ >> + return 1; >> + } >> + >> + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { >> + /* Not interested in SOL_TCP:TCP_CONGESTION; >> + * let next BPF program in the cgroup chain or kernel >> + * handle it. >> + */ >> + return 1; >> + } >> + >> + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { >> + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. >> + * It has a custom implementation for performance >> + * reasons. >> + */ >> + >> + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); >> + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); >> + addr = zcvr ? zcvr->address : 0; >> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); > > This starts to look more usable, thank you for the changes! > Let me poke the api a bit more, I'm not super familiar with the dynptrs. > > here: bpf_sockopt_dynptr_from should probably be called > bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem? agree! > >> + >> + return addr != 0 ? 0 : 1; >> + } >> + >> + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { >> + if (optval + 1 > optval_end) >> + return 0; /* bounds check */ >> + >> + ctx->retval = 0; /* Reset system call return value to zero */ >> + >> + /* Always export 0x55 */ >> + buf[0] = 0x55; >> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >> + if (ret >= 0) { >> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >> + } >> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); > > Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be > sockopt specific? Seems like we should provide, instead, some generic > bpf_dynptr_alloc/bpf_dynptr_release and make > bpf_sockopt_dynptr_copy_to/install work with them? WDYT? I found that kmalloc can not be called in the context of nmi, slab or page alloc path. It is why we don't have functions like bpf_dynptr_alloc/bpf_dynptr_release yet. That means we need someone to implement an allocator for BPF programs. And, then, we can have bpf_dynptr_alloc unpon it. (There is an implementation of bpf_dynptr_alloc() in the early versions of the patchset of dynptr. But, be removed before landing.) > >> + if (ret < 0) >> + return 0; >> + ctx->optlen = 1; >> + >> + /* Userspace buffer is PAGE_SIZE * 2, but BPF >> + * program can only see the first PAGE_SIZE >> + * bytes of data. >> + */ >> + if (optval_end - optval != page_size && 0) >> + return 0; /* unexpected data size */ >> + >> + return 1; >> + } >> + >> + if (ctx->level != SOL_CUSTOM) >> + return 0; /* deny everything except custom level */ >> + >> + if (optval + 1 > optval_end) >> + return 0; /* bounds check */ >> + >> + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, >> + BPF_SK_STORAGE_GET_F_CREATE); >> + if (!storage) >> + return 0; /* couldn't get sk storage */ >> + >> + if (!ctx->retval) >> + return 0; /* kernel should not have handled >> + * SOL_CUSTOM, something is wrong! 
>> + */ >> + ctx->retval = 0; /* Reset system call return value to zero */ >> + >> + buf[0] = storage->val; >> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >> + if (ret >= 0) { >> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >> + } >> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >> + if (ret < 0) >> + return 0; >> + ctx->optlen = 1; >> + >> + return 1; >> +} >> + >> SEC("cgroup/setsockopt") >> int _setsockopt(struct bpf_sockopt *ctx) >> { >> @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) >> struct sockopt_sk *storage; >> struct bpf_sock *sk; >> >> + if (skip_nonsleepable) >> + return 1; >> + >> /* Bypass AF_NETLINK. */ >> sk = ctx->sk; >> if (sk && sk->family == AF_NETLINK) >> @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) >> ctx->optlen = 0; >> return 1; >> } >> + >> +SEC("cgroup/setsockopt.s") >> +int _setsockopt_s(struct bpf_sockopt *ctx) >> +{ >> + struct bpf_dynptr optval_buf; >> + struct sockopt_sk *storage; >> + __u8 *optval, *optval_end; >> + struct bpf_sock *sk; >> + __u8 tmp_u8; >> + __u32 tmp; >> + int ret; >> + >> + if (skip_sleepable) >> + return 1; >> + >> + optval = ctx->optval; >> + optval_end = ctx->optval_end; >> + >> + /* Bypass AF_NETLINK. */ >> + sk = ctx->sk; >> + if (sk && sk->family == AF_NETLINK) >> + return -1; >> + >> + /* Make sure bpf_get_netns_cookie is callable. >> + */ >> + if (bpf_get_netns_cookie(NULL) == 0) >> + return 0; >> + >> + if (bpf_get_netns_cookie(ctx) == 0) >> + return 0; >> + >> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >> + /* Not interested in SOL_IP:IP_TOS; >> + * let next BPF program in the cgroup chain or kernel >> + * handle it. >> + */ >> + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ >> + return 1; >> + } >> + >> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >> + /* Overwrite SO_SNDBUF value */ >> + >> + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), >> + &optval_buf); >> + if (ret < 0) >> + bpf_sockopt_dynptr_release(ctx, &optval_buf); >> + else { >> + tmp = 0x55AA; >> + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); >> + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); > > One thing I'm still slightly confused about is > bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. I do > understand that it comes from getsockopt vs setsockopt (and the ability, > in setsockopt, to allocate larger buffers), but I wonder whether > we can hide everything under single bpf_sockopt_dynptr_copy_to call? > > For getsockopt, it stays as is. For setsockopt, it would work like > _install currently does. Would that work? From user perspective, > if we provide a simple call that does the right thing, seems a bit > more usable? The only problem is probably the fact that _install > explicitly moves the ownership, but I don't see why copy_to can't > have the same "consume" semantics?
On 8/15/23 16:37, Kui-Feng Lee wrote: > > > On 8/15/23 13:57, Stanislav Fomichev wrote: >> On 08/15, thinker.li@gmail.com wrote: >>> From: Kui-Feng Lee <thinker.li@gmail.com> >>> >>> Do the same test as non-sleepable ones. >>> >>> Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com> >>> --- >>> .../testing/selftests/bpf/bpf_experimental.h | 36 +++ >>> tools/testing/selftests/bpf/bpf_kfuncs.h | 41 +++ >>> .../selftests/bpf/prog_tests/sockopt_sk.c | 112 +++++++- >>> .../testing/selftests/bpf/progs/sockopt_sk.c | 257 ++++++++++++++++++ >>> .../selftests/bpf/verifier/sleepable.c | 2 +- >>> 5 files changed, 445 insertions(+), 3 deletions(-) >>> >>> diff --git a/tools/testing/selftests/bpf/bpf_experimental.h >>> b/tools/testing/selftests/bpf/bpf_experimental.h >>> index 209811b1993a..9b5dfefe65dc 100644 >>> --- a/tools/testing/selftests/bpf/bpf_experimental.h >>> +++ b/tools/testing/selftests/bpf/bpf_experimental.h >>> @@ -131,4 +131,40 @@ extern int bpf_rbtree_add_impl(struct >>> bpf_rb_root *root, struct bpf_rb_node *nod >>> */ >>> extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root >>> *root) __ksym; >>> +/* >>> + * Description >>> + * Copy data from *ptr* to *sopt->optval*. >>> + * Return >>> + * >= 0 on success, or a negative error in case of failure. >>> + */ >>> +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> +/* Description >>> + * Allocate a buffer of 'size' bytes for being installed as optval. >>> + * Returns >>> + * > 0 on success, the size of the allocated buffer >>> + * -ENOMEM or -EINVAL on failure >>> + */ >>> +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, >>> + struct bpf_dynptr *ptr__uninit) __ksym; >>> + >>> +/* Description >>> + * Install the buffer pointed to by 'ptr' as optval. >>> + * Returns >>> + * 0 on success >>> + * -EINVAL if the buffer is too small >>> + */ >>> +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> +/* Description >>> + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. >>> + * Returns >>> + * 0 on success >>> + * -EINVAL if the buffer was not allocated by >>> bpf_sockopt_dynptr_alloc >>> + */ >>> +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> #endif >>> diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h >>> b/tools/testing/selftests/bpf/bpf_kfuncs.h >>> index 642dda0e758a..772040225257 100644 >>> --- a/tools/testing/selftests/bpf/bpf_kfuncs.h >>> +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h >>> @@ -41,4 +41,45 @@ extern bool bpf_dynptr_is_rdonly(const struct >>> bpf_dynptr *ptr) __ksym; >>> extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym; >>> extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct >>> bpf_dynptr *clone__init) __ksym; >>> +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> +/* Description >>> + * Allocate a buffer of 'size' bytes for being installed as optval. >>> + * Returns >>> + * > 0 on success, the size of the allocated buffer >>> + * -ENOMEM or -EINVAL on failure >>> + */ >>> +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, >>> + struct bpf_dynptr *ptr__uninit) __ksym; >>> + >>> +/* Description >>> + * Install the buffer pointed to by 'ptr' as optval. 
>>> + * Returns >>> + * 0 on success >>> + * -EINVAL if the buffer is too small >>> + */ >>> +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> +/* Description >>> + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. >>> + * Returns >>> + * 0 on success >>> + * -EINVAL if the buffer was not allocated by >>> bpf_sockopt_dynptr_alloc >>> + */ >>> +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr) __ksym; >>> + >>> +/* Description >>> + * Initialize a dynptr to access the content of optval passing >>> + * to {get,set}sockopt()s. >>> + * Returns >>> + * > 0 on success, the size of the allocated buffer >>> + * -ENOMEM or -EINVAL on failure >>> + */ >>> +extern int bpf_sockopt_dynptr_from(struct bpf_sockopt *sopt, >>> + struct bpf_dynptr *ptr__uninit, >>> + unsigned int size) __ksym; >>> + >>> #endif >>> diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >>> b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >>> index 05d0e07da394..85255648747f 100644 >>> --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >>> +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c >>> @@ -92,6 +92,7 @@ static int getsetsockopt(void) >>> } >>> if (buf.u8[0] != 0x01) { >>> log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]); >>> + log_err("optlen %d", optlen); >>> goto err; >>> } >>> @@ -220,7 +221,7 @@ static int getsetsockopt(void) >>> return -1; >>> } >>> -static void run_test(int cgroup_fd) >>> +static void run_test_nonsleepable(int cgroup_fd) >>> { >>> struct sockopt_sk *skel; >>> @@ -246,6 +247,106 @@ static void run_test(int cgroup_fd) >>> sockopt_sk__destroy(skel); >>> } >>> +static void run_test_nonsleepable_mixed(int cgroup_fd) >>> +{ >>> + struct sockopt_sk *skel; >>> + >>> + skel = sockopt_sk__open_and_load(); >>> + if (!ASSERT_OK_PTR(skel, "skel_load")) >>> + goto cleanup; >>> + >>> + skel->bss->page_size = getpagesize(); >>> + skel->bss->skip_sleepable = 1; >>> + >>> + skel->links._setsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, >>> cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link >>> (sleepable)")) >>> + goto cleanup; >>> + >>> + skel->links._getsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, >>> cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link >>> (sleepable)")) >>> + goto cleanup; >>> + >>> + skel->links._setsockopt = >>> + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) >>> + goto cleanup; >>> + >>> + skel->links._getsockopt = >>> + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) >>> + goto cleanup; >>> + >>> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >>> + >>> +cleanup: >>> + sockopt_sk__destroy(skel); >>> +} >>> + >>> +static void run_test_sleepable(int cgroup_fd) >>> +{ >>> + struct sockopt_sk *skel; >>> + >>> + skel = sockopt_sk__open_and_load(); >>> + if (!ASSERT_OK_PTR(skel, "skel_load")) >>> + goto cleanup; >>> + >>> + skel->bss->page_size = getpagesize(); >>> + >>> + skel->links._setsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, >>> cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) >>> + goto cleanup; >>> + >>> + skel->links._getsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, >>> 
cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) >>> + goto cleanup; >>> + >>> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >>> + >>> +cleanup: >>> + sockopt_sk__destroy(skel); >>> +} >>> + >>> +static void run_test_sleepable_mixed(int cgroup_fd) >>> +{ >>> + struct sockopt_sk *skel; >>> + >>> + skel = sockopt_sk__open_and_load(); >>> + if (!ASSERT_OK_PTR(skel, "skel_load")) >>> + goto cleanup; >>> + >>> + skel->bss->page_size = getpagesize(); >>> + skel->bss->skip_nonsleepable = 1; >>> + >>> + skel->links._setsockopt = >>> + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link >>> (nonsleepable)")) >>> + goto cleanup; >>> + >>> + skel->links._getsockopt = >>> + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link >>> (nonsleepable)")) >>> + goto cleanup; >>> + >>> + skel->links._setsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._setsockopt_s, >>> cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) >>> + goto cleanup; >>> + >>> + skel->links._getsockopt_s = >>> + bpf_program__attach_cgroup(skel->progs._getsockopt_s, >>> cgroup_fd); >>> + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) >>> + goto cleanup; >>> + >>> + ASSERT_OK(getsetsockopt(), "getsetsockopt"); >>> + >>> +cleanup: >>> + sockopt_sk__destroy(skel); >>> +} >>> + >>> void test_sockopt_sk(void) >>> { >>> int cgroup_fd; >>> @@ -254,6 +355,13 @@ void test_sockopt_sk(void) >>> if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk")) >>> return; >>> - run_test(cgroup_fd); >>> + if (test__start_subtest("nonsleepable")) >>> + run_test_nonsleepable(cgroup_fd); >>> + if (test__start_subtest("sleepable")) >>> + run_test_sleepable(cgroup_fd); >>> + if (test__start_subtest("nonsleepable_mixed")) >>> + run_test_nonsleepable_mixed(cgroup_fd); >>> + if (test__start_subtest("sleepable_mixed")) >>> + run_test_sleepable_mixed(cgroup_fd); >>> close(cgroup_fd); >>> } >>> diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c >>> b/tools/testing/selftests/bpf/progs/sockopt_sk.c >>> index cb990a7d3d45..efacd3b88c40 100644 >>> --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c >>> +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c >>> @@ -5,10 +5,16 @@ >>> #include <netinet/in.h> >>> #include <bpf/bpf_helpers.h> >>> +typedef int bool; >>> +#include "bpf_kfuncs.h" >>> + >>> char _license[] SEC("license") = "GPL"; >>> int page_size = 0; /* userspace should set it */ >>> +int skip_sleepable = 0; >>> +int skip_nonsleepable = 0; >>> + >>> #ifndef SOL_TCP >>> #define SOL_TCP IPPROTO_TCP >>> #endif >>> @@ -34,6 +40,9 @@ int _getsockopt(struct bpf_sockopt *ctx) >>> struct sockopt_sk *storage; >>> struct bpf_sock *sk; >>> + if (skip_nonsleepable) >>> + return 1; >>> + >>> /* Bypass AF_NETLINK. */ >>> sk = ctx->sk; >>> if (sk && sk->family == AF_NETLINK) >>> @@ -136,6 +145,134 @@ int _getsockopt(struct bpf_sockopt *ctx) >>> return 1; >>> } >>> +SEC("cgroup/getsockopt.s") >>> +int _getsockopt_s(struct bpf_sockopt *ctx) >>> +{ >>> + struct tcp_zerocopy_receive *zcvr; >>> + struct bpf_dynptr optval_dynptr; >>> + struct sockopt_sk *storage; >>> + __u8 *optval, *optval_end; >>> + struct bpf_sock *sk; >>> + char buf[1]; >>> + __u64 addr; >>> + int ret; >>> + >>> + if (skip_sleepable) >>> + return 1; >>> + >>> + /* Bypass AF_NETLINK. 
*/ >>> + sk = ctx->sk; >>> + if (sk && sk->family == AF_NETLINK) >>> + return 1; >>> + >>> + optval = ctx->optval; >>> + optval_end = ctx->optval_end; >>> + >>> + /* Make sure bpf_get_netns_cookie is callable. >>> + */ >>> + if (bpf_get_netns_cookie(NULL) == 0) >>> + return 0; >>> + >>> + if (bpf_get_netns_cookie(ctx) == 0) >>> + return 0; >>> + >>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>> + /* Not interested in SOL_IP:IP_TOS; >>> + * let next BPF program in the cgroup chain or kernel >>> + * handle it. >>> + */ >>> + return 1; >>> + } >>> + >>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>> + /* Not interested in SOL_SOCKET:SO_SNDBUF; >>> + * let next BPF program in the cgroup chain or kernel >>> + * handle it. >>> + */ >>> + return 1; >>> + } >>> + >>> + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { >>> + /* Not interested in SOL_TCP:TCP_CONGESTION; >>> + * let next BPF program in the cgroup chain or kernel >>> + * handle it. >>> + */ >>> + return 1; >>> + } >>> + >>> + if (ctx->level == SOL_TCP && ctx->optname == >>> TCP_ZEROCOPY_RECEIVE) { >>> + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. >>> + * It has a custom implementation for performance >>> + * reasons. >>> + */ >>> + >>> + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); >>> + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); >>> + addr = zcvr ? zcvr->address : 0; >>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >> >> This starts to look more usable, thank you for the changes! >> Let me poke the api a bit more, I'm not super familiar with the dynptrs. >> >> here: bpf_sockopt_dynptr_from should probably be called >> bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem? > > agree! > >> >>> + >>> + return addr != 0 ? 0 : 1; >>> + } >>> + >>> + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { >>> + if (optval + 1 > optval_end) >>> + return 0; /* bounds check */ >>> + >>> + ctx->retval = 0; /* Reset system call return value to zero */ >>> + >>> + /* Always export 0x55 */ >>> + buf[0] = 0x55; >>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>> + if (ret >= 0) { >>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>> + } >>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >> >> Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be >> sockopt specific? Seems like we should provide, instead, some generic >> bpf_dynptr_alloc/bpf_dynptr_release and make >> bpf_sockopt_dynptr_copy_to/install work with them? WDYT? > > I found that kmalloc can not be called in the context of nmi, slab or > page alloc path. It is why we don't have functions like > bpf_dynptr_alloc/bpf_dynptr_release yet. That means we need someone > to implement an allocator for BPF programs. And, then, we can have > bpf_dynptr_alloc unpon it. (There is an implementation of > bpf_dynptr_alloc() in the early versions of the patchset of dynptr. > But, be removed before landing.) > > >> >>> + if (ret < 0) >>> + return 0; >>> + ctx->optlen = 1; >>> + >>> + /* Userspace buffer is PAGE_SIZE * 2, but BPF >>> + * program can only see the first PAGE_SIZE >>> + * bytes of data. 
>>> + */ >>> + if (optval_end - optval != page_size && 0) >>> + return 0; /* unexpected data size */ >>> + >>> + return 1; >>> + } >>> + >>> + if (ctx->level != SOL_CUSTOM) >>> + return 0; /* deny everything except custom level */ >>> + >>> + if (optval + 1 > optval_end) >>> + return 0; /* bounds check */ >>> + >>> + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, >>> + BPF_SK_STORAGE_GET_F_CREATE); >>> + if (!storage) >>> + return 0; /* couldn't get sk storage */ >>> + >>> + if (!ctx->retval) >>> + return 0; /* kernel should not have handled >>> + * SOL_CUSTOM, something is wrong! >>> + */ >>> + ctx->retval = 0; /* Reset system call return value to zero */ >>> + >>> + buf[0] = storage->val; >>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>> + if (ret >= 0) { >>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>> + } >>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>> + if (ret < 0) >>> + return 0; >>> + ctx->optlen = 1; >>> + >>> + return 1; >>> +} >>> + >>> SEC("cgroup/setsockopt") >>> int _setsockopt(struct bpf_sockopt *ctx) >>> { >>> @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) >>> struct sockopt_sk *storage; >>> struct bpf_sock *sk; >>> + if (skip_nonsleepable) >>> + return 1; >>> + >>> /* Bypass AF_NETLINK. */ >>> sk = ctx->sk; >>> if (sk && sk->family == AF_NETLINK) >>> @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) >>> ctx->optlen = 0; >>> return 1; >>> } >>> + >>> +SEC("cgroup/setsockopt.s") >>> +int _setsockopt_s(struct bpf_sockopt *ctx) >>> +{ >>> + struct bpf_dynptr optval_buf; >>> + struct sockopt_sk *storage; >>> + __u8 *optval, *optval_end; >>> + struct bpf_sock *sk; >>> + __u8 tmp_u8; >>> + __u32 tmp; >>> + int ret; >>> + >>> + if (skip_sleepable) >>> + return 1; >>> + >>> + optval = ctx->optval; >>> + optval_end = ctx->optval_end; >>> + >>> + /* Bypass AF_NETLINK. */ >>> + sk = ctx->sk; >>> + if (sk && sk->family == AF_NETLINK) >>> + return -1; >>> + >>> + /* Make sure bpf_get_netns_cookie is callable. >>> + */ >>> + if (bpf_get_netns_cookie(NULL) == 0) >>> + return 0; >>> + >>> + if (bpf_get_netns_cookie(ctx) == 0) >>> + return 0; >>> + >>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>> + /* Not interested in SOL_IP:IP_TOS; >>> + * let next BPF program in the cgroup chain or kernel >>> + * handle it. >>> + */ >>> + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ >>> + return 1; >>> + } >>> + >>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>> + /* Overwrite SO_SNDBUF value */ >>> + >>> + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), >>> + &optval_buf); >>> + if (ret < 0) >>> + bpf_sockopt_dynptr_release(ctx, &optval_buf); >>> + else { >>> + tmp = 0x55AA; >>> + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); >>> + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); >> >> One thing I'm still slightly confused about is >> bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. I do >> understand that it comes from getsockopt vs setsockopt (and the ability, >> in setsockopt, to allocate larger buffers), but I wonder whether >> we can hide everything under single bpf_sockopt_dynptr_copy_to call? >> >> For getsockopt, it stays as is. For setsockopt, it would work like >> _install currently does. Would that work? From user perspective, >> if we provide a simple call that does the right thing, seems a bit >> more usable? 
The only problem is probably the fact that _install >> explicitly moves the ownership, but I don't see why copy_to can't >> have the same "consume" semantics? Sorry for missing this part! That overloading is counterintuitive to me: under the merged semantics, *_copy_to() on the setsockopt path would not copy into or overwrite the buffer but replace it, so its side effect would change depending on which hook it runs in. I would prefer a different name rather than reusing *_copy_to(); if we do want to merge the two functions, we probably need a better name than copy_to.
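Concretely, the single-call variant being floated here might look like the fragment below, reusing the declarations from the sketch earlier in this thread. A consuming bpf_sockopt_dynptr_copy_to() is hypothetical at this point; it only illustrates the suggestion, not an existing kfunc.

	/* Hypothetical merged semantics: on getsockopt the call copies the
	 * dynptr contents out to optval, on setsockopt it takes over the
	 * buffer as the new optval. In both cases the dynptr is consumed,
	 * so no separate install()/release() step would remain.
	 */
	ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(tmp), &dp);
	if (ret < 0)
		return 0;
	bpf_dynptr_write(&dp, 0, &tmp, sizeof(tmp), 0);
	ret = bpf_sockopt_dynptr_copy_to(ctx, &dp);	/* consumes dp */
	if (ret < 0)
		return 0;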
On 8/15/23 5:03 PM, Kui-Feng Lee wrote: >>>> +SEC("cgroup/getsockopt.s") >>>> +int _getsockopt_s(struct bpf_sockopt *ctx) >>>> +{ >>>> + struct tcp_zerocopy_receive *zcvr; >>>> + struct bpf_dynptr optval_dynptr; >>>> + struct sockopt_sk *storage; >>>> + __u8 *optval, *optval_end; >>>> + struct bpf_sock *sk; >>>> + char buf[1]; >>>> + __u64 addr; >>>> + int ret; >>>> + >>>> + if (skip_sleepable) >>>> + return 1; >>>> + >>>> + /* Bypass AF_NETLINK. */ >>>> + sk = ctx->sk; >>>> + if (sk && sk->family == AF_NETLINK) >>>> + return 1; >>>> + >>>> + optval = ctx->optval; >>>> + optval_end = ctx->optval_end; >>>> + >>>> + /* Make sure bpf_get_netns_cookie is callable. >>>> + */ >>>> + if (bpf_get_netns_cookie(NULL) == 0) >>>> + return 0; >>>> + >>>> + if (bpf_get_netns_cookie(ctx) == 0) >>>> + return 0; >>>> + >>>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>>> + /* Not interested in SOL_IP:IP_TOS; >>>> + * let next BPF program in the cgroup chain or kernel >>>> + * handle it. >>>> + */ >>>> + return 1; >>>> + } >>>> + >>>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>>> + /* Not interested in SOL_SOCKET:SO_SNDBUF; >>>> + * let next BPF program in the cgroup chain or kernel >>>> + * handle it. >>>> + */ >>>> + return 1; >>>> + } >>>> + >>>> + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { >>>> + /* Not interested in SOL_TCP:TCP_CONGESTION; >>>> + * let next BPF program in the cgroup chain or kernel >>>> + * handle it. >>>> + */ >>>> + return 1; >>>> + } >>>> + >>>> + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { >>>> + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. >>>> + * It has a custom implementation for performance >>>> + * reasons. >>>> + */ >>>> + >>>> + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); >>>> + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); >>>> + addr = zcvr ? zcvr->address : 0; >>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>> >>> This starts to look more usable, thank you for the changes! >>> Let me poke the api a bit more, I'm not super familiar with the dynptrs. >>> >>> here: bpf_sockopt_dynptr_from should probably be called >>> bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem? >> >> agree! >> >>> >>>> + >>>> + return addr != 0 ? 0 : 1; >>>> + } >>>> + >>>> + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { >>>> + if (optval + 1 > optval_end) >>>> + return 0; /* bounds check */ >>>> + >>>> + ctx->retval = 0; /* Reset system call return value to zero */ >>>> + >>>> + /* Always export 0x55 */ >>>> + buf[0] = 0x55; >>>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>>> + if (ret >= 0) { >>>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>>> + } >>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>> >>> Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be >>> sockopt specific? Seems like we should provide, instead, some generic >>> bpf_dynptr_alloc/bpf_dynptr_release and make >>> bpf_sockopt_dynptr_copy_to/install work with them? WDYT? >> >> I found that kmalloc can not be called in the context of nmi, slab or >> page alloc path. It is why we don't have functions like >> bpf_dynptr_alloc/bpf_dynptr_release yet. That means we need someone >> to implement an allocator for BPF programs. And, then, we can have >> bpf_dynptr_alloc unpon it. (There is an implementation of >> bpf_dynptr_alloc() in the early versions of the patchset of dynptr. 
>> But, be removed before landing.) >> >> >>> >>>> + if (ret < 0) >>>> + return 0; >>>> + ctx->optlen = 1; >>>> + >>>> + /* Userspace buffer is PAGE_SIZE * 2, but BPF >>>> + * program can only see the first PAGE_SIZE >>>> + * bytes of data. >>>> + */ >>>> + if (optval_end - optval != page_size && 0) >>>> + return 0; /* unexpected data size */ >>>> + >>>> + return 1; >>>> + } >>>> + >>>> + if (ctx->level != SOL_CUSTOM) >>>> + return 0; /* deny everything except custom level */ >>>> + >>>> + if (optval + 1 > optval_end) >>>> + return 0; /* bounds check */ >>>> + >>>> + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, >>>> + BPF_SK_STORAGE_GET_F_CREATE); >>>> + if (!storage) >>>> + return 0; /* couldn't get sk storage */ >>>> + >>>> + if (!ctx->retval) >>>> + return 0; /* kernel should not have handled >>>> + * SOL_CUSTOM, something is wrong! >>>> + */ >>>> + ctx->retval = 0; /* Reset system call return value to zero */ >>>> + >>>> + buf[0] = storage->val; >>>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>>> + if (ret >= 0) { >>>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>>> + } >>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>>> + if (ret < 0) >>>> + return 0; >>>> + ctx->optlen = 1; >>>> + >>>> + return 1; >>>> +} >>>> + >>>> SEC("cgroup/setsockopt") >>>> int _setsockopt(struct bpf_sockopt *ctx) >>>> { >>>> @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) >>>> struct sockopt_sk *storage; >>>> struct bpf_sock *sk; >>>> + if (skip_nonsleepable) >>>> + return 1; >>>> + >>>> /* Bypass AF_NETLINK. */ >>>> sk = ctx->sk; >>>> if (sk && sk->family == AF_NETLINK) >>>> @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) >>>> ctx->optlen = 0; >>>> return 1; >>>> } >>>> + >>>> +SEC("cgroup/setsockopt.s") >>>> +int _setsockopt_s(struct bpf_sockopt *ctx) >>>> +{ >>>> + struct bpf_dynptr optval_buf; >>>> + struct sockopt_sk *storage; >>>> + __u8 *optval, *optval_end; >>>> + struct bpf_sock *sk; >>>> + __u8 tmp_u8; >>>> + __u32 tmp; >>>> + int ret; >>>> + >>>> + if (skip_sleepable) >>>> + return 1; >>>> + >>>> + optval = ctx->optval; >>>> + optval_end = ctx->optval_end; >>>> + >>>> + /* Bypass AF_NETLINK. */ >>>> + sk = ctx->sk; >>>> + if (sk && sk->family == AF_NETLINK) >>>> + return -1; >>>> + >>>> + /* Make sure bpf_get_netns_cookie is callable. >>>> + */ >>>> + if (bpf_get_netns_cookie(NULL) == 0) >>>> + return 0; >>>> + >>>> + if (bpf_get_netns_cookie(ctx) == 0) >>>> + return 0; >>>> + >>>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>>> + /* Not interested in SOL_IP:IP_TOS; >>>> + * let next BPF program in the cgroup chain or kernel >>>> + * handle it. >>>> + */ >>>> + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ >>>> + return 1; >>>> + } >>>> + >>>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>>> + /* Overwrite SO_SNDBUF value */ >>>> + >>>> + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), >>>> + &optval_buf); >>>> + if (ret < 0) >>>> + bpf_sockopt_dynptr_release(ctx, &optval_buf); >>>> + else { >>>> + tmp = 0x55AA; >>>> + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); >>>> + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); >>> >>> One thing I'm still slightly confused about is >>> bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. 
I do >>> understand that it comes from getsockopt vs setsockopt (and the ability, >>> in setsockopt, to allocate larger buffers), but I wonder whether >>> we can hide everything under single bpf_sockopt_dynptr_copy_to call? >>> >>> For getsockopt, it stays as is. For setsockopt, it would work like >>> _install currently does. Would that work? From user perspective, >>> if we provide a simple call that does the right thing, seems a bit >>> more usable? The only problem is probably the fact that _install >>> explicitly moves the ownership, but I don't see why copy_to can't >>> have the same "consume" semantics? > > Sorry for missing this part! > This overloading is counterintuitive for me. > *_copy_to() will not copy/overwrite the buffer, but replace the buffer > instead. And, it will change its side-effects according to its context. > I would prefer a different name instead of reusing *_copy_to(). > > We probably need a better name, instead of copy_to, in order to merge > these two functions if we want to. It also took me some time to realize the alloc/install/copy_to/release usages. iiuc, to change optval in getsockopt, it is alloc=>write=>copy_to=>release; to change optval in setsockopt, it is alloc=>write=>install. Can this alloc and the user-or-kernel memory details be done in bpf_dynptr_write() instead such that both BPF_CGROUP_GETSOCKOPT and BPF_CGROUP_SETSOCKOPT programs can just call one bpf_dynptr_write() to update the optval? I meant bpf_dynptr_write() should have the needed context to decide if it can directly write to the __user ptr or it needs to write to kernel memory and if kmalloc is needed/allowed. Same for reading. bpf_dynptr_read() should know whether it should read from user or kernel memory. bpf_dynptr_data() may not work well if the underlying sockopt memory is still backed by user memory, so probably don't support bpf_dynptr_data() for sockopt. Support the bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() instead. Take a look at how the "buffer__opt" arg is used by the xdp dynptr. If the underlying sockopt is still backed by user memory, the bpf_dynptr_slice() can do a copy_from_user to the "buffer__opt". bpf_dynptr_from_cgrp_sockopt() should be the only function to initialize a dynptr from the 'struct bpf_sockopt *ctx'. Probably don't need to allocate any memory at init time. Is there something specific to cgrp sockopt that makes this difficult, and any downside of the above? WDYT?
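To make the two call sequences discussed above easier to follow, here is a minimal sketch (not part of the patch itself) of what each flow looks like from a BPF program, assuming the kfunc declarations added to bpf_kfuncs.h in this series and the usual selftest includes; the option values and one-byte payload are only illustrative:

/* getsockopt: alloc => write => copy_to => release */
SEC("cgroup/getsockopt.s")
int getsockopt_sketch(struct bpf_sockopt *ctx)
{
        struct bpf_dynptr buf;
        __u8 val = 0x55;        /* illustrative value */
        int ret;

        ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(val), &buf);
        if (ret >= 0) {
                bpf_dynptr_write(&buf, 0, &val, sizeof(val), 0);
                ret = bpf_sockopt_dynptr_copy_to(ctx, &buf);
        }
        /* copy_to does not consume the buffer, so always release it */
        bpf_sockopt_dynptr_release(ctx, &buf);
        if (ret < 0)
                return 0;
        ctx->optlen = sizeof(val);
        return 1;
}

/* setsockopt: alloc => write => install (install consumes the buffer) */
SEC("cgroup/setsockopt.s")
int setsockopt_sketch(struct bpf_sockopt *ctx)
{
        struct bpf_dynptr buf;
        __u32 val = 0x55AA;     /* illustrative value */

        if (bpf_sockopt_dynptr_alloc(ctx, sizeof(val), &buf) < 0) {
                bpf_sockopt_dynptr_release(ctx, &buf);
                return 0;
        }
        bpf_dynptr_write(&buf, 0, &val, sizeof(val), 0);
        if (bpf_sockopt_dynptr_install(ctx, &buf) < 0)
                return 0;
        ctx->optlen = sizeof(val);
        return 1;
}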
On 8/16/23 18:13, Martin KaFai Lau wrote: > On 8/15/23 5:03 PM, Kui-Feng Lee wrote: >>>>> +SEC("cgroup/getsockopt.s") >>>>> +int _getsockopt_s(struct bpf_sockopt *ctx) >>>>> +{ >>>>> + struct tcp_zerocopy_receive *zcvr; >>>>> + struct bpf_dynptr optval_dynptr; >>>>> + struct sockopt_sk *storage; >>>>> + __u8 *optval, *optval_end; >>>>> + struct bpf_sock *sk; >>>>> + char buf[1]; >>>>> + __u64 addr; >>>>> + int ret; >>>>> + >>>>> + if (skip_sleepable) >>>>> + return 1; >>>>> + >>>>> + /* Bypass AF_NETLINK. */ >>>>> + sk = ctx->sk; >>>>> + if (sk && sk->family == AF_NETLINK) >>>>> + return 1; >>>>> + >>>>> + optval = ctx->optval; >>>>> + optval_end = ctx->optval_end; >>>>> + >>>>> + /* Make sure bpf_get_netns_cookie is callable. >>>>> + */ >>>>> + if (bpf_get_netns_cookie(NULL) == 0) >>>>> + return 0; >>>>> + >>>>> + if (bpf_get_netns_cookie(ctx) == 0) >>>>> + return 0; >>>>> + >>>>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>>>> + /* Not interested in SOL_IP:IP_TOS; >>>>> + * let next BPF program in the cgroup chain or kernel >>>>> + * handle it. >>>>> + */ >>>>> + return 1; >>>>> + } >>>>> + >>>>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>>>> + /* Not interested in SOL_SOCKET:SO_SNDBUF; >>>>> + * let next BPF program in the cgroup chain or kernel >>>>> + * handle it. >>>>> + */ >>>>> + return 1; >>>>> + } >>>>> + >>>>> + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { >>>>> + /* Not interested in SOL_TCP:TCP_CONGESTION; >>>>> + * let next BPF program in the cgroup chain or kernel >>>>> + * handle it. >>>>> + */ >>>>> + return 1; >>>>> + } >>>>> + >>>>> + if (ctx->level == SOL_TCP && ctx->optname == >>>>> TCP_ZEROCOPY_RECEIVE) { >>>>> + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. >>>>> + * It has a custom implementation for performance >>>>> + * reasons. >>>>> + */ >>>>> + >>>>> + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); >>>>> + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); >>>>> + addr = zcvr ? zcvr->address : 0; >>>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>>> >>>> This starts to look more usable, thank you for the changes! >>>> Let me poke the api a bit more, I'm not super familiar with the >>>> dynptrs. >>>> >>>> here: bpf_sockopt_dynptr_from should probably be called >>>> bpf_dynptr_from_sockopt to match bpf_dynptr_from_mem? >>> >>> agree! >>> >>>> >>>>> + >>>>> + return addr != 0 ? 0 : 1; >>>>> + } >>>>> + >>>>> + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { >>>>> + if (optval + 1 > optval_end) >>>>> + return 0; /* bounds check */ >>>>> + >>>>> + ctx->retval = 0; /* Reset system call return value to zero */ >>>>> + >>>>> + /* Always export 0x55 */ >>>>> + buf[0] = 0x55; >>>>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>>>> + if (ret >= 0) { >>>>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>>>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>>>> + } >>>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>>> >>>> Does bpf_sockopt_dynptr_alloc and bpf_sockopt_dynptr_release need to be >>>> sockopt specific? Seems like we should provide, instead, some generic >>>> bpf_dynptr_alloc/bpf_dynptr_release and make >>>> bpf_sockopt_dynptr_copy_to/install work with them? WDYT? >>> >>> I found that kmalloc can not be called in the context of nmi, slab or >>> page alloc path. It is why we don't have functions like >>> bpf_dynptr_alloc/bpf_dynptr_release yet. That means we need someone >>> to implement an allocator for BPF programs. 
And, then, we can have >>> bpf_dynptr_alloc unpon it. (There is an implementation of >>> bpf_dynptr_alloc() in the early versions of the patchset of dynptr. >>> But, be removed before landing.) >>> >>> >>>> >>>>> + if (ret < 0) >>>>> + return 0; >>>>> + ctx->optlen = 1; >>>>> + >>>>> + /* Userspace buffer is PAGE_SIZE * 2, but BPF >>>>> + * program can only see the first PAGE_SIZE >>>>> + * bytes of data. >>>>> + */ >>>>> + if (optval_end - optval != page_size && 0) >>>>> + return 0; /* unexpected data size */ >>>>> + >>>>> + return 1; >>>>> + } >>>>> + >>>>> + if (ctx->level != SOL_CUSTOM) >>>>> + return 0; /* deny everything except custom level */ >>>>> + >>>>> + if (optval + 1 > optval_end) >>>>> + return 0; /* bounds check */ >>>>> + >>>>> + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, >>>>> + BPF_SK_STORAGE_GET_F_CREATE); >>>>> + if (!storage) >>>>> + return 0; /* couldn't get sk storage */ >>>>> + >>>>> + if (!ctx->retval) >>>>> + return 0; /* kernel should not have handled >>>>> + * SOL_CUSTOM, something is wrong! >>>>> + */ >>>>> + ctx->retval = 0; /* Reset system call return value to zero */ >>>>> + >>>>> + buf[0] = storage->val; >>>>> + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); >>>>> + if (ret >= 0) { >>>>> + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); >>>>> + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); >>>>> + } >>>>> + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); >>>>> + if (ret < 0) >>>>> + return 0; >>>>> + ctx->optlen = 1; >>>>> + >>>>> + return 1; >>>>> +} >>>>> + >>>>> SEC("cgroup/setsockopt") >>>>> int _setsockopt(struct bpf_sockopt *ctx) >>>>> { >>>>> @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) >>>>> struct sockopt_sk *storage; >>>>> struct bpf_sock *sk; >>>>> + if (skip_nonsleepable) >>>>> + return 1; >>>>> + >>>>> /* Bypass AF_NETLINK. */ >>>>> sk = ctx->sk; >>>>> if (sk && sk->family == AF_NETLINK) >>>>> @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) >>>>> ctx->optlen = 0; >>>>> return 1; >>>>> } >>>>> + >>>>> +SEC("cgroup/setsockopt.s") >>>>> +int _setsockopt_s(struct bpf_sockopt *ctx) >>>>> +{ >>>>> + struct bpf_dynptr optval_buf; >>>>> + struct sockopt_sk *storage; >>>>> + __u8 *optval, *optval_end; >>>>> + struct bpf_sock *sk; >>>>> + __u8 tmp_u8; >>>>> + __u32 tmp; >>>>> + int ret; >>>>> + >>>>> + if (skip_sleepable) >>>>> + return 1; >>>>> + >>>>> + optval = ctx->optval; >>>>> + optval_end = ctx->optval_end; >>>>> + >>>>> + /* Bypass AF_NETLINK. */ >>>>> + sk = ctx->sk; >>>>> + if (sk && sk->family == AF_NETLINK) >>>>> + return -1; >>>>> + >>>>> + /* Make sure bpf_get_netns_cookie is callable. >>>>> + */ >>>>> + if (bpf_get_netns_cookie(NULL) == 0) >>>>> + return 0; >>>>> + >>>>> + if (bpf_get_netns_cookie(ctx) == 0) >>>>> + return 0; >>>>> + >>>>> + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { >>>>> + /* Not interested in SOL_IP:IP_TOS; >>>>> + * let next BPF program in the cgroup chain or kernel >>>>> + * handle it. 
>>>>> + */ >>>>> + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ >>>>> + return 1; >>>>> + } >>>>> + >>>>> + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { >>>>> + /* Overwrite SO_SNDBUF value */ >>>>> + >>>>> + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), >>>>> + &optval_buf); >>>>> + if (ret < 0) >>>>> + bpf_sockopt_dynptr_release(ctx, &optval_buf); >>>>> + else { >>>>> + tmp = 0x55AA; >>>>> + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); >>>>> + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); >>>> >>>> One thing I'm still slightly confused about is >>>> bpf_sockopt_dynptr_install vs bpf_sockopt_dynptr_copy_to. I do >>>> understand that it comes from getsockopt vs setsockopt (and the >>>> ability, >>>> in setsockopt, to allocate larger buffers), but I wonder whether >>>> we can hide everything under single bpf_sockopt_dynptr_copy_to call? >>>> >>>> For getsockopt, it stays as is. For setsockopt, it would work like >>>> _install currently does. Would that work? From user perspective, >>>> if we provide a simple call that does the right thing, seems a bit >>>> more usable? The only problem is probably the fact that _install >>>> explicitly moves the ownership, but I don't see why copy_to can't >>>> have the same "consume" semantics? >> >> Sorry for missing this part! >> This overloading is counterintuitive for me. >> *_copy_to() will not copy/overwrite the buffer, but replace the buffer >> instead. And, it will change its side-effects according to its context. >> I would prefer a different name instead of reusing *_copy_to(). >> >> We probably need a better name, instead of copy_to, in order to merge >> these two functions if we want to. > > It also took me some time to realize the alloc/install/copy_to/release > usages. iiuc, to change optval in getsockopt, it is > alloc=>write=>copy_to=>release; > to change optval in setsockopt, it is alloc=>write=>install. > > Can this alloc and the user-or-kernel memory details be done in > bpf_dynptr_write() instead such that both BPF_CGROUP_GETSOCKOPT and > BPF_CGROUP_SETSOCKOPT programs can just call one bpf_dynptr_write() to > update the optval? I meant bpf_dynptr_write() should have the needed > context to decide if it can directly write to the __user ptr or it needs > to write to kernel memory and if kmalloc is needed/allowed. For v3, you can just call bpf_dynptr_write() to update the buffer without installing it if optval is already in a kernel buffer. But you can still call *_install() even if it is unnecessary. The code in the test case tries to keep things less complicated, so it always calls *_install() without any check. > > Same for reading. bpf_dynptr_read() should know whether it should read from user > or kernel memory. bpf_dynptr_data() may not work well if the underlying > sockopt memory is still backed by user memory, so probably don't support > bpf_dynptr_data() for sockopt. Support the bpf_dynptr_slice() and > bpf_dynptr_slice_rdwr() instead. Take a look at how the "buffer__opt" > arg is used by the xdp dynptr. If the underlying sockopt is still backed > by user memory, the bpf_dynptr_slice() can do a copy_from_user to the > "buffer__opt". > > bpf_dynptr_from_cgrp_sockopt() should be the only function to initialize > a dynptr from the 'struct bpf_sockopt *ctx'. Probably don't need to > allocate any memory at init time. > > Is there something specific to cgrp sockopt that makes this difficult, and any > downside of the above? > > WDYT? > This is doable.
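As a thought experiment only, the direction agreed on above could look roughly like this from a program's point of view, with a single init kfunc plus bpf_dynptr_write() hiding the user-vs-kernel memory details. None of this exists in the series as posted; the bpf_dynptr_from_cgrp_sockopt() name comes from the proposal in this thread and its signature below is a guess for illustration:

/* Hypothetical kfunc; declaration guessed for illustration only. */
extern int bpf_dynptr_from_cgrp_sockopt(struct bpf_sockopt *sopt,
                                        struct bpf_dynptr *ptr__uninit) __ksym;

SEC("cgroup/setsockopt.s")
int setsockopt_proposed_sketch(struct bpf_sockopt *ctx)
{
        struct bpf_dynptr optval;
        __u32 sndbuf = 0x55AA;  /* illustrative value */

        if (ctx->level != SOL_SOCKET || ctx->optname != SO_SNDBUF)
                return 1;

        if (bpf_dynptr_from_cgrp_sockopt(ctx, &optval) < 0)
                return 0;

        /* The write itself would decide whether it can touch the __user
         * buffer directly or has to go through (and possibly allocate)
         * a kernel copy.
         */
        if (bpf_dynptr_write(&optval, 0, &sndbuf, sizeof(sndbuf), 0) < 0)
                return 0;

        ctx->optlen = sizeof(sndbuf);
        return 1;
}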
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index 209811b1993a..9b5dfefe65dc 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -131,4 +131,40 @@ extern int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *nod */ extern struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; +/* + * Description + * Copy data from *ptr* to *sopt->optval*. + * Return + * >= 0 on success, or a negative error in case of failure. + */ +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + +/* Description + * Allocate a buffer of 'size' bytes for being installed as optval. + * Returns + * > 0 on success, the size of the allocated buffer + * -ENOMEM or -EINVAL on failure + */ +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, + struct bpf_dynptr *ptr__uninit) __ksym; + +/* Description + * Install the buffer pointed to by 'ptr' as optval. + * Returns + * 0 on success + * -EINVAL if the buffer is too small + */ +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + +/* Description + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. + * Returns + * 0 on success + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc + */ +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + #endif diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h index 642dda0e758a..772040225257 100644 --- a/tools/testing/selftests/bpf/bpf_kfuncs.h +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h @@ -41,4 +41,45 @@ extern bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *ptr) __ksym; extern __u32 bpf_dynptr_size(const struct bpf_dynptr *ptr) __ksym; extern int bpf_dynptr_clone(const struct bpf_dynptr *ptr, struct bpf_dynptr *clone__init) __ksym; +extern int bpf_sockopt_dynptr_copy_to(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + +/* Description + * Allocate a buffer of 'size' bytes for being installed as optval. + * Returns + * > 0 on success, the size of the allocated buffer + * -ENOMEM or -EINVAL on failure + */ +extern int bpf_sockopt_dynptr_alloc(struct bpf_sockopt *sopt, int size, + struct bpf_dynptr *ptr__uninit) __ksym; + +/* Description + * Install the buffer pointed to by 'ptr' as optval. + * Returns + * 0 on success + * -EINVAL if the buffer is too small + */ +extern int bpf_sockopt_dynptr_install(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + +/* Description + * Release the buffer allocated by bpf_sockopt_dynptr_alloc. + * Returns + * 0 on success + * -EINVAL if the buffer was not allocated by bpf_sockopt_dynptr_alloc + */ +extern int bpf_sockopt_dynptr_release(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr) __ksym; + +/* Description + * Initialize a dynptr to access the content of optval passing + * to {get,set}sockopt()s. 
+ * Returns + * > 0 on success, the size of the allocated buffer + * -ENOMEM or -EINVAL on failure + */ +extern int bpf_sockopt_dynptr_from(struct bpf_sockopt *sopt, + struct bpf_dynptr *ptr__uninit, + unsigned int size) __ksym; + #endif diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index 05d0e07da394..85255648747f 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -92,6 +92,7 @@ static int getsetsockopt(void) } if (buf.u8[0] != 0x01) { log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]); + log_err("optlen %d", optlen); goto err; } @@ -220,7 +221,7 @@ static int getsetsockopt(void) return -1; } -static void run_test(int cgroup_fd) +static void run_test_nonsleepable(int cgroup_fd) { struct sockopt_sk *skel; @@ -246,6 +247,106 @@ static void run_test(int cgroup_fd) sockopt_sk__destroy(skel); } +static void run_test_nonsleepable_mixed(int cgroup_fd) +{ + struct sockopt_sk *skel; + + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + skel->bss->page_size = getpagesize(); + skel->bss->skip_sleepable = 1; + + skel->links._setsockopt_s = + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link (sleepable)")) + goto cleanup; + + skel->links._getsockopt_s = + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link (sleepable)")) + goto cleanup; + + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link")) + goto cleanup; + + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link")) + goto cleanup; + + ASSERT_OK(getsetsockopt(), "getsetsockopt"); + +cleanup: + sockopt_sk__destroy(skel); +} + +static void run_test_sleepable(int cgroup_fd) +{ + struct sockopt_sk *skel; + + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + skel->bss->page_size = getpagesize(); + + skel->links._setsockopt_s = + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) + goto cleanup; + + skel->links._getsockopt_s = + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) + goto cleanup; + + ASSERT_OK(getsetsockopt(), "getsetsockopt"); + +cleanup: + sockopt_sk__destroy(skel); +} + +static void run_test_sleepable_mixed(int cgroup_fd) +{ + struct sockopt_sk *skel; + + skel = sockopt_sk__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + skel->bss->page_size = getpagesize(); + skel->bss->skip_nonsleepable = 1; + + skel->links._setsockopt = + bpf_program__attach_cgroup(skel->progs._setsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt, "setsockopt_link (nonsleepable)")) + goto cleanup; + + skel->links._getsockopt = + bpf_program__attach_cgroup(skel->progs._getsockopt, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt, "getsockopt_link (nonsleepable)")) + goto cleanup; + + skel->links._setsockopt_s = + bpf_program__attach_cgroup(skel->progs._setsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._setsockopt_s, "setsockopt_link")) + goto 
cleanup; + + skel->links._getsockopt_s = + bpf_program__attach_cgroup(skel->progs._getsockopt_s, cgroup_fd); + if (!ASSERT_OK_PTR(skel->links._getsockopt_s, "getsockopt_link")) + goto cleanup; + + ASSERT_OK(getsetsockopt(), "getsetsockopt"); + +cleanup: + sockopt_sk__destroy(skel); +} + void test_sockopt_sk(void) { int cgroup_fd; @@ -254,6 +355,13 @@ void test_sockopt_sk(void) if (!ASSERT_GE(cgroup_fd, 0, "join_cgroup /sockopt_sk")) return; - run_test(cgroup_fd); + if (test__start_subtest("nonsleepable")) + run_test_nonsleepable(cgroup_fd); + if (test__start_subtest("sleepable")) + run_test_sleepable(cgroup_fd); + if (test__start_subtest("nonsleepable_mixed")) + run_test_nonsleepable_mixed(cgroup_fd); + if (test__start_subtest("sleepable_mixed")) + run_test_sleepable_mixed(cgroup_fd); close(cgroup_fd); } diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index cb990a7d3d45..efacd3b88c40 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -5,10 +5,16 @@ #include <netinet/in.h> #include <bpf/bpf_helpers.h> +typedef int bool; +#include "bpf_kfuncs.h" + char _license[] SEC("license") = "GPL"; int page_size = 0; /* userspace should set it */ +int skip_sleepable = 0; +int skip_nonsleepable = 0; + #ifndef SOL_TCP #define SOL_TCP IPPROTO_TCP #endif @@ -34,6 +40,9 @@ int _getsockopt(struct bpf_sockopt *ctx) struct sockopt_sk *storage; struct bpf_sock *sk; + if (skip_nonsleepable) + return 1; + /* Bypass AF_NETLINK. */ sk = ctx->sk; if (sk && sk->family == AF_NETLINK) @@ -136,6 +145,134 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } +SEC("cgroup/getsockopt.s") +int _getsockopt_s(struct bpf_sockopt *ctx) +{ + struct tcp_zerocopy_receive *zcvr; + struct bpf_dynptr optval_dynptr; + struct sockopt_sk *storage; + __u8 *optval, *optval_end; + struct bpf_sock *sk; + char buf[1]; + __u64 addr; + int ret; + + if (skip_sleepable) + return 1; + + /* Bypass AF_NETLINK. */ + sk = ctx->sk; + if (sk && sk->family == AF_NETLINK) + return 1; + + optval = ctx->optval; + optval_end = ctx->optval_end; + + /* Make sure bpf_get_netns_cookie is callable. + */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { + /* Not interested in SOL_IP:IP_TOS; + * let next BPF program in the cgroup chain or kernel + * handle it. + */ + return 1; + } + + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { + /* Not interested in SOL_SOCKET:SO_SNDBUF; + * let next BPF program in the cgroup chain or kernel + * handle it. + */ + return 1; + } + + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { + /* Not interested in SOL_TCP:TCP_CONGESTION; + * let next BPF program in the cgroup chain or kernel + * handle it. + */ + return 1; + } + + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. + * It has a custom implementation for performance + * reasons. + */ + + bpf_sockopt_dynptr_from(ctx, &optval_dynptr, sizeof(*zcvr)); + zcvr = bpf_dynptr_data(&optval_dynptr, 0, sizeof(*zcvr)); + addr = zcvr ? zcvr->address : 0; + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); + + return addr != 0 ? 
0 : 1; + } + + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { + if (optval + 1 > optval_end) + return 0; /* bounds check */ + + ctx->retval = 0; /* Reset system call return value to zero */ + + /* Always export 0x55 */ + buf[0] = 0x55; + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); + if (ret >= 0) { + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); + } + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); + if (ret < 0) + return 0; + ctx->optlen = 1; + + /* Userspace buffer is PAGE_SIZE * 2, but BPF + * program can only see the first PAGE_SIZE + * bytes of data. + */ + if (optval_end - optval != page_size && 0) + return 0; /* unexpected data size */ + + return 1; + } + + if (ctx->level != SOL_CUSTOM) + return 0; /* deny everything except custom level */ + + if (optval + 1 > optval_end) + return 0; /* bounds check */ + + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!storage) + return 0; /* couldn't get sk storage */ + + if (!ctx->retval) + return 0; /* kernel should not have handled + * SOL_CUSTOM, something is wrong! + */ + ctx->retval = 0; /* Reset system call return value to zero */ + + buf[0] = storage->val; + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_dynptr); + if (ret >= 0) { + bpf_dynptr_write(&optval_dynptr, 0, buf, 1, 0); + ret = bpf_sockopt_dynptr_copy_to(ctx, &optval_dynptr); + } + bpf_sockopt_dynptr_release(ctx, &optval_dynptr); + if (ret < 0) + return 0; + ctx->optlen = 1; + + return 1; +} + SEC("cgroup/setsockopt") int _setsockopt(struct bpf_sockopt *ctx) { @@ -144,6 +281,9 @@ int _setsockopt(struct bpf_sockopt *ctx) struct sockopt_sk *storage; struct bpf_sock *sk; + if (skip_nonsleepable) + return 1; + /* Bypass AF_NETLINK. */ sk = ctx->sk; if (sk && sk->family == AF_NETLINK) @@ -236,3 +376,120 @@ int _setsockopt(struct bpf_sockopt *ctx) ctx->optlen = 0; return 1; } + +SEC("cgroup/setsockopt.s") +int _setsockopt_s(struct bpf_sockopt *ctx) +{ + struct bpf_dynptr optval_buf; + struct sockopt_sk *storage; + __u8 *optval, *optval_end; + struct bpf_sock *sk; + __u8 tmp_u8; + __u32 tmp; + int ret; + + if (skip_sleepable) + return 1; + + optval = ctx->optval; + optval_end = ctx->optval_end; + + /* Bypass AF_NETLINK. */ + sk = ctx->sk; + if (sk && sk->family == AF_NETLINK) + return -1; + + /* Make sure bpf_get_netns_cookie is callable. + */ + if (bpf_get_netns_cookie(NULL) == 0) + return 0; + + if (bpf_get_netns_cookie(ctx) == 0) + return 0; + + if (ctx->level == SOL_IP && ctx->optname == IP_TOS) { + /* Not interested in SOL_IP:IP_TOS; + * let next BPF program in the cgroup chain or kernel + * handle it. + */ + ctx->optlen = 0; /* bypass optval>PAGE_SIZE */ + return 1; + } + + if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) { + /* Overwrite SO_SNDBUF value */ + + ret = bpf_sockopt_dynptr_alloc(ctx, sizeof(__u32), + &optval_buf); + if (ret < 0) + bpf_sockopt_dynptr_release(ctx, &optval_buf); + else { + tmp = 0x55AA; + bpf_dynptr_write(&optval_buf, 0, &tmp, sizeof(tmp), 0); + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); + } + + return ret >= 0 ? 
1 : 0; + } + + if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) { + /* Always use cubic */ + + ret = bpf_sockopt_dynptr_alloc(ctx, 5, &optval_buf); + if (ret < 0) { + bpf_sockopt_dynptr_release(ctx, &optval_buf); + return 0; + } + bpf_dynptr_write(&optval_buf, 0, "cubic", 5, 0); + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); + if (ret < 0) + return 0; + ctx->optlen = 5; + + return 1; + } + + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { + /* Original optlen is larger than PAGE_SIZE. */ + if (ctx->optlen != page_size * 2) + return 0; /* unexpected data size */ + + ret = bpf_sockopt_dynptr_alloc(ctx, 1, &optval_buf); + if (ret < 0) { + bpf_sockopt_dynptr_release(ctx, &optval_buf); + return 0; + } + tmp_u8 = 0; + bpf_dynptr_write(&optval_buf, 0, &tmp_u8, 1, 0); + ret = bpf_sockopt_dynptr_install(ctx, &optval_buf); + if (ret < 0) + return 0; + ctx->optlen = 1; + + return 1; + } + + if (ctx->level != SOL_CUSTOM) + return 0; /* deny everything except custom level */ + + if (optval + 1 > optval_end) + return 0; /* bounds check */ + + storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!storage) + return 0; /* couldn't get sk storage */ + + bpf_sockopt_dynptr_from(ctx, &optval_buf, sizeof(__u8)); + optval = bpf_dynptr_data(&optval_buf, 0, sizeof(__u8)); + if (optval) { + storage->val = *optval; + ctx->optlen = -1; /* BPF has consumed this option, don't call + * kernel setsockopt handler. + */ + } + bpf_sockopt_dynptr_release(ctx, &optval_buf); + + return optval ? 1 : 0; +} + diff --git a/tools/testing/selftests/bpf/verifier/sleepable.c b/tools/testing/selftests/bpf/verifier/sleepable.c index 1f0d2bdc673f..4b6c1117ec9f 100644 --- a/tools/testing/selftests/bpf/verifier/sleepable.c +++ b/tools/testing/selftests/bpf/verifier/sleepable.c @@ -85,7 +85,7 @@ .expected_attach_type = BPF_TRACE_RAW_TP, .kfunc = "sched_switch", .result = REJECT, - .errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, and struct_ops programs can be sleepable", + .errstr = "Only fentry/fexit/fmod_ret, lsm, iter, uprobe, cgroup, and struct_ops programs can be sleepable", .flags = BPF_F_SLEEPABLE, .runs = -1, },
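For anyone trying the selftest changes above by hand, a rough userspace sketch of loading the object and attaching the sleepable getsockopt program with plain libbpf calls, outside the skeleton used by the selftest; the object and cgroup paths are placeholders and error handling is trimmed:

#include <fcntl.h>
#include <bpf/libbpf.h>

/* Sketch: attach the "cgroup/getsockopt.s" program to a cgroup. */
int attach_sleepable_getsockopt(const char *obj_path, const char *cgroup_path)
{
        struct bpf_object *obj;
        struct bpf_program *prog;
        struct bpf_link *link;
        int cgroup_fd;

        obj = bpf_object__open_file(obj_path, NULL);
        if (!obj || bpf_object__load(obj))
                return -1;

        prog = bpf_object__find_program_by_name(obj, "_getsockopt_s");
        if (!prog)
                return -1;

        cgroup_fd = open(cgroup_path, O_RDONLY);
        if (cgroup_fd < 0)
                return -1;

        link = bpf_program__attach_cgroup(prog, cgroup_fd);
        return link ? 0 : -1;
}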