Message ID | 20240510192412.3297104-18-amery.hung@bytedance.com
---|---
State | RFC
Delegated to | BPF
Series | bpf qdisc

Context | Check | Description
---|---|---
netdev/tree_selection | success | Guessing tree name failed - patch did not apply, async
On 05/10, Amery Hung wrote:
> This selftest shows a bare minimum fifo qdisc, which simply enqueues skbs
> into the back of a bpf list and dequeues from the front of the list.
>
> Signed-off-by: Amery Hung <amery.hung@bytedance.com>

[..]

> +	bpf_spin_lock(&q_fifo_lock);
> +	bpf_list_excl_push_back(&q_fifo, &skb->bpf_list);
> +	bpf_spin_unlock(&q_fifo_lock);

Can you also expand a bit on the locking here and elsewhere? And how it
interplays with TCQ_F_NOLOCK?

As I mentioned at lsfmmbpf, I don't think there is a lot of similar
locking in the existing C implementations? So why do we need it here?
On Mon, May 20, 2024 at 8:15 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> On 05/10, Amery Hung wrote:
> > This selftest shows a bare minimum fifo qdisc, which simply enqueues skbs
> > into the back of a bpf list and dequeues from the front of the list.
>
> [..]
>
> > +	bpf_spin_lock(&q_fifo_lock);
> > +	bpf_list_excl_push_back(&q_fifo, &skb->bpf_list);
> > +	bpf_spin_unlock(&q_fifo_lock);
>
> Can you also expand a bit on the locking here and elsewhere? And how it
> interplays with TCQ_F_NOLOCK?
>
> As I mentioned at lsfmmbpf, I don't think there is a lot of similar
> locking in the existing C implementations? So why do we need it here?

The locks are required to prevent catastrophic concurrent accesses to
bpf graphs. The verifier will check 1) if there is a spin_lock in the
same struct with a list head or rbtree root, and 2) that the lock is held
when accessing the list or rbtree.

Since we have the safety guarantee provided by the verifier, I think
there is an opportunity to allow qdisc users to set TCQ_F_NOLOCK. I will
check if qdisc kfuncs are TCQ_F_NOLOCK safe though. Let me know if I
missed anything.

Thanks,
Amery
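For reference, the constraint Amery describes looks like the following minimal sketch when written against the bpf_experimental.h list API. This is a hypothetical example, not code from this series; "foo", "glock", "ghead", and push_one() are illustrative names, and private() is the same section macro used in bpf_qdisc_fifo.c below:

	/* The bpf_spin_lock is declared in the same section as the
	 * bpf_list_head it protects; the verifier ties the two together
	 * and rejects any list access made without holding the lock.
	 */
	private(A) struct bpf_spin_lock glock;
	private(A) struct bpf_list_head ghead __contains(foo, node);

	struct foo {
		struct bpf_list_node node;
		int data;
	};

	static int push_one(int data)
	{
		struct foo *f = bpf_obj_new(typeof(*f));

		if (!f)
			return -ENOMEM;
		f->data = data;

		bpf_spin_lock(&glock);
		bpf_list_push_back(&ghead, &f->node); /* allowed: lock held */
		bpf_spin_unlock(&glock);
		return 0;
	}

Pushing to ghead outside the bpf_spin_lock/bpf_spin_unlock pair would fail verification, which is the safety guarantee the reply above relies on.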
On Tue, May 21, 2024 at 8:03 AM Amery Hung <ameryhung@gmail.com> wrote:
>
> On Mon, May 20, 2024 at 8:15 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > Can you also expand a bit on the locking here and elsewhere? And how it
> > interplays with TCQ_F_NOLOCK?
> >
> > As I mentioned at lsfmmbpf, I don't think there is a lot of similar
> > locking in the existing C implementations? So why do we need it here?
>
> The locks are required to prevent catastrophic concurrent accesses to
> bpf graphs. The verifier will check 1) if there is a spin_lock in the
> same struct with a list head or rbtree root, and 2) that the lock is held
> when accessing the list or rbtree.
>
> Since we have the safety guarantee provided by the verifier, I think
> there is an opportunity to allow qdisc users to set TCQ_F_NOLOCK. I will
> check if qdisc kfuncs are TCQ_F_NOLOCK safe though. Let me know if I
> missed anything.

Ah, so these locking constraints come from the verifier... In this
case, yes, it would be nice to have special treatment for bpf qdisc (or
maybe allow passing TCQ_F_NOLOCK somehow). If the verifier enforces the
locking for the underlying data structures, we should try to remove the
ones from the generic qdisc layer.
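For context, TCQ_F_NOLOCK is the flag that lets the core transmit path skip the per-qdisc root lock. A rough sketch of the relevant branch, heavily simplified from __dev_xmit_skb() in net/core/dev.c (the empty-qdisc bypass, recursion, and deactivation checks are omitted here):

	if (q->flags & TCQ_F_NOLOCK) {
		/* Lock-free qdisc: enqueue and run without taking
		 * qdisc_lock(q), so the qdisc itself must tolerate
		 * concurrent enqueue/dequeue callers.
		 */
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);
	} else {
		/* Default: every enqueue/dequeue is serialized through
		 * the per-qdisc root lock.
		 */
		spin_lock(qdisc_lock(q));
		rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
		qdisc_run(q);
		spin_unlock(qdisc_lock(q));
	}

With the verifier already enforcing a bpf_spin_lock around every list or rbtree access, a bpf qdisc that only touches verifier-protected structures could in principle run in the first branch, which is the opportunity discussed above.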
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
new file mode 100644
index 000000000000..295d0216e70f
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
@@ -0,0 +1,161 @@
+#include <linux/pkt_sched.h>
+#include <linux/rtnetlink.h>
+#include <test_progs.h>
+
+#include "network_helpers.h"
+#include "bpf_qdisc_fifo.skel.h"
+
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
+#define LO_IFINDEX 1
+
+static const unsigned int total_bytes = 10 * 1024 * 1024;
+static int stop;
+
+static void *server(void *arg)
+{
+	int lfd = (int)(long)arg, err = 0, fd;
+	ssize_t nr_sent = 0, bytes = 0;
+	char batch[1500];
+
+	fd = accept(lfd, NULL, NULL);
+	while (fd == -1) {
+		if (errno == EINTR)
+			continue;
+		err = -errno;
+		goto done;
+	}
+
+	if (settimeo(fd, 0)) {
+		err = -errno;
+		goto done;
+	}
+
+	while (bytes < total_bytes && !READ_ONCE(stop)) {
+		nr_sent = send(fd, &batch,
+			       MIN(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_sent == -1 && errno == EINTR)
+			continue;
+		if (nr_sent == -1) {
+			err = -errno;
+			break;
+		}
+		bytes += nr_sent;
+	}
+
+	ASSERT_EQ(bytes, total_bytes, "send");
+
+done:
+	if (fd >= 0)
+		close(fd);
+	if (err) {
+		WRITE_ONCE(stop, 1);
+		return ERR_PTR(err);
+	}
+	return NULL;
+}
+
+static void do_test(char *qdisc)
+{
+	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = LO_IFINDEX,
+			    .attach_point = BPF_TC_QDISC,
+			    .parent = TC_H_ROOT,
+			    .handle = 0x8000000,
+			    .qdisc = qdisc);
+	struct sockaddr_in6 sa6 = {};
+	ssize_t nr_recv = 0, bytes = 0;
+	int lfd = -1, fd = -1;
+	pthread_t srv_thread;
+	socklen_t addrlen = sizeof(sa6);
+	void *thread_ret;
+	char batch[1500];
+	int err;
+
+	WRITE_ONCE(stop, 0);
+
+	err = bpf_tc_hook_create(&hook);
+	if (!ASSERT_OK(err, "attach qdisc"))
+		return;
+
+	lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
+	if (!ASSERT_NEQ(lfd, -1, "socket")) {
+		bpf_tc_hook_destroy(&hook);
+		return;
+	}
+
+	fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (!ASSERT_NEQ(fd, -1, "socket")) {
+		bpf_tc_hook_destroy(&hook);
+		close(lfd);
+		return;
+	}
+
+	if (settimeo(lfd, 0) || settimeo(fd, 0))
+		goto done;
+
+	err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen);
+	if (!ASSERT_NEQ(err, -1, "getsockname"))
+		goto done;
+
+	/* connect to server */
+	err = connect(fd, (struct sockaddr *)&sa6, addrlen);
+	if (!ASSERT_NEQ(err, -1, "connect"))
+		goto done;
+
+	err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
+	if (!ASSERT_OK(err, "pthread_create"))
+		goto done;
+
+	/* recv total_bytes */
+	while (bytes < total_bytes && !READ_ONCE(stop)) {
+		nr_recv = recv(fd, &batch,
+			       MIN(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_recv == -1 && errno == EINTR)
+			continue;
+		if (nr_recv == -1)
+			break;
+		bytes += nr_recv;
+	}
+
+	ASSERT_EQ(bytes, total_bytes, "recv");
+
+	WRITE_ONCE(stop, 1);
+	pthread_join(srv_thread, &thread_ret);
+	ASSERT_OK(IS_ERR(thread_ret), "thread_ret");
+
+done:
+	close(lfd);
+	close(fd);
+
+	bpf_tc_hook_destroy(&hook);
+	return;
+}
+
+static void test_fifo(void)
+{
+	struct bpf_qdisc_fifo *fifo_skel;
+	struct bpf_link *link;
+
+	fifo_skel = bpf_qdisc_fifo__open_and_load();
+	if (!ASSERT_OK_PTR(fifo_skel, "bpf_qdisc_fifo__open_and_load"))
+		return;
+
+	link = bpf_map__attach_struct_ops(fifo_skel->maps.fifo);
+	if (!ASSERT_OK_PTR(link, "bpf_map__attach_struct_ops")) {
+		bpf_qdisc_fifo__destroy(fifo_skel);
+		return;
+	}
+
+	do_test("bpf_fifo");
+
+	bpf_link__destroy(link);
+	bpf_qdisc_fifo__destroy(fifo_skel);
+}
+
+void test_bpf_qdisc(void)
+{
+	if (test__start_subtest("fifo"))
+		test_fifo();
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h
new file mode 100644
index 000000000000..96ab357de28e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_common.h
@@ -0,0 +1,23 @@
+#ifndef _BPF_QDISC_COMMON_H
+#define _BPF_QDISC_COMMON_H
+
+#define NET_XMIT_SUCCESS	0x00
+#define NET_XMIT_DROP		0x01	/* skb dropped */
+#define NET_XMIT_CN		0x02	/* congestion notification */
+
+#define TC_PRIO_CONTROL	7
+#define TC_PRIO_MAX	15
+
+void bpf_skb_set_dev(struct sk_buff *skb, struct Qdisc *sch) __ksym;
+u32 bpf_skb_get_hash(struct sk_buff *p) __ksym;
+void bpf_skb_release(struct sk_buff *p) __ksym;
+void bpf_qdisc_skb_drop(struct sk_buff *p, struct bpf_sk_buff_ptr *to_free) __ksym;
+void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) __ksym;
+bool bpf_qdisc_find_class(struct Qdisc *sch, u32 classid) __ksym;
+int bpf_qdisc_create_child(struct Qdisc *sch, u32 min,
+			   struct netlink_ext_ack *extack) __ksym;
+int bpf_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, u32 classid,
+		      struct bpf_sk_buff_ptr *to_free_list) __ksym;
+struct sk_buff *bpf_qdisc_dequeue(struct Qdisc *sch, u32 classid) __ksym;
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
new file mode 100644
index 000000000000..433fd9c3639c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
@@ -0,0 +1,83 @@
+#include <vmlinux.h>
+#include "bpf_experimental.h"
+#include "bpf_qdisc_common.h"
+
+char _license[] SEC("license") = "GPL";
+
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+
+private(B) struct bpf_spin_lock q_fifo_lock;
+private(B) struct bpf_list_head q_fifo __contains_kptr(sk_buff, bpf_list);
+
+unsigned int q_limit = 1000;
+unsigned int q_qlen = 0;
+
+SEC("struct_ops/bpf_fifo_enqueue")
+int BPF_PROG(bpf_fifo_enqueue, struct sk_buff *skb, struct Qdisc *sch,
+	     struct bpf_sk_buff_ptr *to_free)
+{
+	q_qlen++;
+	if (q_qlen > q_limit) {
+		bpf_qdisc_skb_drop(skb, to_free);
+		return NET_XMIT_DROP;
+	}
+
+	bpf_spin_lock(&q_fifo_lock);
+	bpf_list_excl_push_back(&q_fifo, &skb->bpf_list);
+	bpf_spin_unlock(&q_fifo_lock);
+
+	return NET_XMIT_SUCCESS;
+}
+
+SEC("struct_ops/bpf_fifo_dequeue")
+struct sk_buff *BPF_PROG(bpf_fifo_dequeue, struct Qdisc *sch)
+{
+	struct sk_buff *skb;
+	struct bpf_list_excl_node *node;
+
+	bpf_spin_lock(&q_fifo_lock);
+	node = bpf_list_excl_pop_front(&q_fifo);
+	bpf_spin_unlock(&q_fifo_lock);
+	if (!node)
+		return NULL;
+
+	skb = container_of(node, struct sk_buff, bpf_list);
+	bpf_skb_set_dev(skb, sch);
+	q_qlen--;
+
+	return skb;
+}
+
+static int reset_fifo(u32 index, void *ctx)
+{
+	struct bpf_list_excl_node *node;
+	struct sk_buff *skb;
+
+	bpf_spin_lock(&q_fifo_lock);
+	node = bpf_list_excl_pop_front(&q_fifo);
+	bpf_spin_unlock(&q_fifo_lock);
+
+	if (!node) {
+		return 1;
+	}
+
+	skb = container_of(node, struct sk_buff, bpf_list);
+	bpf_skb_release(skb);
+	return 0;
+}
+
+SEC("struct_ops/bpf_fifo_reset")
+void BPF_PROG(bpf_fifo_reset, struct Qdisc *sch)
+{
+	bpf_loop(q_qlen, reset_fifo, NULL, 0);
+	q_qlen = 0;
+}
+
+SEC(".struct_ops")
+struct Qdisc_ops fifo = {
+	.enqueue = (void *)bpf_fifo_enqueue,
+	.dequeue = (void *)bpf_fifo_dequeue,
+	.reset = (void *)bpf_fifo_reset,
+	.id = "bpf_fifo",
+};
This selftest shows a bare minimum fifo qdisc, which simply enqueues skbs
into the back of a bpf list and dequeues from the front of the list.

Signed-off-by: Amery Hung <amery.hung@bytedance.com>
---
 .../selftests/bpf/prog_tests/bpf_qdisc.c   | 161 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_qdisc_common.h |  23 +++
 .../selftests/bpf/progs/bpf_qdisc_fifo.c   |  83 +++++++++
 3 files changed, 267 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_qdisc.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_common.h
 create mode 100644 tools/testing/selftests/bpf/progs/bpf_qdisc_fifo.c
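For reference, loading and attaching this fifo from user space condenses to the following sequence, extracted from the selftest above. It relies on the BPF_TC_QDISC attach point and .qdisc field this series adds to libbpf's bpf_tc_hook, and error handling is omitted:

	struct bpf_qdisc_fifo *skel = bpf_qdisc_fifo__open_and_load();
	struct bpf_link *link = bpf_map__attach_struct_ops(skel->maps.fifo);
	DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook,
			    .ifindex = 1,		/* lo */
			    .attach_point = BPF_TC_QDISC,
			    .parent = TC_H_ROOT,
			    .handle = 0x8000000,
			    .qdisc = "bpf_fifo");	/* matches fifo.id */

	bpf_tc_hook_create(&hook);	/* installs bpf_fifo as the root qdisc */

	/* ... run traffic through the interface ... */

	bpf_tc_hook_destroy(&hook);
	bpf_link__destroy(link);
	bpf_qdisc_fifo__destroy(skel);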