diff mbox series

[bpf-next,v1,3/3] selftests/bpf: tests for using dynptrs to parse skb and xdp buffers

Message ID 20220726184706.954822-4-joannelkoong@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Add skb + xdp dynptrs | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/apply fail Patch does not apply to bpf-next

Commit Message

Joanne Koong July 26, 2022, 6:47 p.m. UTC
Test skb and xdp dynptr functionality in the following ways:

1) progs/test_xdp.c
   * Change existing test to use dynptrs to parse xdp data

     There were no noticeable diferences in user + system time between
     the original version vs. using dynptrs. Averaging the time for 10
     runs (run using "time ./test_progs -t xdp_bpf2bpf"):
         original version: 0.0449 sec
         with dynptrs: 0.0429 sec

2) progs/test_l4lb_noinline.c
   * Change existing test to use dynptrs to parse skb data

     There were no noticeable diferences in user + system time between
     the original version vs. using dynptrs. Averaging the time for 10
     runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
         original version: 0.0502 sec
         with dynptrs: 0.055 sec

     For number of processed verifier instructions:
         original version: 6284 insns
         with dynptrs: 2538 insns

3) progs/test_dynptr_xdp.c
   * Add sample code for parsing tcp hdr opt lookup using dynptrs.
     This logic is lifted from a real-world use case of packet parsing in
     katran [0], a layer 4 load balancer

4) progs/dynptr_success.c
   * Add test case "test_skb_readonly" for testing attempts at writes /
     data slices on a prog type with read-only skb ctx.

5) progs/dynptr_fail.c
   * Add test cases "skb_invalid_data_slice" and
     "xdp_invalid_data_slice" for testing that helpers that modify the
     underlying packet buffer automatically invalidate the associated
     data slice.
   * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
     that prog types that do not support bpf_dynptr_from_skb/xdp don't
     have access to the API.

[0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h

Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
 .../testing/selftests/bpf/prog_tests/dynptr.c |  85 ++++++++++---
 .../selftests/bpf/prog_tests/dynptr_xdp.c     |  49 ++++++++
 .../testing/selftests/bpf/progs/dynptr_fail.c |  76 ++++++++++++
 .../selftests/bpf/progs/dynptr_success.c      |  32 +++++
 .../selftests/bpf/progs/test_dynptr_xdp.c     | 115 ++++++++++++++++++
 .../selftests/bpf/progs/test_l4lb_noinline.c  |  71 +++++------
 tools/testing/selftests/bpf/progs/test_xdp.c  |  95 +++++++--------
 .../selftests/bpf/test_tcp_hdr_options.h      |   1 +
 8 files changed, 416 insertions(+), 108 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c

Comments

Zvi Effron July 26, 2022, 7:44 p.m. UTC | #1
On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> Test skb and xdp dynptr functionality in the following ways:
>
> 1) progs/test_xdp.c
> * Change existing test to use dynptrs to parse xdp data
>
> There were no noticeable diferences in user + system time between
> the original version vs. using dynptrs. Averaging the time for 10
> runs (run using "time ./test_progs -t xdp_bpf2bpf"):
> original version: 0.0449 sec
> with dynptrs: 0.0429 sec
>
> 2) progs/test_l4lb_noinline.c
> * Change existing test to use dynptrs to parse skb data
>
> There were no noticeable diferences in user + system time between
> the original version vs. using dynptrs. Averaging the time for 10
> runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
> original version: 0.0502 sec
> with dynptrs: 0.055 sec
>
> For number of processed verifier instructions:
> original version: 6284 insns
> with dynptrs: 2538 insns
>
> 3) progs/test_dynptr_xdp.c
> * Add sample code for parsing tcp hdr opt lookup using dynptrs.
> This logic is lifted from a real-world use case of packet parsing in
> katran [0], a layer 4 load balancer
>
> 4) progs/dynptr_success.c
> * Add test case "test_skb_readonly" for testing attempts at writes /
> data slices on a prog type with read-only skb ctx.
>
> 5) progs/dynptr_fail.c
> * Add test cases "skb_invalid_data_slice" and
> "xdp_invalid_data_slice" for testing that helpers that modify the
> underlying packet buffer automatically invalidate the associated
> data slice.
> * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
> that prog types that do not support bpf_dynptr_from_skb/xdp don't
> have access to the API.
>
> [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> .../testing/selftests/bpf/prog_tests/dynptr.c | 85 ++++++++++---
> .../selftests/bpf/prog_tests/dynptr_xdp.c | 49 ++++++++
> .../testing/selftests/bpf/progs/dynptr_fail.c | 76 ++++++++++++
> .../selftests/bpf/progs/dynptr_success.c | 32 +++++
> .../selftests/bpf/progs/test_dynptr_xdp.c | 115 ++++++++++++++++++
> .../selftests/bpf/progs/test_l4lb_noinline.c | 71 +++++------
> tools/testing/selftests/bpf/progs/test_xdp.c | 95 +++++++--------
> .../selftests/bpf/test_tcp_hdr_options.h | 1 +
> 8 files changed, 416 insertions(+), 108 deletions(-)
> create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> index bcf80b9f7c27..c40631f33c7b 100644
> --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> @@ -2,6 +2,7 @@
> /* Copyright (c) 2022 Facebook */
>
> #include <test_progs.h>
> +#include <network_helpers.h>
> #include "dynptr_fail.skel.h"
> #include "dynptr_success.skel.h"
>
> @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
> static struct {
> const char *prog_name;
> const char *expected_err_msg;
> -} dynptr_tests[] = {
> - /* failure cases */
> +} verifier_error_tests[] = {
> + /* these cases should trigger a verifier error */
> {"ringbuf_missing_release1", "Unreleased reference id=1"},
> {"ringbuf_missing_release2", "Unreleased reference id=2"},
> {"ringbuf_missing_release_callback", "Unreleased reference id"},
> @@ -42,11 +43,25 @@ static struct {
> {"release_twice_callback", "arg 1 is an unacquired reference"},
> {"dynptr_from_mem_invalid_api",
> "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> + {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> + {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> + {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> + {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> +};
> +
> +enum test_setup_type {
> + SETUP_SYSCALL_SLEEP,
> + SETUP_SKB_PROG,
> +};
>
> - /* success cases */
> - {"test_read_write", NULL},
> - {"test_data_slice", NULL},
> - {"test_ringbuf", NULL},
> +static struct {
> + const char *prog_name;
> + enum test_setup_type type;
> +} runtime_tests[] = {
> + {"test_read_write", SETUP_SYSCALL_SLEEP},
> + {"test_data_slice", SETUP_SYSCALL_SLEEP},
> + {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> + {"test_skb_readonly", SETUP_SKB_PROG},
> };
>
> static void verify_fail(const char *prog_name, const char *expected_err_msg)
> @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
> dynptr_fail__destroy(skel);
> }
>
> -static void verify_success(const char *prog_name)
> +static void run_tests(const char *prog_name, enum test_setup_type setup_type)
> {
> struct dynptr_success *skel;
> struct bpf_program *prog;
> @@ -107,15 +122,42 @@ static void verify_success(const char *prog_name)
> if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
> goto cleanup;
>
> - link = bpf_program__attach(prog);
> - if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> - goto cleanup;
> + switch (setup_type) {
> + case SETUP_SYSCALL_SLEEP:
> + link = bpf_program__attach(prog);
> + if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> + goto cleanup;
>
> - usleep(1);
> + usleep(1);
>
> - ASSERT_EQ(skel->bss->err, 0, "err");
> + bpf_link__destroy(link);
> + break;
> + case SETUP_SKB_PROG:
> + {
> + int prog_fd, err;
> + char buf[64];
> +
> + prog_fd = bpf_program__fd(prog);
> + if (CHECK_FAIL(prog_fd < 0))
> + goto cleanup;
> +
> + LIBBPF_OPTS(bpf_test_run_opts, topts,
> + .data_in = &pkt_v4,
> + .data_size_in = sizeof(pkt_v4),
> + .data_out = buf,
> + .data_size_out = sizeof(buf),
> + .repeat = 1,
> + );
>
> - bpf_link__destroy(link);
> + err = bpf_prog_test_run_opts(prog_fd, &topts);
> +
> + if (!ASSERT_OK(err, "test_run"))
> + goto cleanup;
> +
> + break;
> + }
> + }
> + ASSERT_EQ(skel->bss->err, 0, "err");
>
> cleanup:
> dynptr_success__destroy(skel);
> @@ -125,14 +167,17 @@ void test_dynptr(void)
> {
> int i;
>
> - for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
> - if (!test__start_subtest(dynptr_tests[i].prog_name))
> + for (i = 0; i < ARRAY_SIZE(verifier_error_tests); i++) {
> + if (!test__start_subtest(verifier_error_tests[i].prog_name))
> + continue;
> +
> + verify_fail(verifier_error_tests[i].prog_name,
> + verifier_error_tests[i].expected_err_msg);
> + }
> + for (i = 0; i < ARRAY_SIZE(runtime_tests); i++) {
> + if (!test__start_subtest(runtime_tests[i].prog_name))
> continue;
>
> - if (dynptr_tests[i].expected_err_msg)
> - verify_fail(dynptr_tests[i].prog_name,
> - dynptr_tests[i].expected_err_msg);
> - else
> - verify_success(dynptr_tests[i].prog_name);
> + run_tests(runtime_tests[i].prog_name, runtime_tests[i].type);
> }
> }
> diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> new file mode 100644
> index 000000000000..ca775d126b60
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <test_progs.h>
> +#include <network_helpers.h>
> +#include "test_dynptr_xdp.skel.h"
> +#include "test_tcp_hdr_options.h"
> +
> +struct test_pkt {
> + struct ipv6_packet pk6_v6;
> + u8 options[16];
> +} __packed;
> +
> +void test_dynptr_xdp(void)
> +{
> + struct test_dynptr_xdp *skel;
> + char buf[128];
> + int err;
> +
> + skel = test_dynptr_xdp__open_and_load();
> + if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
> + return;
> +
> + struct test_pkt pkt = {
> + .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
> + .pk6_v6.iph.nexthdr = IPPROTO_TCP,
> + .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
> + .pk6_v6.tcp.urg_ptr = 123,
> + .pk6_v6.tcp.doff = 9, /* 16 bytes of options */
> +
> + .options = {
> + TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
> + skel->rodata->tcp_hdr_opt_kind_tpr, 6, 0, 0, 0, 9, TCPOPT_EOL
> + },
> + };
> +
> + LIBBPF_OPTS(bpf_test_run_opts, topts,
> + .data_in = &pkt,
> + .data_size_in = sizeof(pkt),
> + .data_out = buf,
> + .data_size_out = sizeof(buf),
> + .repeat = 3,
> + );
> +
> + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.xdp_ingress_v6), &topts);
> + ASSERT_OK(err, "ipv6 test_run");
> + ASSERT_EQ(skel->bss->server_id, 0x9000000, "server id");
> + ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
> +
> + test_dynptr_xdp__destroy(skel);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> index c1814938a5fd..4e3f853b2d02 100644
> --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
> +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> @@ -5,6 +5,7 @@
> #include <string.h>
> #include <linux/bpf.h>
> #include <bpf/bpf_helpers.h>
> +#include <linux/if_ether.h>
> #include "bpf_misc.h"
>
> char _license[] SEC("license") = "GPL";
> @@ -622,3 +623,78 @@ int dynptr_from_mem_invalid_api(void *ctx)
>
> return 0;
> }
> +
> +/* The data slice is invalidated whenever a helper changes packet data */
> +SEC("?tc")
> +int skb_invalid_data_slice(struct __sk_buff *skb)
> +{
> + struct bpf_dynptr ptr;
> + struct ethhdr *hdr;
> +
> + bpf_dynptr_from_skb(skb, 0, &ptr);
> + hdr = bpf_dynptr_data(&ptr, 0, sizeof(*hdr));
> + if (!hdr)
> + return SK_DROP;
> +
> + hdr->h_proto = 12;
> +
> + if (bpf_skb_pull_data(skb, skb->len))
> + return SK_DROP;
> +
> + /* this should fail */
> + hdr->h_proto = 1;
> +
> + return SK_PASS;
> +}
> +
> +/* The data slice is invalidated whenever a helper changes packet data */
> +SEC("?xdp")
> +int xdp_invalid_data_slice(struct xdp_md *xdp)
> +{
> + struct bpf_dynptr ptr;
> + struct ethhdr *hdr1, *hdr2;
> +
> + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> + hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> + if (!hdr1)
> + return XDP_DROP;
> +
> + hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> + if (!hdr2)
> + return XDP_DROP;
> +
> + hdr1->h_proto = 12;
> + hdr2->h_proto = 12;
> +
> + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> + return XDP_DROP;
> +
> + /* this should fail */
> + hdr2->h_proto = 1;
> +
> + return XDP_PASS;
> +}
> +
> +/* Only supported prog type can create skb-type dynptrs */
> +SEC("?raw_tp/sys_nanosleep")
> +int skb_invalid_ctx(void *ctx)
> +{
> + struct bpf_dynptr ptr;
> +
> + /* this should fail */
> + bpf_dynptr_from_skb(ctx, 0, &ptr);
> +
> + return 0;
> +}
> +
> +/* Only supported prog type can create xdp-type dynptrs */
> +SEC("?raw_tp/sys_nanosleep")
> +int xdp_invalid_ctx(void *ctx)
> +{
> + struct bpf_dynptr ptr;
> +
> + /* this should fail */
> + bpf_dynptr_from_xdp(ctx, 0, &ptr);
> +
> + return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
> index a3a6103c8569..ffddd6ddc7fb 100644
> --- a/tools/testing/selftests/bpf/progs/dynptr_success.c
> +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
> @@ -162,3 +162,35 @@ int test_ringbuf(void *ctx)
> bpf_ringbuf_discard_dynptr(&ptr, 0);
> return 0;
> }
> +
> +SEC("cgroup_skb/egress")
> +int test_skb_readonly(void *ctx)
> +{
> + __u8 write_data[2] = {1, 2};
> + struct bpf_dynptr ptr;
> + void *data;
> + int ret;
> +
> + err = 1;
> +
> + if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> + return 0;
> + err++;
> +
> + data = bpf_dynptr_data(&ptr, 0, 1);
> + if (data)
> + /* it's an error if data is not NULL since cgroup skbs
> + * are read only
> + */
> + return 0;
> + err++;
> +
> + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> + /* since cgroup skbs are read only, writes should fail */
> + if (ret != -EINVAL)
> + return 0;
> +
> + err = 0;
> +
> + return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> new file mode 100644
> index 000000000000..c879dfb6370a
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> @@ -0,0 +1,115 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* This logic is lifted from a real-world use case of packet parsing, used in
> + * the open source library katran, a layer 4 load balancer.
> + *
> + * This test demonstrates how to parse packet contents using dynptrs.
> + *
> + * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> + */
> +
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +#include <linux/tcp.h>
> +#include <stdbool.h>
> +#include <linux/ipv6.h>
> +#include <linux/if_ether.h>
> +#include "test_tcp_hdr_options.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* Arbitrarily picked unused value from IANA TCP Option Kind Numbers */
> +const __u32 tcp_hdr_opt_kind_tpr = 0xB7;

Should this instead be either 0xFD or 0xFE, as those are the two Kind numbers
allocated for experiments? Using a reserved value seems suboptimal, and
potentially risks updating one of the entries in [0] to have a double asterisk.

[0]: https://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-parameters-1

> +/* Length of the tcp header option */
> +const __u32 tcp_hdr_opt_len_tpr = 6;
> +/* maximum number of header options to check to lookup server_id */
> +const __u32 tcp_hdr_opt_max_opt_checks = 15;
> +
> +__u32 server_id;
> +
> +static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
> + __u32 *server_id)
> +{
> + __u8 *tcp_opt, kind, hdr_len;
> + __u8 *data;
> +
> + data = bpf_dynptr_data(ptr, *off, sizeof(kind) + sizeof(hdr_len) +
> + sizeof(*server_id));
> + if (!data)
> + return -1;
> +
> + kind = data[0];
> +
> + if (kind == TCPOPT_EOL)
> + return -1;
> +
> + if (kind == TCPOPT_NOP) {
> + *off += 1;
> + /* continue to the next option */
> + *hdr_bytes_remaining -= 1;
> +
> + return 0;
> + }
> +
> + if (*hdr_bytes_remaining < 2)
> + return -1;
> +
> + hdr_len = data[1];
> + if (hdr_len > *hdr_bytes_remaining)
> + return -1;
> +
> + if (kind == tcp_hdr_opt_kind_tpr) {
> + if (hdr_len != tcp_hdr_opt_len_tpr)
> + return -1;
> +
> + memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
> + return 1;
> + }
> +
> + *off += hdr_len;
> + *hdr_bytes_remaining -= hdr_len;
> +
> + return 0;
> +}
> +
> +SEC("xdp")
> +int xdp_ingress_v6(struct xdp_md *xdp)
> +{
> + __u8 hdr_bytes_remaining;
> + struct tcphdr *tcp_hdr;
> + __u8 tcp_hdr_opt_len;
> + int err = 0;
> + __u32 off;
> +
> + struct bpf_dynptr ptr;
> +
> + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> +
> + off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
> +
> + tcp_hdr = bpf_dynptr_data(&ptr, off, sizeof(*tcp_hdr));
> + if (!tcp_hdr)
> + return XDP_DROP;
> +
> + tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
> + if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
> + return XDP_DROP;
> +
> + hdr_bytes_remaining = tcp_hdr_opt_len;
> +
> + off += sizeof(struct tcphdr);
> +
> + /* max number of bytes of options in tcp header is 40 bytes */
> + for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
> + err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
> +
> + if (err || !hdr_bytes_remaining)
> + break;
> + }
> +
> + if (!server_id)
> + return XDP_DROP;
> +
> + return XDP_PASS;
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> index c8bc0c6947aa..1fef7868ea8b 100644
> --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> @@ -230,21 +230,18 @@ static __noinline bool get_packet_dst(struct real_definition **real,
> return true;
> }
>
> -static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> +static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
> struct packet_description *pckt)
> {
> struct icmp6hdr *icmp_hdr;
> struct ipv6hdr *ip6h;
>
> - icmp_hdr = data + off;
> - if (icmp_hdr + 1 > data_end)
> + icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*ip6h));
> + if (!icmp_hdr)
> return TC_ACT_SHOT;
> if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
> return TC_ACT_OK;
> - off += sizeof(struct icmp6hdr);
> - ip6h = data + off;
> - if (ip6h + 1 > data_end)
> - return TC_ACT_SHOT;
> + ip6h = (struct ipv6hdr *)(icmp_hdr + 1);
> pckt->proto = ip6h->nexthdr;
> pckt->flags |= F_ICMP;
> memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
> @@ -252,22 +249,19 @@ static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> return TC_ACT_UNSPEC;
> }
>
> -static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> +static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
> struct packet_description *pckt)
> {
> struct icmphdr *icmp_hdr;
> struct iphdr *iph;
>
> - icmp_hdr = data + off;
> - if (icmp_hdr + 1 > data_end)
> + icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*iph));
> + if (!icmp_hdr)
> return TC_ACT_SHOT;
> if (icmp_hdr->type != ICMP_DEST_UNREACH ||
> icmp_hdr->code != ICMP_FRAG_NEEDED)
> return TC_ACT_OK;
> - off += sizeof(struct icmphdr);
> - iph = data + off;
> - if (iph + 1 > data_end)
> - return TC_ACT_SHOT;
> + iph = (struct iphdr *)(icmp_hdr + 1);
> if (iph->ihl != 5)
> return TC_ACT_SHOT;
> pckt->proto = iph->protocol;
> @@ -277,13 +271,13 @@ static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> return TC_ACT_UNSPEC;
> }
>
> -static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
> +static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off,
> struct packet_description *pckt)
> {
> struct udphdr *udp;
> - udp = data + off;
>
> - if (udp + 1 > data_end)
> + udp = bpf_dynptr_data(skb_ptr, off, sizeof(*udp));
> + if (!udp)
> return false;
>
> if (!(pckt->flags & F_ICMP)) {
> @@ -296,13 +290,13 @@ static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
> return true;
> }
>
> -static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
> +static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off,
> struct packet_description *pckt)
> {
> struct tcphdr *tcp;
>
> - tcp = data + off;
> - if (tcp + 1 > data_end)
> + tcp = bpf_dynptr_data(skb_ptr, off, sizeof(*tcp));
> + if (!tcp)
> return false;
>
> if (tcp->syn)
> @@ -318,12 +312,11 @@ static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
> return true;
> }
>
> -static __noinline int process_packet(void *data, __u64 off, void *data_end,
> +static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
> + struct eth_hdr *eth, __u64 off,
> bool is_ipv6, struct __sk_buff *skb)
> {
> - void *pkt_start = (void *)(long)skb->data;
> struct packet_description pckt = {};
> - struct eth_hdr *eth = pkt_start;
> struct bpf_tunnel_key tkey = {};
> struct vip_stats *data_stats;
> struct real_definition *dst;
> @@ -344,8 +337,8 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
>
> tkey.tunnel_ttl = 64;
> if (is_ipv6) {
> - ip6h = data + off;
> - if (ip6h + 1 > data_end)
> + ip6h = bpf_dynptr_data(skb_ptr, off, sizeof(*ip6h));
> + if (!ip6h)
> return TC_ACT_SHOT;
>
> iph_len = sizeof(struct ipv6hdr);
> @@ -356,7 +349,7 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> if (protocol == IPPROTO_FRAGMENT) {
> return TC_ACT_SHOT;
> } else if (protocol == IPPROTO_ICMPV6) {
> - action = parse_icmpv6(data, data_end, off, &pckt);
> + action = parse_icmpv6(skb_ptr, off, &pckt);
> if (action >= 0)
> return action;
> off += IPV6_PLUS_ICMP_HDR;
> @@ -365,10 +358,8 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
> }
> } else {
> - iph = data + off;
> - if (iph + 1 > data_end)
> - return TC_ACT_SHOT;
> - if (iph->ihl != 5)
> + iph = bpf_dynptr_data(skb_ptr, off, sizeof(*iph));
> + if (!iph || iph->ihl != 5)
> return TC_ACT_SHOT;
>
> protocol = iph->protocol;
> @@ -379,7 +370,7 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> if (iph->frag_off & PCKT_FRAGMENTED)
> return TC_ACT_SHOT;
> if (protocol == IPPROTO_ICMP) {
> - action = parse_icmp(data, data_end, off, &pckt);
> + action = parse_icmp(skb_ptr, off, &pckt);
> if (action >= 0)
> return action;
> off += IPV4_PLUS_ICMP_HDR;
> @@ -391,10 +382,10 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> protocol = pckt.proto;
>
> if (protocol == IPPROTO_TCP) {
> - if (!parse_tcp(data, off, data_end, &pckt))
> + if (!parse_tcp(skb_ptr, off, &pckt))
> return TC_ACT_SHOT;
> } else if (protocol == IPPROTO_UDP) {
> - if (!parse_udp(data, off, data_end, &pckt))
> + if (!parse_udp(skb_ptr, off, &pckt))
> return TC_ACT_SHOT;
> } else {
> return TC_ACT_SHOT;
> @@ -450,20 +441,22 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> SEC("tc")
> int balancer_ingress(struct __sk_buff *ctx)
> {
> - void *data_end = (void *)(long)ctx->data_end;
> - void *data = (void *)(long)ctx->data;
> - struct eth_hdr *eth = data;
> + struct bpf_dynptr ptr;
> + struct eth_hdr *eth;
> __u32 eth_proto;
> __u32 nh_off;
>
> nh_off = sizeof(struct eth_hdr);
> - if (data + nh_off > data_end)
> +
> + bpf_dynptr_from_skb(ctx, 0, &ptr);
> + eth = bpf_dynptr_data(&ptr, 0, sizeof(*eth));
> + if (!eth)
> return TC_ACT_SHOT;
> eth_proto = eth->eth_proto;
> if (eth_proto == bpf_htons(ETH_P_IP))
> - return process_packet(data, nh_off, data_end, false, ctx);
> + return process_packet(&ptr, eth, nh_off, false, ctx);
> else if (eth_proto == bpf_htons(ETH_P_IPV6))
> - return process_packet(data, nh_off, data_end, true, ctx);
> + return process_packet(&ptr, eth, nh_off, true, ctx);
> else
> return TC_ACT_SHOT;
> }
> diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c
> index d7a9a74b7245..2272b56a8777 100644
> --- a/tools/testing/selftests/bpf/progs/test_xdp.c
> +++ b/tools/testing/selftests/bpf/progs/test_xdp.c
> @@ -20,6 +20,12 @@
> #include <bpf/bpf_endian.h>
> #include "test_iptunnel_common.h"
>
> +const size_t tcphdr_sz = sizeof(struct tcphdr);
> +const size_t udphdr_sz = sizeof(struct udphdr);
> +const size_t ethhdr_sz = sizeof(struct ethhdr);
> +const size_t iphdr_sz = sizeof(struct iphdr);
> +const size_t ipv6hdr_sz = sizeof(struct ipv6hdr);
> +
> struct {
> __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> __uint(max_entries, 256);
> @@ -43,8 +49,7 @@ static __always_inline void count_tx(__u32 protocol)
> *rxcnt_count += 1;
> }
>
> -static __always_inline int get_dport(void *trans_data, void *data_end,
> - __u8 protocol)
> +static __always_inline int get_dport(void *trans_data, __u8 protocol)
> {
> struct tcphdr *th;
> struct udphdr *uh;
> @@ -52,13 +57,9 @@ static __always_inline int get_dport(void *trans_data, void *data_end,
> switch (protocol) {
> case IPPROTO_TCP:
> th = (struct tcphdr *)trans_data;
> - if (th + 1 > data_end)
> - return -1;
> return th->dest;
> case IPPROTO_UDP:
> uh = (struct udphdr *)trans_data;
> - if (uh + 1 > data_end)
> - return -1;
> return uh->dest;
> default:
> return 0;
> @@ -75,14 +76,13 @@ static __always_inline void set_ethhdr(struct ethhdr *new_eth,
> new_eth->h_proto = h_proto;
> }
>
> -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> {
> - void *data_end = (void *)(long)xdp->data_end;
> - void *data = (void *)(long)xdp->data;
> + struct bpf_dynptr new_xdp_ptr;
> struct iptnl_info *tnl;
> struct ethhdr *new_eth;
> struct ethhdr *old_eth;
> - struct iphdr *iph = data + sizeof(struct ethhdr);
> + struct iphdr *iph;
> __u16 *next_iph;
> __u16 payload_len;
> struct vip vip = {};
> @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> __u32 csum = 0;
> int i;
>
> - if (iph + 1 > data_end)
> + iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> + iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> + if (!iph)
> return XDP_DROP;
>
> - dport = get_dport(iph + 1, data_end, iph->protocol);
> + dport = get_dport(iph + 1, iph->protocol);
> if (dport == -1)
> return XDP_DROP;
>
> @@ -108,37 +110,33 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> if (!tnl || tnl->family != AF_INET)
> return XDP_PASS;
>
> - if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
> + if (bpf_xdp_adjust_head(xdp, 0 - (int)iphdr_sz))
> return XDP_DROP;
>
> - data = (void *)(long)xdp->data;
> - data_end = (void *)(long)xdp->data_end;
> -
> - new_eth = data;
> - iph = data + sizeof(*new_eth);
> - old_eth = data + sizeof(*iph);
> -
> - if (new_eth + 1 > data_end ||
> - old_eth + 1 > data_end ||
> - iph + 1 > data_end)
> + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
> + new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + iphdr_sz + ethhdr_sz);
> + if (!new_eth)
> return XDP_DROP;
>
> + iph = (struct iphdr *)(new_eth + 1);
> + old_eth = (struct ethhdr *)(iph + 1);
> +
> set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
>
> iph->version = 4;
> - iph->ihl = sizeof(*iph) >> 2;
> + iph->ihl = iphdr_sz >> 2;
> iph->frag_off = 0;
> iph->protocol = IPPROTO_IPIP;
> iph->check = 0;
> iph->tos = 0;
> - iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
> + iph->tot_len = bpf_htons(payload_len + iphdr_sz);
> iph->daddr = tnl->daddr.v4;
> iph->saddr = tnl->saddr.v4;
> iph->ttl = 8;
>
> next_iph = (__u16 *)iph;
> #pragma clang loop unroll(full)
> - for (i = 0; i < sizeof(*iph) >> 1; i++)
> + for (i = 0; i < iphdr_sz >> 1; i++)
> csum += *next_iph++;
>
> iph->check = ~((csum & 0xffff) + (csum >> 16));
> @@ -148,22 +146,23 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> return XDP_TX;
> }
>
> -static __always_inline int handle_ipv6(struct xdp_md *xdp)
> +static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> {
> - void *data_end = (void *)(long)xdp->data_end;
> - void *data = (void *)(long)xdp->data;
> + struct bpf_dynptr new_xdp_ptr;
> struct iptnl_info *tnl;
> struct ethhdr *new_eth;
> struct ethhdr *old_eth;
> - struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
> + struct ipv6hdr *ip6h;
> __u16 payload_len;
> struct vip vip = {};
> int dport;
>
> - if (ip6h + 1 > data_end)
> + ip6h = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> + ipv6hdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> + if (!ip6h)
> return XDP_DROP;
>
> - dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
> + dport = get_dport(ip6h + 1, ip6h->nexthdr);
> if (dport == -1)
> return XDP_DROP;
>
> @@ -178,26 +177,23 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
> if (!tnl || tnl->family != AF_INET6)
> return XDP_PASS;
>
> - if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
> + if (bpf_xdp_adjust_head(xdp, 0 - (int)ipv6hdr_sz))
> return XDP_DROP;
>
> - data = (void *)(long)xdp->data;
> - data_end = (void *)(long)xdp->data_end;
> -
> - new_eth = data;
> - ip6h = data + sizeof(*new_eth);
> - old_eth = data + sizeof(*ip6h);
> -
> - if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
> - ip6h + 1 > data_end)
> + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
> + new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + ipv6hdr_sz + ethhdr_sz);
> + if (!new_eth)
> return XDP_DROP;
>
> + ip6h = (struct ipv6hdr *)(new_eth + 1);
> + old_eth = (struct ethhdr *)(ip6h + 1);
> +
> set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
>
> ip6h->version = 6;
> ip6h->priority = 0;
> memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
> - ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
> + ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + ipv6hdr_sz);
> ip6h->nexthdr = IPPROTO_IPV6;
> ip6h->hop_limit = 8;
> memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
> @@ -211,21 +207,22 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
> SEC("xdp")
> int _xdp_tx_iptunnel(struct xdp_md *xdp)
> {
> - void *data_end = (void *)(long)xdp->data_end;
> - void *data = (void *)(long)xdp->data;
> - struct ethhdr *eth = data;
> + struct bpf_dynptr ptr;
> + struct ethhdr *eth;
> __u16 h_proto;
>
> - if (eth + 1 > data_end)
> + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> + eth = bpf_dynptr_data(&ptr, 0, ethhdr_sz);
> + if (!eth)
> return XDP_DROP;
>
> h_proto = eth->h_proto;
>
> if (h_proto == bpf_htons(ETH_P_IP))
> - return handle_ipv4(xdp);
> + return handle_ipv4(xdp, &ptr);
> else if (h_proto == bpf_htons(ETH_P_IPV6))
>
> - return handle_ipv6(xdp);
> + return handle_ipv6(xdp, &ptr);
> else
> return XDP_DROP;
> }
> diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> index 6118e3ab61fc..56c9f8a3ad3d 100644
> --- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> +++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> @@ -50,6 +50,7 @@ struct linum_err {
>
> #define TCPOPT_EOL 0
> #define TCPOPT_NOP 1
> +#define TCPOPT_MSS 2
> #define TCPOPT_WINDOW 3
> #define TCPOPT_EXP 254
>
> --
> 2.30.2
>
Joanne Koong July 26, 2022, 8:06 p.m. UTC | #2
On Tue, Jul 26, 2022 at 12:44 PM Zvi Effron <zeffron@riotgames.com> wrote:
>
> On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > Test skb and xdp dynptr functionality in the following ways:
> >
> > 1) progs/test_xdp.c
> > * Change existing test to use dynptrs to parse xdp data
> >
> > There were no noticeable diferences in user + system time between
> > the original version vs. using dynptrs. Averaging the time for 10
> > runs (run using "time ./test_progs -t xdp_bpf2bpf"):
> > original version: 0.0449 sec
> > with dynptrs: 0.0429 sec
> >
> > 2) progs/test_l4lb_noinline.c
> > * Change existing test to use dynptrs to parse skb data
> >
> > There were no noticeable diferences in user + system time between
> > the original version vs. using dynptrs. Averaging the time for 10
> > runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
> > original version: 0.0502 sec
> > with dynptrs: 0.055 sec
> >
> > For number of processed verifier instructions:
> > original version: 6284 insns
> > with dynptrs: 2538 insns
> >
> > 3) progs/test_dynptr_xdp.c
> > * Add sample code for parsing tcp hdr opt lookup using dynptrs.
> > This logic is lifted from a real-world use case of packet parsing in
> > katran [0], a layer 4 load balancer
> >
> > 4) progs/dynptr_success.c
> > * Add test case "test_skb_readonly" for testing attempts at writes /
> > data slices on a prog type with read-only skb ctx.
> >
> > 5) progs/dynptr_fail.c
> > * Add test cases "skb_invalid_data_slice" and
> > "xdp_invalid_data_slice" for testing that helpers that modify the
> > underlying packet buffer automatically invalidate the associated
> > data slice.
> > * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
> > that prog types that do not support bpf_dynptr_from_skb/xdp don't
> > have access to the API.
> >
> > [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> > .../testing/selftests/bpf/prog_tests/dynptr.c | 85 ++++++++++---
> > .../selftests/bpf/prog_tests/dynptr_xdp.c | 49 ++++++++
> > .../testing/selftests/bpf/progs/dynptr_fail.c | 76 ++++++++++++
> > .../selftests/bpf/progs/dynptr_success.c | 32 +++++
> > .../selftests/bpf/progs/test_dynptr_xdp.c | 115 ++++++++++++++++++
> > .../selftests/bpf/progs/test_l4lb_noinline.c | 71 +++++------
> > tools/testing/selftests/bpf/progs/test_xdp.c | 95 +++++++--------
> > .../selftests/bpf/test_tcp_hdr_options.h | 1 +
> > 8 files changed, 416 insertions(+), 108 deletions(-)
> > create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > index bcf80b9f7c27..c40631f33c7b 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > @@ -2,6 +2,7 @@
> > /* Copyright (c) 2022 Facebook */
> >
> > #include <test_progs.h>
> > +#include <network_helpers.h>
> > #include "dynptr_fail.skel.h"
> > #include "dynptr_success.skel.h"
> >
> > @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
> > static struct {
> > const char *prog_name;
> > const char *expected_err_msg;
> > -} dynptr_tests[] = {
> > - /* failure cases */
> > +} verifier_error_tests[] = {
> > + /* these cases should trigger a verifier error */
> > {"ringbuf_missing_release1", "Unreleased reference id=1"},
> > {"ringbuf_missing_release2", "Unreleased reference id=2"},
> > {"ringbuf_missing_release_callback", "Unreleased reference id"},
> > @@ -42,11 +43,25 @@ static struct {
> > {"release_twice_callback", "arg 1 is an unacquired reference"},
> > {"dynptr_from_mem_invalid_api",
> > "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> > + {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> > + {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> > + {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> > + {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> > +};
> > +
> > +enum test_setup_type {
> > + SETUP_SYSCALL_SLEEP,
> > + SETUP_SKB_PROG,
> > +};
> >
> > - /* success cases */
> > - {"test_read_write", NULL},
> > - {"test_data_slice", NULL},
> > - {"test_ringbuf", NULL},
> > +static struct {
> > + const char *prog_name;
> > + enum test_setup_type type;
> > +} runtime_tests[] = {
> > + {"test_read_write", SETUP_SYSCALL_SLEEP},
> > + {"test_data_slice", SETUP_SYSCALL_SLEEP},
> > + {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> > + {"test_skb_readonly", SETUP_SKB_PROG},
> > };
> >
> > static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > dynptr_fail__destroy(skel);
> > }
> >
> > -static void verify_success(const char *prog_name)
> > +static void run_tests(const char *prog_name, enum test_setup_type setup_type)
> > {
> > struct dynptr_success *skel;
> > struct bpf_program *prog;
> > @@ -107,15 +122,42 @@ static void verify_success(const char *prog_name)
> > if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
> > goto cleanup;
> >
> > - link = bpf_program__attach(prog);
> > - if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> > - goto cleanup;
> > + switch (setup_type) {
> > + case SETUP_SYSCALL_SLEEP:
> > + link = bpf_program__attach(prog);
> > + if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> > + goto cleanup;
> >
> > - usleep(1);
> > + usleep(1);
> >
> > - ASSERT_EQ(skel->bss->err, 0, "err");
> > + bpf_link__destroy(link);
> > + break;
> > + case SETUP_SKB_PROG:
> > + {
> > + int prog_fd, err;
> > + char buf[64];
> > +
> > + prog_fd = bpf_program__fd(prog);
> > + if (CHECK_FAIL(prog_fd < 0))
> > + goto cleanup;
> > +
> > + LIBBPF_OPTS(bpf_test_run_opts, topts,
> > + .data_in = &pkt_v4,
> > + .data_size_in = sizeof(pkt_v4),
> > + .data_out = buf,
> > + .data_size_out = sizeof(buf),
> > + .repeat = 1,
> > + );
> >
> > - bpf_link__destroy(link);
> > + err = bpf_prog_test_run_opts(prog_fd, &topts);
> > +
> > + if (!ASSERT_OK(err, "test_run"))
> > + goto cleanup;
> > +
> > + break;
> > + }
> > + }
> > + ASSERT_EQ(skel->bss->err, 0, "err");
> >
> > cleanup:
> > dynptr_success__destroy(skel);
> > @@ -125,14 +167,17 @@ void test_dynptr(void)
> > {
> > int i;
> >
> > - for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
> > - if (!test__start_subtest(dynptr_tests[i].prog_name))
> > + for (i = 0; i < ARRAY_SIZE(verifier_error_tests); i++) {
> > + if (!test__start_subtest(verifier_error_tests[i].prog_name))
> > + continue;
> > +
> > + verify_fail(verifier_error_tests[i].prog_name,
> > + verifier_error_tests[i].expected_err_msg);
> > + }
> > + for (i = 0; i < ARRAY_SIZE(runtime_tests); i++) {
> > + if (!test__start_subtest(runtime_tests[i].prog_name))
> > continue;
> >
> > - if (dynptr_tests[i].expected_err_msg)
> > - verify_fail(dynptr_tests[i].prog_name,
> > - dynptr_tests[i].expected_err_msg);
> > - else
> > - verify_success(dynptr_tests[i].prog_name);
> > + run_tests(runtime_tests[i].prog_name, runtime_tests[i].type);
> > }
> > }
> > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > new file mode 100644
> > index 000000000000..ca775d126b60
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > @@ -0,0 +1,49 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <test_progs.h>
> > +#include <network_helpers.h>
> > +#include "test_dynptr_xdp.skel.h"
> > +#include "test_tcp_hdr_options.h"
> > +
> > +struct test_pkt {
> > + struct ipv6_packet pk6_v6;
> > + u8 options[16];
> > +} __packed;
> > +
> > +void test_dynptr_xdp(void)
> > +{
> > + struct test_dynptr_xdp *skel;
> > + char buf[128];
> > + int err;
> > +
> > + skel = test_dynptr_xdp__open_and_load();
> > + if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
> > + return;
> > +
> > + struct test_pkt pkt = {
> > + .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
> > + .pk6_v6.iph.nexthdr = IPPROTO_TCP,
> > + .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
> > + .pk6_v6.tcp.urg_ptr = 123,
> > + .pk6_v6.tcp.doff = 9, /* 16 bytes of options */
> > +
> > + .options = {
> > + TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
> > + skel->rodata->tcp_hdr_opt_kind_tpr, 6, 0, 0, 0, 9, TCPOPT_EOL
> > + },
> > + };
> > +
> > + LIBBPF_OPTS(bpf_test_run_opts, topts,
> > + .data_in = &pkt,
> > + .data_size_in = sizeof(pkt),
> > + .data_out = buf,
> > + .data_size_out = sizeof(buf),
> > + .repeat = 3,
> > + );
> > +
> > + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.xdp_ingress_v6), &topts);
> > + ASSERT_OK(err, "ipv6 test_run");
> > + ASSERT_EQ(skel->bss->server_id, 0x9000000, "server id");
> > + ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
> > +
> > + test_dynptr_xdp__destroy(skel);
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > index c1814938a5fd..4e3f853b2d02 100644
> > --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > @@ -5,6 +5,7 @@
> > #include <string.h>
> > #include <linux/bpf.h>
> > #include <bpf/bpf_helpers.h>
> > +#include <linux/if_ether.h>
> > #include "bpf_misc.h"
> >
> > char _license[] SEC("license") = "GPL";
> > @@ -622,3 +623,78 @@ int dynptr_from_mem_invalid_api(void *ctx)
> >
> > return 0;
> > }
> > +
> > +/* The data slice is invalidated whenever a helper changes packet data */
> > +SEC("?tc")
> > +int skb_invalid_data_slice(struct __sk_buff *skb)
> > +{
> > + struct bpf_dynptr ptr;
> > + struct ethhdr *hdr;
> > +
> > + bpf_dynptr_from_skb(skb, 0, &ptr);
> > + hdr = bpf_dynptr_data(&ptr, 0, sizeof(*hdr));
> > + if (!hdr)
> > + return SK_DROP;
> > +
> > + hdr->h_proto = 12;
> > +
> > + if (bpf_skb_pull_data(skb, skb->len))
> > + return SK_DROP;
> > +
> > + /* this should fail */
> > + hdr->h_proto = 1;
> > +
> > + return SK_PASS;
> > +}
> > +
> > +/* The data slice is invalidated whenever a helper changes packet data */
> > +SEC("?xdp")
> > +int xdp_invalid_data_slice(struct xdp_md *xdp)
> > +{
> > + struct bpf_dynptr ptr;
> > + struct ethhdr *hdr1, *hdr2;
> > +
> > + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > + hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> > + if (!hdr1)
> > + return XDP_DROP;
> > +
> > + hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> > + if (!hdr2)
> > + return XDP_DROP;
> > +
> > + hdr1->h_proto = 12;
> > + hdr2->h_proto = 12;
> > +
> > + if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> > + return XDP_DROP;
> > +
> > + /* this should fail */
> > + hdr2->h_proto = 1;
> > +
> > + return XDP_PASS;
> > +}
> > +
> > +/* Only supported prog type can create skb-type dynptrs */
> > +SEC("?raw_tp/sys_nanosleep")
> > +int skb_invalid_ctx(void *ctx)
> > +{
> > + struct bpf_dynptr ptr;
> > +
> > + /* this should fail */
> > + bpf_dynptr_from_skb(ctx, 0, &ptr);
> > +
> > + return 0;
> > +}
> > +
> > +/* Only supported prog type can create xdp-type dynptrs */
> > +SEC("?raw_tp/sys_nanosleep")
> > +int xdp_invalid_ctx(void *ctx)
> > +{
> > + struct bpf_dynptr ptr;
> > +
> > + /* this should fail */
> > + bpf_dynptr_from_xdp(ctx, 0, &ptr);
> > +
> > + return 0;
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
> > index a3a6103c8569..ffddd6ddc7fb 100644
> > --- a/tools/testing/selftests/bpf/progs/dynptr_success.c
> > +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
> > @@ -162,3 +162,35 @@ int test_ringbuf(void *ctx)
> > bpf_ringbuf_discard_dynptr(&ptr, 0);
> > return 0;
> > }
> > +
> > +SEC("cgroup_skb/egress")
> > +int test_skb_readonly(void *ctx)
> > +{
> > + __u8 write_data[2] = {1, 2};
> > + struct bpf_dynptr ptr;
> > + void *data;
> > + int ret;
> > +
> > + err = 1;
> > +
> > + if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> > + return 0;
> > + err++;
> > +
> > + data = bpf_dynptr_data(&ptr, 0, 1);
> > + if (data)
> > + /* it's an error if data is not NULL since cgroup skbs
> > + * are read only
> > + */
> > + return 0;
> > + err++;
> > +
> > + ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> > + /* since cgroup skbs are read only, writes should fail */
> > + if (ret != -EINVAL)
> > + return 0;
> > +
> > + err = 0;
> > +
> > + return 0;
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > new file mode 100644
> > index 000000000000..c879dfb6370a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > @@ -0,0 +1,115 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +/* This logic is lifted from a real-world use case of packet parsing, used in
> > + * the open source library katran, a layer 4 load balancer.
> > + *
> > + * This test demonstrates how to parse packet contents using dynptrs.
> > + *
> > + * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> > + */
> > +
> > +#include <string.h>
> > +#include <linux/bpf.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include <linux/tcp.h>
> > +#include <stdbool.h>
> > +#include <linux/ipv6.h>
> > +#include <linux/if_ether.h>
> > +#include "test_tcp_hdr_options.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +/* Arbitrarily picked unused value from IANA TCP Option Kind Numbers */
> > +const __u32 tcp_hdr_opt_kind_tpr = 0xB7;
>
> Should this instead be either 0xFD or 0xFE, as those are the two Kind numbers
> allocated for experiments? Using a reserved value seems suboptimal, and
> potentially risks updating one of the entries in [0] to have a double asterisk.

I used 0xB7 because that's what the katran library [1] uses, but after
reading through that iana link, using 0xFD or 0xFE as the experimental
value makes sense to me. I'll change this for v2.

[1] https://github.com/facebookincubator/katran/blob/44ac98876280b8a76e6c90bf857b7b0afe1870f1/katran/lib/bpf/balancer_consts.h#L176-L177

>
> [0]: https://www.iana.org/assignments/tcp-parameters/tcp-parameters.xhtml#tcp-parameters-1
>
> > +/* Length of the tcp header option */
> > +const __u32 tcp_hdr_opt_len_tpr = 6;
> > +/* maximum number of header options to check to lookup server_id */
> > +const __u32 tcp_hdr_opt_max_opt_checks = 15;
> > +
> > +__u32 server_id;
> > +
> > +static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
> > + __u32 *server_id)
> > +{
> > + __u8 *tcp_opt, kind, hdr_len;
> > + __u8 *data;
> > +
> > + data = bpf_dynptr_data(ptr, *off, sizeof(kind) + sizeof(hdr_len) +
> > + sizeof(*server_id));
> > + if (!data)
> > + return -1;
> > +
> > + kind = data[0];
> > +
> > + if (kind == TCPOPT_EOL)
> > + return -1;
> > +
> > + if (kind == TCPOPT_NOP) {
> > + *off += 1;
> > + /* continue to the next option */
> > + *hdr_bytes_remaining -= 1;
> > +
> > + return 0;
> > + }
> > +
> > + if (*hdr_bytes_remaining < 2)
> > + return -1;
> > +
> > + hdr_len = data[1];
> > + if (hdr_len > *hdr_bytes_remaining)
> > + return -1;
> > +
> > + if (kind == tcp_hdr_opt_kind_tpr) {
> > + if (hdr_len != tcp_hdr_opt_len_tpr)
> > + return -1;
> > +
> > + memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
> > + return 1;
> > + }
> > +
> > + *off += hdr_len;
> > + *hdr_bytes_remaining -= hdr_len;
> > +
> > + return 0;
> > +}
> > +
> > +SEC("xdp")
> > +int xdp_ingress_v6(struct xdp_md *xdp)
> > +{
> > + __u8 hdr_bytes_remaining;
> > + struct tcphdr *tcp_hdr;
> > + __u8 tcp_hdr_opt_len;
> > + int err = 0;
> > + __u32 off;
> > +
> > + struct bpf_dynptr ptr;
> > +
> > + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > +
> > + off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
> > +
> > + tcp_hdr = bpf_dynptr_data(&ptr, off, sizeof(*tcp_hdr));
> > + if (!tcp_hdr)
> > + return XDP_DROP;
> > +
> > + tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
> > + if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
> > + return XDP_DROP;
> > +
> > + hdr_bytes_remaining = tcp_hdr_opt_len;
> > +
> > + off += sizeof(struct tcphdr);
> > +
> > + /* max number of bytes of options in tcp header is 40 bytes */
> > + for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
> > + err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
> > +
> > + if (err || !hdr_bytes_remaining)
> > + break;
> > + }
> > +
> > + if (!server_id)
> > + return XDP_DROP;
> > +
> > + return XDP_PASS;
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > index c8bc0c6947aa..1fef7868ea8b 100644
> > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > @@ -230,21 +230,18 @@ static __noinline bool get_packet_dst(struct real_definition **real,
> > return true;
> > }
> >
> > -static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> > +static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
> > struct packet_description *pckt)
> > {
> > struct icmp6hdr *icmp_hdr;
> > struct ipv6hdr *ip6h;
> >
> > - icmp_hdr = data + off;
> > - if (icmp_hdr + 1 > data_end)
> > + icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*ip6h));
> > + if (!icmp_hdr)
> > return TC_ACT_SHOT;
> > if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
> > return TC_ACT_OK;
> > - off += sizeof(struct icmp6hdr);
> > - ip6h = data + off;
> > - if (ip6h + 1 > data_end)
> > - return TC_ACT_SHOT;
> > + ip6h = (struct ipv6hdr *)(icmp_hdr + 1);
> > pckt->proto = ip6h->nexthdr;
> > pckt->flags |= F_ICMP;
> > memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
> > @@ -252,22 +249,19 @@ static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> > return TC_ACT_UNSPEC;
> > }
> >
> > -static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> > +static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
> > struct packet_description *pckt)
> > {
> > struct icmphdr *icmp_hdr;
> > struct iphdr *iph;
> >
> > - icmp_hdr = data + off;
> > - if (icmp_hdr + 1 > data_end)
> > + icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*iph));
> > + if (!icmp_hdr)
> > return TC_ACT_SHOT;
> > if (icmp_hdr->type != ICMP_DEST_UNREACH ||
> > icmp_hdr->code != ICMP_FRAG_NEEDED)
> > return TC_ACT_OK;
> > - off += sizeof(struct icmphdr);
> > - iph = data + off;
> > - if (iph + 1 > data_end)
> > - return TC_ACT_SHOT;
> > + iph = (struct iphdr *)(icmp_hdr + 1);
> > if (iph->ihl != 5)
> > return TC_ACT_SHOT;
> > pckt->proto = iph->protocol;
> > @@ -277,13 +271,13 @@ static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> > return TC_ACT_UNSPEC;
> > }
> >
> > -static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
> > +static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off,
> > struct packet_description *pckt)
> > {
> > struct udphdr *udp;
> > - udp = data + off;
> >
> > - if (udp + 1 > data_end)
> > + udp = bpf_dynptr_data(skb_ptr, off, sizeof(*udp));
> > + if (!udp)
> > return false;
> >
> > if (!(pckt->flags & F_ICMP)) {
> > @@ -296,13 +290,13 @@ static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
> > return true;
> > }
> >
> > -static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
> > +static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off,
> > struct packet_description *pckt)
> > {
> > struct tcphdr *tcp;
> >
> > - tcp = data + off;
> > - if (tcp + 1 > data_end)
> > + tcp = bpf_dynptr_data(skb_ptr, off, sizeof(*tcp));
> > + if (!tcp)
> > return false;
> >
> > if (tcp->syn)
> > @@ -318,12 +312,11 @@ static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
> > return true;
> > }
> >
> > -static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > +static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
> > + struct eth_hdr *eth, __u64 off,
> > bool is_ipv6, struct __sk_buff *skb)
> > {
> > - void *pkt_start = (void *)(long)skb->data;
> > struct packet_description pckt = {};
> > - struct eth_hdr *eth = pkt_start;
> > struct bpf_tunnel_key tkey = {};
> > struct vip_stats *data_stats;
> > struct real_definition *dst;
> > @@ -344,8 +337,8 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> >
> > tkey.tunnel_ttl = 64;
> > if (is_ipv6) {
> > - ip6h = data + off;
> > - if (ip6h + 1 > data_end)
> > + ip6h = bpf_dynptr_data(skb_ptr, off, sizeof(*ip6h));
> > + if (!ip6h)
> > return TC_ACT_SHOT;
> >
> > iph_len = sizeof(struct ipv6hdr);
> > @@ -356,7 +349,7 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > if (protocol == IPPROTO_FRAGMENT) {
> > return TC_ACT_SHOT;
> > } else if (protocol == IPPROTO_ICMPV6) {
> > - action = parse_icmpv6(data, data_end, off, &pckt);
> > + action = parse_icmpv6(skb_ptr, off, &pckt);
> > if (action >= 0)
> > return action;
> > off += IPV6_PLUS_ICMP_HDR;
> > @@ -365,10 +358,8 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
> > }
> > } else {
> > - iph = data + off;
> > - if (iph + 1 > data_end)
> > - return TC_ACT_SHOT;
> > - if (iph->ihl != 5)
> > + iph = bpf_dynptr_data(skb_ptr, off, sizeof(*iph));
> > + if (!iph || iph->ihl != 5)
> > return TC_ACT_SHOT;
> >
> > protocol = iph->protocol;
> > @@ -379,7 +370,7 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > if (iph->frag_off & PCKT_FRAGMENTED)
> > return TC_ACT_SHOT;
> > if (protocol == IPPROTO_ICMP) {
> > - action = parse_icmp(data, data_end, off, &pckt);
> > + action = parse_icmp(skb_ptr, off, &pckt);
> > if (action >= 0)
> > return action;
> > off += IPV4_PLUS_ICMP_HDR;
> > @@ -391,10 +382,10 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > protocol = pckt.proto;
> >
> > if (protocol == IPPROTO_TCP) {
> > - if (!parse_tcp(data, off, data_end, &pckt))
> > + if (!parse_tcp(skb_ptr, off, &pckt))
> > return TC_ACT_SHOT;
> > } else if (protocol == IPPROTO_UDP) {
> > - if (!parse_udp(data, off, data_end, &pckt))
> > + if (!parse_udp(skb_ptr, off, &pckt))
> > return TC_ACT_SHOT;
> > } else {
> > return TC_ACT_SHOT;
> > @@ -450,20 +441,22 @@ static __noinline int process_packet(void *data, __u64 off, void *data_end,
> > SEC("tc")
> > int balancer_ingress(struct __sk_buff *ctx)
> > {
> > - void *data_end = (void *)(long)ctx->data_end;
> > - void *data = (void *)(long)ctx->data;
> > - struct eth_hdr *eth = data;
> > + struct bpf_dynptr ptr;
> > + struct eth_hdr *eth;
> > __u32 eth_proto;
> > __u32 nh_off;
> >
> > nh_off = sizeof(struct eth_hdr);
> > - if (data + nh_off > data_end)
> > +
> > + bpf_dynptr_from_skb(ctx, 0, &ptr);
> > + eth = bpf_dynptr_data(&ptr, 0, sizeof(*eth));
> > + if (!eth)
> > return TC_ACT_SHOT;
> > eth_proto = eth->eth_proto;
> > if (eth_proto == bpf_htons(ETH_P_IP))
> > - return process_packet(data, nh_off, data_end, false, ctx);
> > + return process_packet(&ptr, eth, nh_off, false, ctx);
> > else if (eth_proto == bpf_htons(ETH_P_IPV6))
> > - return process_packet(data, nh_off, data_end, true, ctx);
> > + return process_packet(&ptr, eth, nh_off, true, ctx);
> > else
> > return TC_ACT_SHOT;
> > }
> > diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c
> > index d7a9a74b7245..2272b56a8777 100644
> > --- a/tools/testing/selftests/bpf/progs/test_xdp.c
> > +++ b/tools/testing/selftests/bpf/progs/test_xdp.c
> > @@ -20,6 +20,12 @@
> > #include <bpf/bpf_endian.h>
> > #include "test_iptunnel_common.h"
> >
> > +const size_t tcphdr_sz = sizeof(struct tcphdr);
> > +const size_t udphdr_sz = sizeof(struct udphdr);
> > +const size_t ethhdr_sz = sizeof(struct ethhdr);
> > +const size_t iphdr_sz = sizeof(struct iphdr);
> > +const size_t ipv6hdr_sz = sizeof(struct ipv6hdr);
> > +
> > struct {
> > __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
> > __uint(max_entries, 256);
> > @@ -43,8 +49,7 @@ static __always_inline void count_tx(__u32 protocol)
> > *rxcnt_count += 1;
> > }
> >
> > -static __always_inline int get_dport(void *trans_data, void *data_end,
> > - __u8 protocol)
> > +static __always_inline int get_dport(void *trans_data, __u8 protocol)
> > {
> > struct tcphdr *th;
> > struct udphdr *uh;
> > @@ -52,13 +57,9 @@ static __always_inline int get_dport(void *trans_data, void *data_end,
> > switch (protocol) {
> > case IPPROTO_TCP:
> > th = (struct tcphdr *)trans_data;
> > - if (th + 1 > data_end)
> > - return -1;
> > return th->dest;
> > case IPPROTO_UDP:
> > uh = (struct udphdr *)trans_data;
> > - if (uh + 1 > data_end)
> > - return -1;
> > return uh->dest;
> > default:
> > return 0;
> > @@ -75,14 +76,13 @@ static __always_inline void set_ethhdr(struct ethhdr *new_eth,
> > new_eth->h_proto = h_proto;
> > }
> >
> > -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> > {
> > - void *data_end = (void *)(long)xdp->data_end;
> > - void *data = (void *)(long)xdp->data;
> > + struct bpf_dynptr new_xdp_ptr;
> > struct iptnl_info *tnl;
> > struct ethhdr *new_eth;
> > struct ethhdr *old_eth;
> > - struct iphdr *iph = data + sizeof(struct ethhdr);
> > + struct iphdr *iph;
> > __u16 *next_iph;
> > __u16 payload_len;
> > struct vip vip = {};
> > @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > __u32 csum = 0;
> > int i;
> >
> > - if (iph + 1 > data_end)
> > + iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > + iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> > + if (!iph)
> > return XDP_DROP;
> >
> > - dport = get_dport(iph + 1, data_end, iph->protocol);
> > + dport = get_dport(iph + 1, iph->protocol);
> > if (dport == -1)
> > return XDP_DROP;
> >
> > @@ -108,37 +110,33 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > if (!tnl || tnl->family != AF_INET)
> > return XDP_PASS;
> >
> > - if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
> > + if (bpf_xdp_adjust_head(xdp, 0 - (int)iphdr_sz))
> > return XDP_DROP;
> >
> > - data = (void *)(long)xdp->data;
> > - data_end = (void *)(long)xdp->data_end;
> > -
> > - new_eth = data;
> > - iph = data + sizeof(*new_eth);
> > - old_eth = data + sizeof(*iph);
> > -
> > - if (new_eth + 1 > data_end ||
> > - old_eth + 1 > data_end ||
> > - iph + 1 > data_end)
> > + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
> > + new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + iphdr_sz + ethhdr_sz);
> > + if (!new_eth)
> > return XDP_DROP;
> >
> > + iph = (struct iphdr *)(new_eth + 1);
> > + old_eth = (struct ethhdr *)(iph + 1);
> > +
> > set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
> >
> > iph->version = 4;
> > - iph->ihl = sizeof(*iph) >> 2;
> > + iph->ihl = iphdr_sz >> 2;
> > iph->frag_off = 0;
> > iph->protocol = IPPROTO_IPIP;
> > iph->check = 0;
> > iph->tos = 0;
> > - iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
> > + iph->tot_len = bpf_htons(payload_len + iphdr_sz);
> > iph->daddr = tnl->daddr.v4;
> > iph->saddr = tnl->saddr.v4;
> > iph->ttl = 8;
> >
> > next_iph = (__u16 *)iph;
> > #pragma clang loop unroll(full)
> > - for (i = 0; i < sizeof(*iph) >> 1; i++)
> > + for (i = 0; i < iphdr_sz >> 1; i++)
> > csum += *next_iph++;
> >
> > iph->check = ~((csum & 0xffff) + (csum >> 16));
> > @@ -148,22 +146,23 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > return XDP_TX;
> > }
> >
> > -static __always_inline int handle_ipv6(struct xdp_md *xdp)
> > +static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> > {
> > - void *data_end = (void *)(long)xdp->data_end;
> > - void *data = (void *)(long)xdp->data;
> > + struct bpf_dynptr new_xdp_ptr;
> > struct iptnl_info *tnl;
> > struct ethhdr *new_eth;
> > struct ethhdr *old_eth;
> > - struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
> > + struct ipv6hdr *ip6h;
> > __u16 payload_len;
> > struct vip vip = {};
> > int dport;
> >
> > - if (ip6h + 1 > data_end)
> > + ip6h = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > + ipv6hdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> > + if (!ip6h)
> > return XDP_DROP;
> >
> > - dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
> > + dport = get_dport(ip6h + 1, ip6h->nexthdr);
> > if (dport == -1)
> > return XDP_DROP;
> >
> > @@ -178,26 +177,23 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
> > if (!tnl || tnl->family != AF_INET6)
> > return XDP_PASS;
> >
> > - if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
> > + if (bpf_xdp_adjust_head(xdp, 0 - (int)ipv6hdr_sz))
> > return XDP_DROP;
> >
> > - data = (void *)(long)xdp->data;
> > - data_end = (void *)(long)xdp->data_end;
> > -
> > - new_eth = data;
> > - ip6h = data + sizeof(*new_eth);
> > - old_eth = data + sizeof(*ip6h);
> > -
> > - if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
> > - ip6h + 1 > data_end)
> > + bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
> > + new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + ipv6hdr_sz + ethhdr_sz);
> > + if (!new_eth)
> > return XDP_DROP;
> >
> > + ip6h = (struct ipv6hdr *)(new_eth + 1);
> > + old_eth = (struct ethhdr *)(ip6h + 1);
> > +
> > set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
> >
> > ip6h->version = 6;
> > ip6h->priority = 0;
> > memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
> > - ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
> > + ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + ipv6hdr_sz);
> > ip6h->nexthdr = IPPROTO_IPV6;
> > ip6h->hop_limit = 8;
> > memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
> > @@ -211,21 +207,22 @@ static __always_inline int handle_ipv6(struct xdp_md *xdp)
> > SEC("xdp")
> > int _xdp_tx_iptunnel(struct xdp_md *xdp)
> > {
> > - void *data_end = (void *)(long)xdp->data_end;
> > - void *data = (void *)(long)xdp->data;
> > - struct ethhdr *eth = data;
> > + struct bpf_dynptr ptr;
> > + struct ethhdr *eth;
> > __u16 h_proto;
> >
> > - if (eth + 1 > data_end)
> > + bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > + eth = bpf_dynptr_data(&ptr, 0, ethhdr_sz);
> > + if (!eth)
> > return XDP_DROP;
> >
> > h_proto = eth->h_proto;
> >
> > if (h_proto == bpf_htons(ETH_P_IP))
> > - return handle_ipv4(xdp);
> > + return handle_ipv4(xdp, &ptr);
> > else if (h_proto == bpf_htons(ETH_P_IPV6))
> >
> > - return handle_ipv6(xdp);
> > + return handle_ipv6(xdp, &ptr);
> > else
> > return XDP_DROP;
> > }
> > diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> > index 6118e3ab61fc..56c9f8a3ad3d 100644
> > --- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> > +++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
> > @@ -50,6 +50,7 @@ struct linum_err {
> >
> > #define TCPOPT_EOL 0
> > #define TCPOPT_NOP 1
> > +#define TCPOPT_MSS 2
> > #define TCPOPT_WINDOW 3
> > #define TCPOPT_EXP 254
> >
> > --
> > 2.30.2
> >
Andrii Nakryiko Aug. 1, 2022, 5:58 p.m. UTC | #3
On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> Test skb and xdp dynptr functionality in the following ways:
>
> 1) progs/test_xdp.c
>    * Change existing test to use dynptrs to parse xdp data
>
>      There were no noticeable diferences in user + system time between
>      the original version vs. using dynptrs. Averaging the time for 10
>      runs (run using "time ./test_progs -t xdp_bpf2bpf"):
>          original version: 0.0449 sec
>          with dynptrs: 0.0429 sec
>
> 2) progs/test_l4lb_noinline.c
>    * Change existing test to use dynptrs to parse skb data
>
>      There were no noticeable diferences in user + system time between
>      the original version vs. using dynptrs. Averaging the time for 10
>      runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
>          original version: 0.0502 sec
>          with dynptrs: 0.055 sec
>
>      For number of processed verifier instructions:
>          original version: 6284 insns
>          with dynptrs: 2538 insns
>
> 3) progs/test_dynptr_xdp.c
>    * Add sample code for parsing tcp hdr opt lookup using dynptrs.
>      This logic is lifted from a real-world use case of packet parsing in
>      katran [0], a layer 4 load balancer
>
> 4) progs/dynptr_success.c
>    * Add test case "test_skb_readonly" for testing attempts at writes /
>      data slices on a prog type with read-only skb ctx.
>
> 5) progs/dynptr_fail.c
>    * Add test cases "skb_invalid_data_slice" and
>      "xdp_invalid_data_slice" for testing that helpers that modify the
>      underlying packet buffer automatically invalidate the associated
>      data slice.
>    * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
>      that prog types that do not support bpf_dynptr_from_skb/xdp don't
>      have access to the API.
>
> [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
>  .../testing/selftests/bpf/prog_tests/dynptr.c |  85 ++++++++++---
>  .../selftests/bpf/prog_tests/dynptr_xdp.c     |  49 ++++++++
>  .../testing/selftests/bpf/progs/dynptr_fail.c |  76 ++++++++++++
>  .../selftests/bpf/progs/dynptr_success.c      |  32 +++++
>  .../selftests/bpf/progs/test_dynptr_xdp.c     | 115 ++++++++++++++++++
>  .../selftests/bpf/progs/test_l4lb_noinline.c  |  71 +++++------
>  tools/testing/selftests/bpf/progs/test_xdp.c  |  95 +++++++--------
>  .../selftests/bpf/test_tcp_hdr_options.h      |   1 +
>  8 files changed, 416 insertions(+), 108 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
>  create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
>
> diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> index bcf80b9f7c27..c40631f33c7b 100644
> --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> @@ -2,6 +2,7 @@
>  /* Copyright (c) 2022 Facebook */
>
>  #include <test_progs.h>
> +#include <network_helpers.h>
>  #include "dynptr_fail.skel.h"
>  #include "dynptr_success.skel.h"
>
> @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
>  static struct {
>         const char *prog_name;
>         const char *expected_err_msg;
> -} dynptr_tests[] = {
> -       /* failure cases */
> +} verifier_error_tests[] = {
> +       /* these cases should trigger a verifier error */
>         {"ringbuf_missing_release1", "Unreleased reference id=1"},
>         {"ringbuf_missing_release2", "Unreleased reference id=2"},
>         {"ringbuf_missing_release_callback", "Unreleased reference id"},
> @@ -42,11 +43,25 @@ static struct {
>         {"release_twice_callback", "arg 1 is an unacquired reference"},
>         {"dynptr_from_mem_invalid_api",
>                 "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> +       {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> +       {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> +       {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> +       {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> +};
> +
> +enum test_setup_type {
> +       SETUP_SYSCALL_SLEEP,
> +       SETUP_SKB_PROG,
> +};
>
> -       /* success cases */
> -       {"test_read_write", NULL},
> -       {"test_data_slice", NULL},
> -       {"test_ringbuf", NULL},
> +static struct {
> +       const char *prog_name;
> +       enum test_setup_type type;
> +} runtime_tests[] = {
> +       {"test_read_write", SETUP_SYSCALL_SLEEP},
> +       {"test_data_slice", SETUP_SYSCALL_SLEEP},
> +       {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> +       {"test_skb_readonly", SETUP_SKB_PROG},

nit: wouldn't it be better to add test_setup_type to dynptr_tests (and
keep fail and success cases together)? It's conceivable that you might
want different setups to test different error conditions, right?

>  };
>
>  static void verify_fail(const char *prog_name, const char *expected_err_msg)
> @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
>         dynptr_fail__destroy(skel);
>  }
>
> -static void verify_success(const char *prog_name)
> +static void run_tests(const char *prog_name, enum test_setup_type setup_type)
>  {
>         struct dynptr_success *skel;
>         struct bpf_program *prog;
> @@ -107,15 +122,42 @@ static void verify_success(const char *prog_name)
>         if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
>                 goto cleanup;
>
> -       link = bpf_program__attach(prog);
> -       if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> -               goto cleanup;
> +       switch (setup_type) {
> +       case SETUP_SYSCALL_SLEEP:
> +               link = bpf_program__attach(prog);
> +               if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> +                       goto cleanup;
>
> -       usleep(1);
> +               usleep(1);
>
> -       ASSERT_EQ(skel->bss->err, 0, "err");
> +               bpf_link__destroy(link);
> +               break;
> +       case SETUP_SKB_PROG:
> +       {
> +               int prog_fd, err;
> +               char buf[64];
> +
> +               prog_fd = bpf_program__fd(prog);
> +               if (CHECK_FAIL(prog_fd < 0))

please don't use CHECK and especially CHECK_FAIL

> +                       goto cleanup;
> +
> +               LIBBPF_OPTS(bpf_test_run_opts, topts,
> +                           .data_in = &pkt_v4,
> +                           .data_size_in = sizeof(pkt_v4),
> +                           .data_out = buf,
> +                           .data_size_out = sizeof(buf),
> +                           .repeat = 1,
> +               );

nit: LIBBPF_OPTS declares variable, so should be part of variable
declaration block

>
> -       bpf_link__destroy(link);
> +               err = bpf_prog_test_run_opts(prog_fd, &topts);
> +
> +               if (!ASSERT_OK(err, "test_run"))
> +                       goto cleanup;
> +
> +               break;
> +       }
> +       }
> +       ASSERT_EQ(skel->bss->err, 0, "err");
>
>  cleanup:
>         dynptr_success__destroy(skel);
> @@ -125,14 +167,17 @@ void test_dynptr(void)
>  {
>         int i;
>
> -       for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
> -               if (!test__start_subtest(dynptr_tests[i].prog_name))
> +       for (i = 0; i < ARRAY_SIZE(verifier_error_tests); i++) {
> +               if (!test__start_subtest(verifier_error_tests[i].prog_name))
> +                       continue;
> +
> +               verify_fail(verifier_error_tests[i].prog_name,
> +                           verifier_error_tests[i].expected_err_msg);
> +       }
> +       for (i = 0; i < ARRAY_SIZE(runtime_tests); i++) {
> +               if (!test__start_subtest(runtime_tests[i].prog_name))
>                         continue;
>
> -               if (dynptr_tests[i].expected_err_msg)
> -                       verify_fail(dynptr_tests[i].prog_name,
> -                                   dynptr_tests[i].expected_err_msg);
> -               else
> -                       verify_success(dynptr_tests[i].prog_name);
> +               run_tests(runtime_tests[i].prog_name, runtime_tests[i].type);
>         }
>  }
> diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> new file mode 100644
> index 000000000000..ca775d126b60
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> @@ -0,0 +1,49 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <test_progs.h>
> +#include <network_helpers.h>
> +#include "test_dynptr_xdp.skel.h"
> +#include "test_tcp_hdr_options.h"
> +
> +struct test_pkt {
> +       struct ipv6_packet pk6_v6;
> +       u8 options[16];
> +} __packed;
> +
> +void test_dynptr_xdp(void)
> +{
> +       struct test_dynptr_xdp *skel;
> +       char buf[128];
> +       int err;
> +
> +       skel = test_dynptr_xdp__open_and_load();
> +       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
> +               return;
> +
> +       struct test_pkt pkt = {
> +               .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
> +               .pk6_v6.iph.nexthdr = IPPROTO_TCP,
> +               .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
> +               .pk6_v6.tcp.urg_ptr = 123,
> +               .pk6_v6.tcp.doff = 9, /* 16 bytes of options */
> +
> +               .options = {
> +                       TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
> +                       skel->rodata->tcp_hdr_opt_kind_tpr, 6, 0, 0, 0, 9, TCPOPT_EOL
> +               },
> +       };
> +
> +       LIBBPF_OPTS(bpf_test_run_opts, topts,
> +                   .data_in = &pkt,
> +                   .data_size_in = sizeof(pkt),
> +                   .data_out = buf,
> +                   .data_size_out = sizeof(buf),
> +                   .repeat = 3,
> +       );
> +

for topts and pkt, they should be up above with other variables
(unless we want to break off from kernel code style, which I didn't
think we want)

> +       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.xdp_ingress_v6), &topts);
> +       ASSERT_OK(err, "ipv6 test_run");
> +       ASSERT_EQ(skel->bss->server_id, 0x9000000, "server id");
> +       ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
> +
> +       test_dynptr_xdp__destroy(skel);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> index c1814938a5fd..4e3f853b2d02 100644
> --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
> +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> @@ -5,6 +5,7 @@
>  #include <string.h>
>  #include <linux/bpf.h>
>  #include <bpf/bpf_helpers.h>
> +#include <linux/if_ether.h>
>  #include "bpf_misc.h"
>
>  char _license[] SEC("license") = "GPL";
> @@ -622,3 +623,78 @@ int dynptr_from_mem_invalid_api(void *ctx)
>
>         return 0;
>  }
> +
> +/* The data slice is invalidated whenever a helper changes packet data */
> +SEC("?tc")
> +int skb_invalid_data_slice(struct __sk_buff *skb)
> +{
> +       struct bpf_dynptr ptr;
> +       struct ethhdr *hdr;
> +
> +       bpf_dynptr_from_skb(skb, 0, &ptr);
> +       hdr = bpf_dynptr_data(&ptr, 0, sizeof(*hdr));
> +       if (!hdr)
> +               return SK_DROP;
> +
> +       hdr->h_proto = 12;
> +
> +       if (bpf_skb_pull_data(skb, skb->len))
> +               return SK_DROP;
> +
> +       /* this should fail */
> +       hdr->h_proto = 1;
> +
> +       return SK_PASS;
> +}
> +
> +/* The data slice is invalidated whenever a helper changes packet data */
> +SEC("?xdp")
> +int xdp_invalid_data_slice(struct xdp_md *xdp)
> +{
> +       struct bpf_dynptr ptr;
> +       struct ethhdr *hdr1, *hdr2;
> +
> +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> +       hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> +       if (!hdr1)
> +               return XDP_DROP;
> +
> +       hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> +       if (!hdr2)
> +               return XDP_DROP;
> +
> +       hdr1->h_proto = 12;
> +       hdr2->h_proto = 12;
> +
> +       if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> +               return XDP_DROP;
> +
> +       /* this should fail */
> +       hdr2->h_proto = 1;

is there something special about having both hdr1 and hdr2? Wouldn't
this test work with just single hdr pointer?

> +
> +       return XDP_PASS;
> +}
> +
> +/* Only supported prog type can create skb-type dynptrs */

[...]

> +       err = 1;
> +
> +       if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> +               return 0;
> +       err++;
> +
> +       data = bpf_dynptr_data(&ptr, 0, 1);
> +       if (data)
> +               /* it's an error if data is not NULL since cgroup skbs
> +                * are read only
> +                */
> +               return 0;
> +       err++;
> +
> +       ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> +       /* since cgroup skbs are read only, writes should fail */
> +       if (ret != -EINVAL)
> +               return 0;
> +
> +       err = 0;

hm, if data is NULL you'll still report success if bpf_dynptr_write
returns 0 or any other error but -EINVAL... The logic is a bit unclear
here...

> +
> +       return 0;
> +}
> diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> new file mode 100644
> index 000000000000..c879dfb6370a
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> @@ -0,0 +1,115 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +/* This logic is lifted from a real-world use case of packet parsing, used in
> + * the open source library katran, a layer 4 load balancer.
> + *
> + * This test demonstrates how to parse packet contents using dynptrs.
> + *
> + * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> + */
> +
> +#include <string.h>
> +#include <linux/bpf.h>
> +#include <bpf/bpf_helpers.h>
> +#include <linux/tcp.h>
> +#include <stdbool.h>
> +#include <linux/ipv6.h>
> +#include <linux/if_ether.h>
> +#include "test_tcp_hdr_options.h"
> +
> +char _license[] SEC("license") = "GPL";
> +
> +/* Arbitrarily picked unused value from IANA TCP Option Kind Numbers */
> +const __u32 tcp_hdr_opt_kind_tpr = 0xB7;
> +/* Length of the tcp header option */
> +const __u32 tcp_hdr_opt_len_tpr = 6;
> +/* maximum number of header options to check to lookup server_id */
> +const __u32 tcp_hdr_opt_max_opt_checks = 15;
> +
> +__u32 server_id;
> +
> +static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
> +                        __u32 *server_id)
> +{
> +       __u8 *tcp_opt, kind, hdr_len;
> +       __u8 *data;
> +
> +       data = bpf_dynptr_data(ptr, *off, sizeof(kind) + sizeof(hdr_len) +
> +                              sizeof(*server_id));
> +       if (!data)
> +               return -1;
> +
> +       kind = data[0];
> +
> +       if (kind == TCPOPT_EOL)
> +               return -1;
> +
> +       if (kind == TCPOPT_NOP) {
> +               *off += 1;
> +               /* continue to the next option */
> +               *hdr_bytes_remaining -= 1;
> +
> +               return 0;
> +       }
> +
> +       if (*hdr_bytes_remaining < 2)
> +               return -1;
> +
> +       hdr_len = data[1];
> +       if (hdr_len > *hdr_bytes_remaining)
> +               return -1;
> +
> +       if (kind == tcp_hdr_opt_kind_tpr) {
> +               if (hdr_len != tcp_hdr_opt_len_tpr)
> +                       return -1;
> +
> +               memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));

this implicitly relies on compiler inlining memcpy, let's use
__builtint_memcpy() here instead to set a good example?

> +               return 1;
> +       }
> +
> +       *off += hdr_len;
> +       *hdr_bytes_remaining -= hdr_len;
> +
> +       return 0;
> +}
> +
> +SEC("xdp")
> +int xdp_ingress_v6(struct xdp_md *xdp)
> +{
> +       __u8 hdr_bytes_remaining;
> +       struct tcphdr *tcp_hdr;
> +       __u8 tcp_hdr_opt_len;
> +       int err = 0;
> +       __u32 off;
> +
> +       struct bpf_dynptr ptr;
> +
> +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> +
> +       off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
> +
> +       tcp_hdr = bpf_dynptr_data(&ptr, off, sizeof(*tcp_hdr));
> +       if (!tcp_hdr)
> +               return XDP_DROP;
> +
> +       tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
> +       if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
> +               return XDP_DROP;
> +
> +       hdr_bytes_remaining = tcp_hdr_opt_len;
> +
> +       off += sizeof(struct tcphdr);
> +
> +       /* max number of bytes of options in tcp header is 40 bytes */
> +       for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
> +               err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
> +
> +               if (err || !hdr_bytes_remaining)
> +                       break;
> +       }
> +
> +       if (!server_id)
> +               return XDP_DROP;
> +
> +       return XDP_PASS;
> +}

I'm not a networking BPF expert, but the logic of packet parsing here
looks pretty clean! Would it be possible to also backport original
code with data and data_end, both for testing but also to be able to
compare and contrast dynptr vs data/data_end approaches?


> diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> index c8bc0c6947aa..1fef7868ea8b 100644
> --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> @@ -230,21 +230,18 @@ static __noinline bool get_packet_dst(struct real_definition **real,
>         return true;
>  }
>
> -static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> +static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
>                                    struct packet_description *pckt)
>  {
>         struct icmp6hdr *icmp_hdr;
>         struct ipv6hdr *ip6h;
>
> -       icmp_hdr = data + off;
> -       if (icmp_hdr + 1 > data_end)
> +       icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*ip6h));
> +       if (!icmp_hdr)
>                 return TC_ACT_SHOT;
>         if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
>                 return TC_ACT_OK;

previously you can still TC_ACT_OK if it's ICMPV6_PKT_TOOBIG even if
packet size is < sizeof(*icmp_hdr) + sizeof(*ip6h), which might have
been a bug, but current logic will enforce that packet is at least
sizeof(*icmp_hdr) + sizeof(*ip6h). Is that a problem?

> -       off += sizeof(struct icmp6hdr);
> -       ip6h = data + off;
> -       if (ip6h + 1 > data_end)
> -               return TC_ACT_SHOT;
> +       ip6h = (struct ipv6hdr *)(icmp_hdr + 1);
>         pckt->proto = ip6h->nexthdr;
>         pckt->flags |= F_ICMP;
>         memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
> @@ -252,22 +249,19 @@ static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
>         return TC_ACT_UNSPEC;
>  }
>
> -static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> +static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
>                                  struct packet_description *pckt)
>  {
>         struct icmphdr *icmp_hdr;
>         struct iphdr *iph;
>
> -       icmp_hdr = data + off;
> -       if (icmp_hdr + 1 > data_end)
> +       icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*iph));
> +       if (!icmp_hdr)
>                 return TC_ACT_SHOT;
>         if (icmp_hdr->type != ICMP_DEST_UNREACH ||
>             icmp_hdr->code != ICMP_FRAG_NEEDED)
>                 return TC_ACT_OK;

similarly here, short packets can still be TC_ACT_OK in some
circumstances, while with dynptr they will be shot down early on. Not
saying this is wrong or bad, just bringing this up for you and others
to chime in if it's an ok change

> -       off += sizeof(struct icmphdr);
> -       iph = data + off;
> -       if (iph + 1 > data_end)
> -               return TC_ACT_SHOT;
> +       iph = (struct iphdr *)(icmp_hdr + 1);
>         if (iph->ihl != 5)
>                 return TC_ACT_SHOT;
>         pckt->proto = iph->protocol;
> @@ -277,13 +271,13 @@ static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
>         return TC_ACT_UNSPEC;
>  }

[...]

> -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
>  {
> -       void *data_end = (void *)(long)xdp->data_end;
> -       void *data = (void *)(long)xdp->data;
> +       struct bpf_dynptr new_xdp_ptr;
>         struct iptnl_info *tnl;
>         struct ethhdr *new_eth;
>         struct ethhdr *old_eth;
> -       struct iphdr *iph = data + sizeof(struct ethhdr);
> +       struct iphdr *iph;
>         __u16 *next_iph;
>         __u16 payload_len;
>         struct vip vip = {};
> @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
>         __u32 csum = 0;
>         int i;
>
> -       if (iph + 1 > data_end)
> +       iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> +                             iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));

tcphdr_sz (20) is always bigger than udphdr_sz (8), so just use the
bigger one here? Though again, for UDP packet it might be a bit too
pessimistic to reject small packets?

> +       if (!iph)
>                 return XDP_DROP;
>
> -       dport = get_dport(iph + 1, data_end, iph->protocol);
> +       dport = get_dport(iph + 1, iph->protocol);
>         if (dport == -1)
>                 return XDP_DROP;

[...]

> -static __always_inline int handle_ipv6(struct xdp_md *xdp)
> +static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
>  {
> -       void *data_end = (void *)(long)xdp->data_end;
> -       void *data = (void *)(long)xdp->data;
> +       struct bpf_dynptr new_xdp_ptr;
>         struct iptnl_info *tnl;
>         struct ethhdr *new_eth;
>         struct ethhdr *old_eth;
> -       struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
> +       struct ipv6hdr *ip6h;
>         __u16 payload_len;
>         struct vip vip = {};
>         int dport;
>
> -       if (ip6h + 1 > data_end)
> +       ip6h = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> +                              ipv6hdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));

ditto, there is no dynamism here, verifier actually enforces that this
value is statically known, I think this example will create false
assumptions if written this way

> +       if (!ip6h)
>                 return XDP_DROP;
>
> -       dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
> +       dport = get_dport(ip6h + 1, ip6h->nexthdr);
>         if (dport == -1)
>                 return XDP_DROP;
>

[...]
Alexei Starovoitov Aug. 1, 2022, 7:12 p.m. UTC | #4
On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
>
> -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
>  {
> -       void *data_end = (void *)(long)xdp->data_end;
> -       void *data = (void *)(long)xdp->data;
> +       struct bpf_dynptr new_xdp_ptr;
>         struct iptnl_info *tnl;
>         struct ethhdr *new_eth;
>         struct ethhdr *old_eth;
> -       struct iphdr *iph = data + sizeof(struct ethhdr);
> +       struct iphdr *iph;
>         __u16 *next_iph;
>         __u16 payload_len;
>         struct vip vip = {};
> @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
>         __u32 csum = 0;
>         int i;
>
> -       if (iph + 1 > data_end)
> +       iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> +                             iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> +       if (!iph)
>                 return XDP_DROP;

dynptr based xdp/skb access looks neat.
Maybe in addition to bpf_dynptr_data() we can add helper(s)
that return skb/xdp_md from dynptr?
This way the code will be passing dynptr only and there will
be no need to pass around 'struct xdp_md *xdp' (like this function).

Separately please keep the existing tests instead of converting them.
Either ifdef data/data_end vs dynptr style or copy paste
the whole test into a new .c file. Whichever is cleaner.
Joanne Koong Aug. 2, 2022, 10:21 p.m. UTC | #5
On Mon, Aug 1, 2022 at 12:12 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> >
> > -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> >  {
> > -       void *data_end = (void *)(long)xdp->data_end;
> > -       void *data = (void *)(long)xdp->data;
> > +       struct bpf_dynptr new_xdp_ptr;
> >         struct iptnl_info *tnl;
> >         struct ethhdr *new_eth;
> >         struct ethhdr *old_eth;
> > -       struct iphdr *iph = data + sizeof(struct ethhdr);
> > +       struct iphdr *iph;
> >         __u16 *next_iph;
> >         __u16 payload_len;
> >         struct vip vip = {};
> > @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> >         __u32 csum = 0;
> >         int i;
> >
> > -       if (iph + 1 > data_end)
> > +       iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > +                             iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> > +       if (!iph)
> >                 return XDP_DROP;
>
> dynptr based xdp/skb access looks neat.
> Maybe in addition to bpf_dynptr_data() we can add helper(s)
> that return skb/xdp_md from dynptr?
> This way the code will be passing dynptr only and there will
> be no need to pass around 'struct xdp_md *xdp' (like this function).

Great idea! I'll add this to v2.

>
> Separately please keep the existing tests instead of converting them.
> Either ifdef data/data_end vs dynptr style or copy paste
> the whole test into a new .c file. Whichever is cleaner.

Will do for v2.
Joanne Koong Aug. 2, 2022, 10:56 p.m. UTC | #6
On Mon, Aug 1, 2022 at 10:58 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > Test skb and xdp dynptr functionality in the following ways:
> >
> > 1) progs/test_xdp.c
> >    * Change existing test to use dynptrs to parse xdp data
> >
> >      There were no noticeable diferences in user + system time between
> >      the original version vs. using dynptrs. Averaging the time for 10
> >      runs (run using "time ./test_progs -t xdp_bpf2bpf"):
> >          original version: 0.0449 sec
> >          with dynptrs: 0.0429 sec
> >
> > 2) progs/test_l4lb_noinline.c
> >    * Change existing test to use dynptrs to parse skb data
> >
> >      There were no noticeable diferences in user + system time between
> >      the original version vs. using dynptrs. Averaging the time for 10
> >      runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
> >          original version: 0.0502 sec
> >          with dynptrs: 0.055 sec
> >
> >      For number of processed verifier instructions:
> >          original version: 6284 insns
> >          with dynptrs: 2538 insns
> >
> > 3) progs/test_dynptr_xdp.c
> >    * Add sample code for parsing tcp hdr opt lookup using dynptrs.
> >      This logic is lifted from a real-world use case of packet parsing in
> >      katran [0], a layer 4 load balancer
> >
> > 4) progs/dynptr_success.c
> >    * Add test case "test_skb_readonly" for testing attempts at writes /
> >      data slices on a prog type with read-only skb ctx.
> >
> > 5) progs/dynptr_fail.c
> >    * Add test cases "skb_invalid_data_slice" and
> >      "xdp_invalid_data_slice" for testing that helpers that modify the
> >      underlying packet buffer automatically invalidate the associated
> >      data slice.
> >    * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
> >      that prog types that do not support bpf_dynptr_from_skb/xdp don't
> >      have access to the API.
> >
> > [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> >  .../testing/selftests/bpf/prog_tests/dynptr.c |  85 ++++++++++---
> >  .../selftests/bpf/prog_tests/dynptr_xdp.c     |  49 ++++++++
> >  .../testing/selftests/bpf/progs/dynptr_fail.c |  76 ++++++++++++
> >  .../selftests/bpf/progs/dynptr_success.c      |  32 +++++
> >  .../selftests/bpf/progs/test_dynptr_xdp.c     | 115 ++++++++++++++++++
> >  .../selftests/bpf/progs/test_l4lb_noinline.c  |  71 +++++------
> >  tools/testing/selftests/bpf/progs/test_xdp.c  |  95 +++++++--------
> >  .../selftests/bpf/test_tcp_hdr_options.h      |   1 +
> >  8 files changed, 416 insertions(+), 108 deletions(-)
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> >
> > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > index bcf80b9f7c27..c40631f33c7b 100644
> > --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > @@ -2,6 +2,7 @@
> >  /* Copyright (c) 2022 Facebook */
> >
> >  #include <test_progs.h>
> > +#include <network_helpers.h>
> >  #include "dynptr_fail.skel.h"
> >  #include "dynptr_success.skel.h"
> >
> > @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
> >  static struct {
> >         const char *prog_name;
> >         const char *expected_err_msg;
> > -} dynptr_tests[] = {
> > -       /* failure cases */
> > +} verifier_error_tests[] = {
> > +       /* these cases should trigger a verifier error */
> >         {"ringbuf_missing_release1", "Unreleased reference id=1"},
> >         {"ringbuf_missing_release2", "Unreleased reference id=2"},
> >         {"ringbuf_missing_release_callback", "Unreleased reference id"},
> > @@ -42,11 +43,25 @@ static struct {
> >         {"release_twice_callback", "arg 1 is an unacquired reference"},
> >         {"dynptr_from_mem_invalid_api",
> >                 "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> > +       {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> > +       {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> > +       {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> > +       {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> > +};
> > +
> > +enum test_setup_type {
> > +       SETUP_SYSCALL_SLEEP,
> > +       SETUP_SKB_PROG,
> > +};
> >
> > -       /* success cases */
> > -       {"test_read_write", NULL},
> > -       {"test_data_slice", NULL},
> > -       {"test_ringbuf", NULL},
> > +static struct {
> > +       const char *prog_name;
> > +       enum test_setup_type type;
> > +} runtime_tests[] = {
> > +       {"test_read_write", SETUP_SYSCALL_SLEEP},
> > +       {"test_data_slice", SETUP_SYSCALL_SLEEP},
> > +       {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> > +       {"test_skb_readonly", SETUP_SKB_PROG},
>
> nit: wouldn't it be better to add test_setup_type to dynptr_tests (and
> keep fail and success cases together)? It's conceivable that you might
> want different setups to test different error conditions, right?

Yeah! I originally separated it out because the success tests don't
have an error message while the fail ones do, and fail ones don't have
a setup (I don't think we'll need any custom userspace setup for those
since we're checking for verifier failures at prog load time) and the
success ones do. But I can combine them into 1 so that it's simpler. I
will do this in v2.
>
> >  };
> >
> >  static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
> >         dynptr_fail__destroy(skel);
> >  }
> >
> > -static void verify_success(const char *prog_name)
> > +static void run_tests(const char *prog_name, enum test_setup_type setup_type)
> >  {
> >         struct dynptr_success *skel;
> >         struct bpf_program *prog;
> > @@ -107,15 +122,42 @@ static void verify_success(const char *prog_name)
> >         if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
> >                 goto cleanup;
> >
> > -       link = bpf_program__attach(prog);
> > -       if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> > -               goto cleanup;
> > +       switch (setup_type) {
> > +       case SETUP_SYSCALL_SLEEP:
> > +               link = bpf_program__attach(prog);
> > +               if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
> > +                       goto cleanup;
> >
> > -       usleep(1);
> > +               usleep(1);
> >
> > -       ASSERT_EQ(skel->bss->err, 0, "err");
> > +               bpf_link__destroy(link);
> > +               break;
> > +       case SETUP_SKB_PROG:
> > +       {
> > +               int prog_fd, err;
> > +               char buf[64];
> > +
> > +               prog_fd = bpf_program__fd(prog);
> > +               if (CHECK_FAIL(prog_fd < 0))
>
> please don't use CHECK and especially CHECK_FAIL
>
> > +                       goto cleanup;
> > +
> > +               LIBBPF_OPTS(bpf_test_run_opts, topts,
> > +                           .data_in = &pkt_v4,
> > +                           .data_size_in = sizeof(pkt_v4),
> > +                           .data_out = buf,
> > +                           .data_size_out = sizeof(buf),
> > +                           .repeat = 1,
> > +               );
>
> nit: LIBBPF_OPTS declares variable, so should be part of variable
> declaration block
>
> >
> > -       bpf_link__destroy(link);
> > +               err = bpf_prog_test_run_opts(prog_fd, &topts);
> > +
> > +               if (!ASSERT_OK(err, "test_run"))
> > +                       goto cleanup;
> > +
> > +               break;
> > +       }
> > +       }
> > +       ASSERT_EQ(skel->bss->err, 0, "err");
> >
> >  cleanup:
> >         dynptr_success__destroy(skel);
> > @@ -125,14 +167,17 @@ void test_dynptr(void)
> >  {
> >         int i;
> >
> > -       for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
> > -               if (!test__start_subtest(dynptr_tests[i].prog_name))
> > +       for (i = 0; i < ARRAY_SIZE(verifier_error_tests); i++) {
> > +               if (!test__start_subtest(verifier_error_tests[i].prog_name))
> > +                       continue;
> > +
> > +               verify_fail(verifier_error_tests[i].prog_name,
> > +                           verifier_error_tests[i].expected_err_msg);
> > +       }
> > +       for (i = 0; i < ARRAY_SIZE(runtime_tests); i++) {
> > +               if (!test__start_subtest(runtime_tests[i].prog_name))
> >                         continue;
> >
> > -               if (dynptr_tests[i].expected_err_msg)
> > -                       verify_fail(dynptr_tests[i].prog_name,
> > -                                   dynptr_tests[i].expected_err_msg);
> > -               else
> > -                       verify_success(dynptr_tests[i].prog_name);
> > +               run_tests(runtime_tests[i].prog_name, runtime_tests[i].type);
> >         }
> >  }
> > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > new file mode 100644
> > index 000000000000..ca775d126b60
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > @@ -0,0 +1,49 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <test_progs.h>
> > +#include <network_helpers.h>
> > +#include "test_dynptr_xdp.skel.h"
> > +#include "test_tcp_hdr_options.h"
> > +
> > +struct test_pkt {
> > +       struct ipv6_packet pk6_v6;
> > +       u8 options[16];
> > +} __packed;
> > +
> > +void test_dynptr_xdp(void)
> > +{
> > +       struct test_dynptr_xdp *skel;
> > +       char buf[128];
> > +       int err;
> > +
> > +       skel = test_dynptr_xdp__open_and_load();
> > +       if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
> > +               return;
> > +
> > +       struct test_pkt pkt = {
> > +               .pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
> > +               .pk6_v6.iph.nexthdr = IPPROTO_TCP,
> > +               .pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
> > +               .pk6_v6.tcp.urg_ptr = 123,
> > +               .pk6_v6.tcp.doff = 9, /* 16 bytes of options */
> > +
> > +               .options = {
> > +                       TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
> > +                       skel->rodata->tcp_hdr_opt_kind_tpr, 6, 0, 0, 0, 9, TCPOPT_EOL
> > +               },
> > +       };
> > +
> > +       LIBBPF_OPTS(bpf_test_run_opts, topts,
> > +                   .data_in = &pkt,
> > +                   .data_size_in = sizeof(pkt),
> > +                   .data_out = buf,
> > +                   .data_size_out = sizeof(buf),
> > +                   .repeat = 3,
> > +       );
> > +
>
> for topts and pkt, they should be up above with other variables
> (unless we want to break off from kernel code style, which I didn't
> think we want)
>
> > +       err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.xdp_ingress_v6), &topts);
> > +       ASSERT_OK(err, "ipv6 test_run");
> > +       ASSERT_EQ(skel->bss->server_id, 0x9000000, "server id");
> > +       ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
> > +
> > +       test_dynptr_xdp__destroy(skel);
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > index c1814938a5fd..4e3f853b2d02 100644
> > --- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > +++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
> > @@ -5,6 +5,7 @@
> >  #include <string.h>
> >  #include <linux/bpf.h>
> >  #include <bpf/bpf_helpers.h>
> > +#include <linux/if_ether.h>
> >  #include "bpf_misc.h"
> >
> >  char _license[] SEC("license") = "GPL";
> > @@ -622,3 +623,78 @@ int dynptr_from_mem_invalid_api(void *ctx)
> >
> >         return 0;
> >  }
> > +
> > +/* The data slice is invalidated whenever a helper changes packet data */
> > +SEC("?tc")
> > +int skb_invalid_data_slice(struct __sk_buff *skb)
> > +{
> > +       struct bpf_dynptr ptr;
> > +       struct ethhdr *hdr;
> > +
> > +       bpf_dynptr_from_skb(skb, 0, &ptr);
> > +       hdr = bpf_dynptr_data(&ptr, 0, sizeof(*hdr));
> > +       if (!hdr)
> > +               return SK_DROP;
> > +
> > +       hdr->h_proto = 12;
> > +
> > +       if (bpf_skb_pull_data(skb, skb->len))
> > +               return SK_DROP;
> > +
> > +       /* this should fail */
> > +       hdr->h_proto = 1;
> > +
> > +       return SK_PASS;
> > +}
> > +
> > +/* The data slice is invalidated whenever a helper changes packet data */
> > +SEC("?xdp")
> > +int xdp_invalid_data_slice(struct xdp_md *xdp)
> > +{
> > +       struct bpf_dynptr ptr;
> > +       struct ethhdr *hdr1, *hdr2;
> > +
> > +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > +       hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> > +       if (!hdr1)
> > +               return XDP_DROP;
> > +
> > +       hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> > +       if (!hdr2)
> > +               return XDP_DROP;
> > +
> > +       hdr1->h_proto = 12;
> > +       hdr2->h_proto = 12;
> > +
> > +       if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> > +               return XDP_DROP;
> > +
> > +       /* this should fail */
> > +       hdr2->h_proto = 1;
>
> is there something special about having both hdr1 and hdr2? Wouldn't
> this test work with just single hdr pointer?

Yes, this test would work with just 1 single hdr pointer (which is
what skb_invalid_data_slice does) but I wanted to ensure that this
also works in the case of multiple data slices. If you think this is
unnecessary / adds clutter, I can remove hdr2.

>
> > +
> > +       return XDP_PASS;
> > +}
> > +
> > +/* Only supported prog type can create skb-type dynptrs */
>
> [...]
>
> > +       err = 1;
> > +
> > +       if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> > +               return 0;
> > +       err++;
> > +
> > +       data = bpf_dynptr_data(&ptr, 0, 1);
> > +       if (data)
> > +               /* it's an error if data is not NULL since cgroup skbs
> > +                * are read only
> > +                */
> > +               return 0;
> > +       err++;
> > +
> > +       ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> > +       /* since cgroup skbs are read only, writes should fail */
> > +       if (ret != -EINVAL)
> > +               return 0;
> > +
> > +       err = 0;
>
> hm, if data is NULL you'll still report success if bpf_dynptr_write
> returns 0 or any other error but -EINVAL... The logic is a bit unclear
> here...
>
If data is NULL and bpf_dynptr_write returns 0 or any other error
besides -EINVAL, I think we report a failure here (err is set to a
non-zero value, which userspace checks against)

> > +
> > +       return 0;
> > +}
> > diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > new file mode 100644
> > index 000000000000..c879dfb6370a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > @@ -0,0 +1,115 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +/* This logic is lifted from a real-world use case of packet parsing, used in
> > + * the open source library katran, a layer 4 load balancer.
> > + *
> > + * This test demonstrates how to parse packet contents using dynptrs.
> > + *
> > + * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> > + */
> > +
> > +#include <string.h>
> > +#include <linux/bpf.h>
> > +#include <bpf/bpf_helpers.h>
> > +#include <linux/tcp.h>
> > +#include <stdbool.h>
> > +#include <linux/ipv6.h>
> > +#include <linux/if_ether.h>
> > +#include "test_tcp_hdr_options.h"
> > +
> > +char _license[] SEC("license") = "GPL";
> > +
> > +/* Arbitrarily picked unused value from IANA TCP Option Kind Numbers */
> > +const __u32 tcp_hdr_opt_kind_tpr = 0xB7;
> > +/* Length of the tcp header option */
> > +const __u32 tcp_hdr_opt_len_tpr = 6;
> > +/* maximum number of header options to check to lookup server_id */
> > +const __u32 tcp_hdr_opt_max_opt_checks = 15;
> > +
> > +__u32 server_id;
> > +
> > +static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
> > +                        __u32 *server_id)
> > +{
> > +       __u8 *tcp_opt, kind, hdr_len;
> > +       __u8 *data;
> > +
> > +       data = bpf_dynptr_data(ptr, *off, sizeof(kind) + sizeof(hdr_len) +
> > +                              sizeof(*server_id));
> > +       if (!data)
> > +               return -1;
> > +
> > +       kind = data[0];
> > +
> > +       if (kind == TCPOPT_EOL)
> > +               return -1;
> > +
> > +       if (kind == TCPOPT_NOP) {
> > +               *off += 1;
> > +               /* continue to the next option */
> > +               *hdr_bytes_remaining -= 1;
> > +
> > +               return 0;
> > +       }
> > +
> > +       if (*hdr_bytes_remaining < 2)
> > +               return -1;
> > +
> > +       hdr_len = data[1];
> > +       if (hdr_len > *hdr_bytes_remaining)
> > +               return -1;
> > +
> > +       if (kind == tcp_hdr_opt_kind_tpr) {
> > +               if (hdr_len != tcp_hdr_opt_len_tpr)
> > +                       return -1;
> > +
> > +               memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
>
> this implicitly relies on compiler inlining memcpy, let's use
> __builtint_memcpy() here instead to set a good example?

Sounds good, I will change this for v2. Should memcpys in bpf progs
always use __builtin_memcpy or is it on a case-by-case basis where if
the size is small enough, then you use it?
>
> > +               return 1;
> > +       }
> > +
> > +       *off += hdr_len;
> > +       *hdr_bytes_remaining -= hdr_len;
> > +
> > +       return 0;
> > +}
> > +
> > +SEC("xdp")
> > +int xdp_ingress_v6(struct xdp_md *xdp)
> > +{
> > +       __u8 hdr_bytes_remaining;
> > +       struct tcphdr *tcp_hdr;
> > +       __u8 tcp_hdr_opt_len;
> > +       int err = 0;
> > +       __u32 off;
> > +
> > +       struct bpf_dynptr ptr;
> > +
> > +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > +
> > +       off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
> > +
> > +       tcp_hdr = bpf_dynptr_data(&ptr, off, sizeof(*tcp_hdr));
> > +       if (!tcp_hdr)
> > +               return XDP_DROP;
> > +
> > +       tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
> > +       if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
> > +               return XDP_DROP;
> > +
> > +       hdr_bytes_remaining = tcp_hdr_opt_len;
> > +
> > +       off += sizeof(struct tcphdr);
> > +
> > +       /* max number of bytes of options in tcp header is 40 bytes */
> > +       for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
> > +               err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
> > +
> > +               if (err || !hdr_bytes_remaining)
> > +                       break;
> > +       }
> > +
> > +       if (!server_id)
> > +               return XDP_DROP;
> > +
> > +       return XDP_PASS;
> > +}
>
> I'm not a networking BPF expert, but the logic of packet parsing here
> looks pretty clean! Would it be possible to also backport original
> code with data and data_end, both for testing but also to be able to
> compare and contrast dynptr vs data/data_end approaches?
>

Yeah, good idea! I'll backport the original code from katran for v2.

>
> > diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > index c8bc0c6947aa..1fef7868ea8b 100644
> > --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
> > @@ -230,21 +230,18 @@ static __noinline bool get_packet_dst(struct real_definition **real,
> >         return true;
> >  }
> >
> > -static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> > +static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
> >                                    struct packet_description *pckt)
> >  {
> >         struct icmp6hdr *icmp_hdr;
> >         struct ipv6hdr *ip6h;
> >
> > -       icmp_hdr = data + off;
> > -       if (icmp_hdr + 1 > data_end)
> > +       icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*ip6h));
> > +       if (!icmp_hdr)
> >                 return TC_ACT_SHOT;
> >         if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
> >                 return TC_ACT_OK;
>
> previously you can still TC_ACT_OK if it's ICMPV6_PKT_TOOBIG even if
> packet size is < sizeof(*icmp_hdr) + sizeof(*ip6h), which might have
> been a bug, but current logic will enforce that packet is at least
> sizeof(*icmp_hdr) + sizeof(*ip6h). Is that a problem?

Good point, I think it's best to maintain the original behavior. I'll
fix this (and the one below) for v2.

>
> > -       off += sizeof(struct icmp6hdr);
> > -       ip6h = data + off;
> > -       if (ip6h + 1 > data_end)
> > -               return TC_ACT_SHOT;
> > +       ip6h = (struct ipv6hdr *)(icmp_hdr + 1);
> >         pckt->proto = ip6h->nexthdr;
> >         pckt->flags |= F_ICMP;
> >         memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
> > @@ -252,22 +249,19 @@ static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
> >         return TC_ACT_UNSPEC;
> >  }
> >
> > -static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> > +static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
> >                                  struct packet_description *pckt)
> >  {
> >         struct icmphdr *icmp_hdr;
> >         struct iphdr *iph;
> >
> > -       icmp_hdr = data + off;
> > -       if (icmp_hdr + 1 > data_end)
> > +       icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*iph));
> > +       if (!icmp_hdr)
> >                 return TC_ACT_SHOT;
> >         if (icmp_hdr->type != ICMP_DEST_UNREACH ||
> >             icmp_hdr->code != ICMP_FRAG_NEEDED)
> >                 return TC_ACT_OK;
>
> similarly here, short packets can still be TC_ACT_OK in some
> circumstances, while with dynptr they will be shot down early on. Not
> saying this is wrong or bad, just bringing this up for you and others
> to chime in if it's an ok change
>
> > -       off += sizeof(struct icmphdr);
> > -       iph = data + off;
> > -       if (iph + 1 > data_end)
> > -               return TC_ACT_SHOT;
> > +       iph = (struct iphdr *)(icmp_hdr + 1);
> >         if (iph->ihl != 5)
> >                 return TC_ACT_SHOT;
> >         pckt->proto = iph->protocol;
> > @@ -277,13 +271,13 @@ static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
> >         return TC_ACT_UNSPEC;
> >  }
>
> [...]
>
> > -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> >  {
> > -       void *data_end = (void *)(long)xdp->data_end;
> > -       void *data = (void *)(long)xdp->data;
> > +       struct bpf_dynptr new_xdp_ptr;
> >         struct iptnl_info *tnl;
> >         struct ethhdr *new_eth;
> >         struct ethhdr *old_eth;
> > -       struct iphdr *iph = data + sizeof(struct ethhdr);
> > +       struct iphdr *iph;
> >         __u16 *next_iph;
> >         __u16 payload_len;
> >         struct vip vip = {};
> > @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> >         __u32 csum = 0;
> >         int i;
> >
> > -       if (iph + 1 > data_end)
> > +       iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > +                             iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
>
> tcphdr_sz (20) is always bigger than udphdr_sz (8), so just use the
> bigger one here? Though again, for UDP packet it might be a bit too
> pessimistic to reject small packets?

Yeah, maybe the best way here is to first check that if data_end -
data is less than the size of the ethhdr + iphdr + tcphdr, then we
must use udphdr size, otherwise we can use tcphdr size. I'll change it
(and the one below) to this for v2

>
> > +       if (!iph)
> >                 return XDP_DROP;
> >
> > -       dport = get_dport(iph + 1, data_end, iph->protocol);
> > +       dport = get_dport(iph + 1, iph->protocol);
> >         if (dport == -1)
> >                 return XDP_DROP;
>
> [...]
>
> > -static __always_inline int handle_ipv6(struct xdp_md *xdp)
> > +static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> >  {
> > -       void *data_end = (void *)(long)xdp->data_end;
> > -       void *data = (void *)(long)xdp->data;
> > +       struct bpf_dynptr new_xdp_ptr;
> >         struct iptnl_info *tnl;
> >         struct ethhdr *new_eth;
> >         struct ethhdr *old_eth;
> > -       struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
> > +       struct ipv6hdr *ip6h;
> >         __u16 payload_len;
> >         struct vip vip = {};
> >         int dport;
> >
> > -       if (ip6h + 1 > data_end)
> > +       ip6h = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > +                              ipv6hdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
>
> ditto, there is no dynamism here, verifier actually enforces that this
> value is statically known, I think this example will create false
> assumptions if written this way
>
> > +       if (!ip6h)
> >                 return XDP_DROP;
> >
> > -       dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
> > +       dport = get_dport(ip6h + 1, ip6h->nexthdr);
> >         if (dport == -1)
> >                 return XDP_DROP;
> >
>
> [...]
Andrii Nakryiko Aug. 3, 2022, 12:53 a.m. UTC | #7
On Tue, Aug 2, 2022 at 3:56 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Aug 1, 2022 at 10:58 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> > >
> > > Test skb and xdp dynptr functionality in the following ways:
> > >
> > > 1) progs/test_xdp.c
> > >    * Change existing test to use dynptrs to parse xdp data
> > >
> > >      There were no noticeable diferences in user + system time between
> > >      the original version vs. using dynptrs. Averaging the time for 10
> > >      runs (run using "time ./test_progs -t xdp_bpf2bpf"):
> > >          original version: 0.0449 sec
> > >          with dynptrs: 0.0429 sec
> > >
> > > 2) progs/test_l4lb_noinline.c
> > >    * Change existing test to use dynptrs to parse skb data
> > >
> > >      There were no noticeable diferences in user + system time between
> > >      the original version vs. using dynptrs. Averaging the time for 10
> > >      runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
> > >          original version: 0.0502 sec
> > >          with dynptrs: 0.055 sec
> > >
> > >      For number of processed verifier instructions:
> > >          original version: 6284 insns
> > >          with dynptrs: 2538 insns
> > >
> > > 3) progs/test_dynptr_xdp.c
> > >    * Add sample code for parsing tcp hdr opt lookup using dynptrs.
> > >      This logic is lifted from a real-world use case of packet parsing in
> > >      katran [0], a layer 4 load balancer
> > >
> > > 4) progs/dynptr_success.c
> > >    * Add test case "test_skb_readonly" for testing attempts at writes /
> > >      data slices on a prog type with read-only skb ctx.
> > >
> > > 5) progs/dynptr_fail.c
> > >    * Add test cases "skb_invalid_data_slice" and
> > >      "xdp_invalid_data_slice" for testing that helpers that modify the
> > >      underlying packet buffer automatically invalidate the associated
> > >      data slice.
> > >    * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
> > >      that prog types that do not support bpf_dynptr_from_skb/xdp don't
> > >      have access to the API.
> > >
> > > [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > ---
> > >  .../testing/selftests/bpf/prog_tests/dynptr.c |  85 ++++++++++---
> > >  .../selftests/bpf/prog_tests/dynptr_xdp.c     |  49 ++++++++
> > >  .../testing/selftests/bpf/progs/dynptr_fail.c |  76 ++++++++++++
> > >  .../selftests/bpf/progs/dynptr_success.c      |  32 +++++
> > >  .../selftests/bpf/progs/test_dynptr_xdp.c     | 115 ++++++++++++++++++
> > >  .../selftests/bpf/progs/test_l4lb_noinline.c  |  71 +++++------
> > >  tools/testing/selftests/bpf/progs/test_xdp.c  |  95 +++++++--------
> > >  .../selftests/bpf/test_tcp_hdr_options.h      |   1 +
> > >  8 files changed, 416 insertions(+), 108 deletions(-)
> > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > >  create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > >
> > > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > index bcf80b9f7c27..c40631f33c7b 100644
> > > --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > @@ -2,6 +2,7 @@
> > >  /* Copyright (c) 2022 Facebook */
> > >
> > >  #include <test_progs.h>
> > > +#include <network_helpers.h>
> > >  #include "dynptr_fail.skel.h"
> > >  #include "dynptr_success.skel.h"
> > >
> > > @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
> > >  static struct {
> > >         const char *prog_name;
> > >         const char *expected_err_msg;
> > > -} dynptr_tests[] = {
> > > -       /* failure cases */
> > > +} verifier_error_tests[] = {
> > > +       /* these cases should trigger a verifier error */
> > >         {"ringbuf_missing_release1", "Unreleased reference id=1"},
> > >         {"ringbuf_missing_release2", "Unreleased reference id=2"},
> > >         {"ringbuf_missing_release_callback", "Unreleased reference id"},
> > > @@ -42,11 +43,25 @@ static struct {
> > >         {"release_twice_callback", "arg 1 is an unacquired reference"},
> > >         {"dynptr_from_mem_invalid_api",
> > >                 "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> > > +       {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> > > +       {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> > > +       {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> > > +       {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> > > +};
> > > +
> > > +enum test_setup_type {
> > > +       SETUP_SYSCALL_SLEEP,
> > > +       SETUP_SKB_PROG,
> > > +};
> > >
> > > -       /* success cases */
> > > -       {"test_read_write", NULL},
> > > -       {"test_data_slice", NULL},
> > > -       {"test_ringbuf", NULL},
> > > +static struct {
> > > +       const char *prog_name;
> > > +       enum test_setup_type type;
> > > +} runtime_tests[] = {
> > > +       {"test_read_write", SETUP_SYSCALL_SLEEP},
> > > +       {"test_data_slice", SETUP_SYSCALL_SLEEP},
> > > +       {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> > > +       {"test_skb_readonly", SETUP_SKB_PROG},
> >
> > nit: wouldn't it be better to add test_setup_type to dynptr_tests (and
> > keep fail and success cases together)? It's conceivable that you might
> > want different setups to test different error conditions, right?
>
> Yeah! I originally separated it out because the success tests don't
> have an error message while the fail ones do, and fail ones don't have
> a setup (I don't think we'll need any custom userspace setup for those
> since we're checking for verifier failures at prog load time) and the
> success ones do. But I can combine them into 1 so that it's simpler. I
> will do this in v2.

great, thanks! you might actually need custom setup for SKB vs XDP
programs if you are unifying bpf_dynptr_from_packet?

> >
> > >  };
> > >
> > >  static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > > @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > >         dynptr_fail__destroy(skel);
> > >  }
> > >

[...]

> > > +/* The data slice is invalidated whenever a helper changes packet data */
> > > +SEC("?xdp")
> > > +int xdp_invalid_data_slice(struct xdp_md *xdp)
> > > +{
> > > +       struct bpf_dynptr ptr;
> > > +       struct ethhdr *hdr1, *hdr2;
> > > +
> > > +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > > +       hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> > > +       if (!hdr1)
> > > +               return XDP_DROP;
> > > +
> > > +       hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> > > +       if (!hdr2)
> > > +               return XDP_DROP;
> > > +
> > > +       hdr1->h_proto = 12;
> > > +       hdr2->h_proto = 12;
> > > +
> > > +       if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> > > +               return XDP_DROP;
> > > +
> > > +       /* this should fail */
> > > +       hdr2->h_proto = 1;
> >
> > is there something special about having both hdr1 and hdr2? Wouldn't
> > this test work with just single hdr pointer?
>
> Yes, this test would work with just 1 single hdr pointer (which is
> what skb_invalid_data_slice does) but I wanted to ensure that this
> also works in the case of multiple data slices. If you think this is
> unnecessary / adds clutter, I can remove hdr2.


I think testing two pointers isn't the point, so I'd keep the test
minimal. It seems like testing two pointers should be in a success
test, to prove it works, rather than as some side effect of an
expected-to-fail test, no?

>
> >
> > > +
> > > +       return XDP_PASS;
> > > +}
> > > +
> > > +/* Only supported prog type can create skb-type dynptrs */
> >
> > [...]
> >
> > > +       err = 1;
> > > +
> > > +       if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> > > +               return 0;
> > > +       err++;
> > > +
> > > +       data = bpf_dynptr_data(&ptr, 0, 1);
> > > +       if (data)
> > > +               /* it's an error if data is not NULL since cgroup skbs
> > > +                * are read only
> > > +                */
> > > +               return 0;
> > > +       err++;
> > > +
> > > +       ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> > > +       /* since cgroup skbs are read only, writes should fail */
> > > +       if (ret != -EINVAL)
> > > +               return 0;
> > > +
> > > +       err = 0;
> >
> > hm, if data is NULL you'll still report success if bpf_dynptr_write
> > returns 0 or any other error but -EINVAL... The logic is a bit unclear
> > here...
> >
> If data is NULL and bpf_dynptr_write returns 0 or any other error
> besides -EINVAL, I think we report a failure here (err is set to a
> non-zero value, which userspace checks against)

oh, ok, I read it backwards. I find this "stateful increasing error
number" pattern very confusing. Why not write it more
straightforwardly as:

if (bpf_dynptr_from_skb(...)) {
    err = 1;
    return 0;
}

data = bpf_dynptr_data(...);
if (data) {
    err = 2;
    return 0;
}

ret = bpf_dynptr_write(...);
if (ret != -EINVAL) {
    err = 3;
    return 0;
}

/* all tests passed */
err = 0;
return 0;

?

>
> > > +
> > > +       return 0;
> > > +}
> > > diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > > new file mode 100644
> > > index 000000000000..c879dfb6370a
> > > --- /dev/null
> > > +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > > @@ -0,0 +1,115 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> > > +

[...]

> > > +       hdr_len = data[1];
> > > +       if (hdr_len > *hdr_bytes_remaining)
> > > +               return -1;
> > > +
> > > +       if (kind == tcp_hdr_opt_kind_tpr) {
> > > +               if (hdr_len != tcp_hdr_opt_len_tpr)
> > > +                       return -1;
> > > +
> > > +               memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
> >
> > this implicitly relies on compiler inlining memcpy, let's use
> > __builtint_memcpy() here instead to set a good example?
>
> Sounds good, I will change this for v2. Should memcpys in bpf progs
> always use __builtin_memcpy or is it on a case-by-case basis where if
> the size is small enough, then you use it?

__builtin_memcpy() is best. When we write just "memcpy()" we still
rely on compiler to actually optimizing that to __builtin_memcpy(),
because there is no memcpy() (we'd get unrecognized extern error if
compiler actually emitted call to memcpy()).

> >
> > > +               return 1;
> > > +       }
> > > +
> > > +       *off += hdr_len;
> > > +       *hdr_bytes_remaining -= hdr_len;
> > > +
> > > +       return 0;
> > > +}
> > > +

[...]
Joanne Koong Aug. 3, 2022, 4:11 p.m. UTC | #8
On Tue, Aug 2, 2022 at 5:53 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 2, 2022 at 3:56 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > On Mon, Aug 1, 2022 at 10:58 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> > > >
> > > > Test skb and xdp dynptr functionality in the following ways:
> > > >
> > > > 1) progs/test_xdp.c
> > > >    * Change existing test to use dynptrs to parse xdp data
> > > >
> > > >      There were no noticeable diferences in user + system time between
> > > >      the original version vs. using dynptrs. Averaging the time for 10
> > > >      runs (run using "time ./test_progs -t xdp_bpf2bpf"):
> > > >          original version: 0.0449 sec
> > > >          with dynptrs: 0.0429 sec
> > > >
> > > > 2) progs/test_l4lb_noinline.c
> > > >    * Change existing test to use dynptrs to parse skb data
> > > >
> > > >      There were no noticeable diferences in user + system time between
> > > >      the original version vs. using dynptrs. Averaging the time for 10
> > > >      runs (run using "time ./test_progs -t l4lb_all/l4lb_noinline"):
> > > >          original version: 0.0502 sec
> > > >          with dynptrs: 0.055 sec
> > > >
> > > >      For number of processed verifier instructions:
> > > >          original version: 6284 insns
> > > >          with dynptrs: 2538 insns
> > > >
> > > > 3) progs/test_dynptr_xdp.c
> > > >    * Add sample code for parsing tcp hdr opt lookup using dynptrs.
> > > >      This logic is lifted from a real-world use case of packet parsing in
> > > >      katran [0], a layer 4 load balancer
> > > >
> > > > 4) progs/dynptr_success.c
> > > >    * Add test case "test_skb_readonly" for testing attempts at writes /
> > > >      data slices on a prog type with read-only skb ctx.
> > > >
> > > > 5) progs/dynptr_fail.c
> > > >    * Add test cases "skb_invalid_data_slice" and
> > > >      "xdp_invalid_data_slice" for testing that helpers that modify the
> > > >      underlying packet buffer automatically invalidate the associated
> > > >      data slice.
> > > >    * Add test cases "skb_invalid_ctx" and "xdp_invalid_ctx" for testing
> > > >      that prog types that do not support bpf_dynptr_from_skb/xdp don't
> > > >      have access to the API.
> > > >
> > > > [0] https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
> > > >
> > > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > > ---
> > > >  .../testing/selftests/bpf/prog_tests/dynptr.c |  85 ++++++++++---
> > > >  .../selftests/bpf/prog_tests/dynptr_xdp.c     |  49 ++++++++
> > > >  .../testing/selftests/bpf/progs/dynptr_fail.c |  76 ++++++++++++
> > > >  .../selftests/bpf/progs/dynptr_success.c      |  32 +++++
> > > >  .../selftests/bpf/progs/test_dynptr_xdp.c     | 115 ++++++++++++++++++
> > > >  .../selftests/bpf/progs/test_l4lb_noinline.c  |  71 +++++------
> > > >  tools/testing/selftests/bpf/progs/test_xdp.c  |  95 +++++++--------
> > > >  .../selftests/bpf/test_tcp_hdr_options.h      |   1 +
> > > >  8 files changed, 416 insertions(+), 108 deletions(-)
> > > >  create mode 100644 tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
> > > >  create mode 100644 tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > > >
> > > > diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > > index bcf80b9f7c27..c40631f33c7b 100644
> > > > --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > > +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
> > > > @@ -2,6 +2,7 @@
> > > >  /* Copyright (c) 2022 Facebook */
> > > >
> > > >  #include <test_progs.h>
> > > > +#include <network_helpers.h>
> > > >  #include "dynptr_fail.skel.h"
> > > >  #include "dynptr_success.skel.h"
> > > >
> > > > @@ -11,8 +12,8 @@ static char obj_log_buf[1048576];
> > > >  static struct {
> > > >         const char *prog_name;
> > > >         const char *expected_err_msg;
> > > > -} dynptr_tests[] = {
> > > > -       /* failure cases */
> > > > +} verifier_error_tests[] = {
> > > > +       /* these cases should trigger a verifier error */
> > > >         {"ringbuf_missing_release1", "Unreleased reference id=1"},
> > > >         {"ringbuf_missing_release2", "Unreleased reference id=2"},
> > > >         {"ringbuf_missing_release_callback", "Unreleased reference id"},
> > > > @@ -42,11 +43,25 @@ static struct {
> > > >         {"release_twice_callback", "arg 1 is an unacquired reference"},
> > > >         {"dynptr_from_mem_invalid_api",
> > > >                 "Unsupported reg type fp for bpf_dynptr_from_mem data"},
> > > > +       {"skb_invalid_data_slice", "invalid mem access 'scalar'"},
> > > > +       {"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
> > > > +       {"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
> > > > +       {"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
> > > > +};
> > > > +
> > > > +enum test_setup_type {
> > > > +       SETUP_SYSCALL_SLEEP,
> > > > +       SETUP_SKB_PROG,
> > > > +};
> > > >
> > > > -       /* success cases */
> > > > -       {"test_read_write", NULL},
> > > > -       {"test_data_slice", NULL},
> > > > -       {"test_ringbuf", NULL},
> > > > +static struct {
> > > > +       const char *prog_name;
> > > > +       enum test_setup_type type;
> > > > +} runtime_tests[] = {
> > > > +       {"test_read_write", SETUP_SYSCALL_SLEEP},
> > > > +       {"test_data_slice", SETUP_SYSCALL_SLEEP},
> > > > +       {"test_ringbuf", SETUP_SYSCALL_SLEEP},
> > > > +       {"test_skb_readonly", SETUP_SKB_PROG},
> > >
> > > nit: wouldn't it be better to add test_setup_type to dynptr_tests (and
> > > keep fail and success cases together)? It's conceivable that you might
> > > want different setups to test different error conditions, right?
> >
> > Yeah! I originally separated it out because the success tests don't
> > have an error message while the fail ones do, and fail ones don't have
> > a setup (I don't think we'll need any custom userspace setup for those
> > since we're checking for verifier failures at prog load time) and the
> > success ones do. But I can combine them into 1 so that it's simpler. I
> > will do this in v2.
>
> great, thanks! you might actually need custom setup for SKB vs XDP
> programs if you are unifying bpf_dynptr_from_packet?

Yes I think so for the success cases (for the fail cases, I think just
having SEC("xdp") and SEC("tc") is sufficient)

>
> > >
> > > >  };
> > > >
> > > >  static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > > > @@ -85,7 +100,7 @@ static void verify_fail(const char *prog_name, const char *expected_err_msg)
> > > >         dynptr_fail__destroy(skel);
> > > >  }
> > > >
>
> [...]
>
> > > > +/* The data slice is invalidated whenever a helper changes packet data */
> > > > +SEC("?xdp")
> > > > +int xdp_invalid_data_slice(struct xdp_md *xdp)
> > > > +{
> > > > +       struct bpf_dynptr ptr;
> > > > +       struct ethhdr *hdr1, *hdr2;
> > > > +
> > > > +       bpf_dynptr_from_xdp(xdp, 0, &ptr);
> > > > +       hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
> > > > +       if (!hdr1)
> > > > +               return XDP_DROP;
> > > > +
> > > > +       hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
> > > > +       if (!hdr2)
> > > > +               return XDP_DROP;
> > > > +
> > > > +       hdr1->h_proto = 12;
> > > > +       hdr2->h_proto = 12;
> > > > +
> > > > +       if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
> > > > +               return XDP_DROP;
> > > > +
> > > > +       /* this should fail */
> > > > +       hdr2->h_proto = 1;
> > >
> > > is there something special about having both hdr1 and hdr2? Wouldn't
> > > this test work with just single hdr pointer?
> >
> > Yes, this test would work with just 1 single hdr pointer (which is
> > what skb_invalid_data_slice does) but I wanted to ensure that this
> > also works in the case of multiple data slices. If you think this is
> > unnecessary / adds clutter, I can remove hdr2.
>
>
> I think testing two pointers isn't the point, so I'd keep the test
> minimal. It seems like testing two pointers should be in a success
> test, to prove it works, rather than as some side effect of an
> expected-to-fail test, no?

My intention was to test that two pointers doesn't work (eg that when
you have multiple data slices, changing the packet buffer should
invalidate all slices, so that any attempt to access any slice should
fail) so I think to test that this would have to stay in the
verifier_fail test. I think this test might be more confusing than
helpful, so I will just remove hdr2 for v2 :)

>
> >
> > >
> > > > +
> > > > +       return XDP_PASS;
> > > > +}
> > > > +
> > > > +/* Only supported prog type can create skb-type dynptrs */
> > >
> > > [...]
> > >
> > > > +       err = 1;
> > > > +
> > > > +       if (bpf_dynptr_from_skb(ctx, 0, &ptr))
> > > > +               return 0;
> > > > +       err++;
> > > > +
> > > > +       data = bpf_dynptr_data(&ptr, 0, 1);
> > > > +       if (data)
> > > > +               /* it's an error if data is not NULL since cgroup skbs
> > > > +                * are read only
> > > > +                */
> > > > +               return 0;
> > > > +       err++;
> > > > +
> > > > +       ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
> > > > +       /* since cgroup skbs are read only, writes should fail */
> > > > +       if (ret != -EINVAL)
> > > > +               return 0;
> > > > +
> > > > +       err = 0;
> > >
> > > hm, if data is NULL you'll still report success if bpf_dynptr_write
> > > returns 0 or any other error but -EINVAL... The logic is a bit unclear
> > > here...
> > >
> > If data is NULL and bpf_dynptr_write returns 0 or any other error
> > besides -EINVAL, I think we report a failure here (err is set to a
> > non-zero value, which userspace checks against)
>
> oh, ok, I read it backwards. I find this "stateful increasing error
> number" pattern very confusing. Why not write it more
> straightforwardly as:
>
> if (bpf_dynptr_from_skb(...)) {
>     err = 1;
>     return 0;
> }
>
> data = bpf_dynptr_data(...);
> if (data) {
>     err = 2;
>     return 0;
> }
>
> ret = bpf_dynptr_write(...);
> if (ret != -EINVAL) {
>     err = 3;
>     return 0;
> }
>
> /* all tests passed */
> err = 0;
> return 0;
>
> ?
>

My thinking was that err++ would be more robust (eg if you add another
case to the code, then you don't have to go down and fix all the err
numbers to adjust them by 1). But you're right, I think this just ends
up being confusing / non-intuitive to read. I will change it to the
more explicit way for v2

> >
> > > > +
> > > > +       return 0;
> > > > +}
> > > > diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > > > new file mode 100644
> > > > index 000000000000..c879dfb6370a
> > > > --- /dev/null
> > > > +++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
> > > > @@ -0,0 +1,115 @@
> > > > +// SPDX-License-Identifier: GPL-2.0
> > > > +
>
> [...]
>
> > > > +       hdr_len = data[1];
> > > > +       if (hdr_len > *hdr_bytes_remaining)
> > > > +               return -1;
> > > > +
> > > > +       if (kind == tcp_hdr_opt_kind_tpr) {
> > > > +               if (hdr_len != tcp_hdr_opt_len_tpr)
> > > > +                       return -1;
> > > > +
> > > > +               memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
> > >
> > > this implicitly relies on compiler inlining memcpy, let's use
> > > __builtint_memcpy() here instead to set a good example?
> >
> > Sounds good, I will change this for v2. Should memcpys in bpf progs
> > always use __builtin_memcpy or is it on a case-by-case basis where if
> > the size is small enough, then you use it?
>
> __builtin_memcpy() is best. When we write just "memcpy()" we still
> rely on compiler to actually optimizing that to __builtin_memcpy(),
> because there is no memcpy() (we'd get unrecognized extern error if
> compiler actually emitted call to memcpy()).

Ohh I see, thanks for the explanation!

I am going to do some selftests cleanup this week, so I'll change the
other usages of memcpy() to __builtin_memcpy() as part of that clean
up.

>
> > >
> > > > +               return 1;
> > > > +       }
> > > > +
> > > > +       *off += hdr_len;
> > > > +       *hdr_bytes_remaining -= hdr_len;
> > > > +
> > > > +       return 0;
> > > > +}
> > > > +
>
> [...]
Alexei Starovoitov Aug. 4, 2022, 6:45 p.m. UTC | #9
On Wed, Aug 3, 2022 at 9:11 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > __builtin_memcpy() is best. When we write just "memcpy()" we still
> > rely on compiler to actually optimizing that to __builtin_memcpy(),
> > because there is no memcpy() (we'd get unrecognized extern error if
> > compiler actually emitted call to memcpy()).
>
> Ohh I see, thanks for the explanation!
>
> I am going to do some selftests cleanup this week, so I'll change the
> other usages of memcpy() to __builtin_memcpy() as part of that clean
> up.

builtin_memcpy might be doing single byte copy when
alignment is not known which is often the case when
working with packets.
If we do this cleanup, let's copy-paste cilium's memcpy
helper that does 8 byte copy.
It's much better than builtin_memcpy.
https://github.com/cilium/cilium/blob/master/bpf/include/bpf/builtins.h
Joanne Koong Aug. 4, 2022, 9:46 p.m. UTC | #10
On Tue, Aug 2, 2022 at 3:21 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Aug 1, 2022 at 12:12 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Jul 26, 2022 at 11:48 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> > >
> > >
> > > -static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > > +static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
> > >  {
> > > -       void *data_end = (void *)(long)xdp->data_end;
> > > -       void *data = (void *)(long)xdp->data;
> > > +       struct bpf_dynptr new_xdp_ptr;
> > >         struct iptnl_info *tnl;
> > >         struct ethhdr *new_eth;
> > >         struct ethhdr *old_eth;
> > > -       struct iphdr *iph = data + sizeof(struct ethhdr);
> > > +       struct iphdr *iph;
> > >         __u16 *next_iph;
> > >         __u16 payload_len;
> > >         struct vip vip = {};
> > > @@ -90,10 +90,12 @@ static __always_inline int handle_ipv4(struct xdp_md *xdp)
> > >         __u32 csum = 0;
> > >         int i;
> > >
> > > -       if (iph + 1 > data_end)
> > > +       iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
> > > +                             iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
> > > +       if (!iph)
> > >                 return XDP_DROP;
> >
> > dynptr based xdp/skb access looks neat.
> > Maybe in addition to bpf_dynptr_data() we can add helper(s)
> > that return skb/xdp_md from dynptr?
> > This way the code will be passing dynptr only and there will
> > be no need to pass around 'struct xdp_md *xdp' (like this function).
>
> Great idea! I'll add this to v2.

Thinking about this some more, I don't think the extra helpers will be
that useful. We'd have to add 2 custom helpers (bpf_dynptr_get_xdp +
bpf_dynptr_get_skb) and calling them would always require a null check
(since we'd return NULL if the dyntpr is invalid/null). I think it'd
be faster / easier for the program to just pass in the ctx as an extra
arg in the special cases where it needs that.

>
> >
> > Separately please keep the existing tests instead of converting them.
> > Either ifdef data/data_end vs dynptr style or copy paste
> > the whole test into a new .c file. Whichever is cleaner.
>
> Will do for v2.
Joanne Koong Aug. 5, 2022, 4:29 p.m. UTC | #11
On Thu, Aug 4, 2022 at 11:45 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Wed, Aug 3, 2022 at 9:11 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> > >
> > > __builtin_memcpy() is best. When we write just "memcpy()" we still
> > > rely on compiler to actually optimizing that to __builtin_memcpy(),
> > > because there is no memcpy() (we'd get unrecognized extern error if
> > > compiler actually emitted call to memcpy()).
> >
> > Ohh I see, thanks for the explanation!
> >
> > I am going to do some selftests cleanup this week, so I'll change the
> > other usages of memcpy() to __builtin_memcpy() as part of that clean
> > up.
>
> builtin_memcpy might be doing single byte copy when
> alignment is not known which is often the case when
> working with packets.
> If we do this cleanup, let's copy-paste cilium's memcpy
> helper that does 8 byte copy.
> It's much better than builtin_memcpy.
> https://github.com/cilium/cilium/blob/master/bpf/include/bpf/builtins.h

Maybe we should have a separate patchset to changing to use cilium's
other builtins as well (eg memzero, memcmp, memmove); I'll leave the
memcpy() -> __builtin_memcpy() changes to be part of that patchset.
diff mbox series

Patch

diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c
index bcf80b9f7c27..c40631f33c7b 100644
--- a/tools/testing/selftests/bpf/prog_tests/dynptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c
@@ -2,6 +2,7 @@ 
 /* Copyright (c) 2022 Facebook */
 
 #include <test_progs.h>
+#include <network_helpers.h>
 #include "dynptr_fail.skel.h"
 #include "dynptr_success.skel.h"
 
@@ -11,8 +12,8 @@  static char obj_log_buf[1048576];
 static struct {
 	const char *prog_name;
 	const char *expected_err_msg;
-} dynptr_tests[] = {
-	/* failure cases */
+} verifier_error_tests[] = {
+	/* these cases should trigger a verifier error */
 	{"ringbuf_missing_release1", "Unreleased reference id=1"},
 	{"ringbuf_missing_release2", "Unreleased reference id=2"},
 	{"ringbuf_missing_release_callback", "Unreleased reference id"},
@@ -42,11 +43,25 @@  static struct {
 	{"release_twice_callback", "arg 1 is an unacquired reference"},
 	{"dynptr_from_mem_invalid_api",
 		"Unsupported reg type fp for bpf_dynptr_from_mem data"},
+	{"skb_invalid_data_slice", "invalid mem access 'scalar'"},
+	{"xdp_invalid_data_slice", "invalid mem access 'scalar'"},
+	{"skb_invalid_ctx", "unknown func bpf_dynptr_from_skb"},
+	{"xdp_invalid_ctx", "unknown func bpf_dynptr_from_xdp"},
+};
+
+enum test_setup_type {
+	SETUP_SYSCALL_SLEEP,
+	SETUP_SKB_PROG,
+};
 
-	/* success cases */
-	{"test_read_write", NULL},
-	{"test_data_slice", NULL},
-	{"test_ringbuf", NULL},
+static struct {
+	const char *prog_name;
+	enum test_setup_type type;
+} runtime_tests[] = {
+	{"test_read_write", SETUP_SYSCALL_SLEEP},
+	{"test_data_slice", SETUP_SYSCALL_SLEEP},
+	{"test_ringbuf", SETUP_SYSCALL_SLEEP},
+	{"test_skb_readonly", SETUP_SKB_PROG},
 };
 
 static void verify_fail(const char *prog_name, const char *expected_err_msg)
@@ -85,7 +100,7 @@  static void verify_fail(const char *prog_name, const char *expected_err_msg)
 	dynptr_fail__destroy(skel);
 }
 
-static void verify_success(const char *prog_name)
+static void run_tests(const char *prog_name, enum test_setup_type setup_type)
 {
 	struct dynptr_success *skel;
 	struct bpf_program *prog;
@@ -107,15 +122,42 @@  static void verify_success(const char *prog_name)
 	if (!ASSERT_OK_PTR(prog, "bpf_object__find_program_by_name"))
 		goto cleanup;
 
-	link = bpf_program__attach(prog);
-	if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
-		goto cleanup;
+	switch (setup_type) {
+	case SETUP_SYSCALL_SLEEP:
+		link = bpf_program__attach(prog);
+		if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+			goto cleanup;
 
-	usleep(1);
+		usleep(1);
 
-	ASSERT_EQ(skel->bss->err, 0, "err");
+		bpf_link__destroy(link);
+		break;
+	case SETUP_SKB_PROG:
+	{
+		int prog_fd, err;
+		char buf[64];
+
+		prog_fd = bpf_program__fd(prog);
+		if (CHECK_FAIL(prog_fd < 0))
+			goto cleanup;
+
+		LIBBPF_OPTS(bpf_test_run_opts, topts,
+			    .data_in = &pkt_v4,
+			    .data_size_in = sizeof(pkt_v4),
+			    .data_out = buf,
+			    .data_size_out = sizeof(buf),
+			    .repeat = 1,
+		);
 
-	bpf_link__destroy(link);
+		err = bpf_prog_test_run_opts(prog_fd, &topts);
+
+		if (!ASSERT_OK(err, "test_run"))
+			goto cleanup;
+
+		break;
+	}
+	}
+	ASSERT_EQ(skel->bss->err, 0, "err");
 
 cleanup:
 	dynptr_success__destroy(skel);
@@ -125,14 +167,17 @@  void test_dynptr(void)
 {
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(dynptr_tests); i++) {
-		if (!test__start_subtest(dynptr_tests[i].prog_name))
+	for (i = 0; i < ARRAY_SIZE(verifier_error_tests); i++) {
+		if (!test__start_subtest(verifier_error_tests[i].prog_name))
+			continue;
+
+		verify_fail(verifier_error_tests[i].prog_name,
+			    verifier_error_tests[i].expected_err_msg);
+	}
+	for (i = 0; i < ARRAY_SIZE(runtime_tests); i++) {
+		if (!test__start_subtest(runtime_tests[i].prog_name))
 			continue;
 
-		if (dynptr_tests[i].expected_err_msg)
-			verify_fail(dynptr_tests[i].prog_name,
-				    dynptr_tests[i].expected_err_msg);
-		else
-			verify_success(dynptr_tests[i].prog_name);
+		run_tests(runtime_tests[i].prog_name, runtime_tests[i].type);
 	}
 }
diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
new file mode 100644
index 000000000000..ca775d126b60
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/dynptr_xdp.c
@@ -0,0 +1,49 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "test_dynptr_xdp.skel.h"
+#include "test_tcp_hdr_options.h"
+
+struct test_pkt {
+	struct ipv6_packet pk6_v6;
+	u8 options[16];
+} __packed;
+
+void test_dynptr_xdp(void)
+{
+	struct test_dynptr_xdp *skel;
+	char buf[128];
+	int err;
+
+	skel = test_dynptr_xdp__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open_and_load"))
+		return;
+
+	struct test_pkt pkt = {
+		.pk6_v6.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+		.pk6_v6.iph.nexthdr = IPPROTO_TCP,
+		.pk6_v6.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+		.pk6_v6.tcp.urg_ptr = 123,
+		.pk6_v6.tcp.doff = 9, /* 16 bytes of options */
+
+		.options = {
+			TCPOPT_MSS, 4, 0x05, 0xB4, TCPOPT_NOP, TCPOPT_NOP,
+			skel->rodata->tcp_hdr_opt_kind_tpr, 6, 0, 0, 0, 9, TCPOPT_EOL
+		},
+	};
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		    .data_in = &pkt,
+		    .data_size_in = sizeof(pkt),
+		    .data_out = buf,
+		    .data_size_out = sizeof(buf),
+		    .repeat = 3,
+	);
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.xdp_ingress_v6), &topts);
+	ASSERT_OK(err, "ipv6 test_run");
+	ASSERT_EQ(skel->bss->server_id, 0x9000000, "server id");
+	ASSERT_EQ(topts.retval, XDP_PASS, "ipv6 test_run retval");
+
+	test_dynptr_xdp__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/progs/dynptr_fail.c b/tools/testing/selftests/bpf/progs/dynptr_fail.c
index c1814938a5fd..4e3f853b2d02 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_fail.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_fail.c
@@ -5,6 +5,7 @@ 
 #include <string.h>
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
+#include <linux/if_ether.h>
 #include "bpf_misc.h"
 
 char _license[] SEC("license") = "GPL";
@@ -622,3 +623,78 @@  int dynptr_from_mem_invalid_api(void *ctx)
 
 	return 0;
 }
+
+/* The data slice is invalidated whenever a helper changes packet data */
+SEC("?tc")
+int skb_invalid_data_slice(struct __sk_buff *skb)
+{
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr;
+
+	bpf_dynptr_from_skb(skb, 0, &ptr);
+	hdr = bpf_dynptr_data(&ptr, 0, sizeof(*hdr));
+	if (!hdr)
+		return SK_DROP;
+
+	hdr->h_proto = 12;
+
+	if (bpf_skb_pull_data(skb, skb->len))
+		return SK_DROP;
+
+	/* this should fail */
+	hdr->h_proto = 1;
+
+	return SK_PASS;
+}
+
+/* The data slice is invalidated whenever a helper changes packet data */
+SEC("?xdp")
+int xdp_invalid_data_slice(struct xdp_md *xdp)
+{
+	struct bpf_dynptr ptr;
+	struct ethhdr *hdr1, *hdr2;
+
+	bpf_dynptr_from_xdp(xdp, 0, &ptr);
+	hdr1 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr1));
+	if (!hdr1)
+		return XDP_DROP;
+
+	hdr2 = bpf_dynptr_data(&ptr, 0, sizeof(*hdr2));
+	if (!hdr2)
+		return XDP_DROP;
+
+	hdr1->h_proto = 12;
+	hdr2->h_proto = 12;
+
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(*hdr1)))
+		return XDP_DROP;
+
+	/* this should fail */
+	hdr2->h_proto = 1;
+
+	return XDP_PASS;
+}
+
+/* Only supported prog type can create skb-type dynptrs */
+SEC("?raw_tp/sys_nanosleep")
+int skb_invalid_ctx(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_from_skb(ctx, 0, &ptr);
+
+	return 0;
+}
+
+/* Only supported prog type can create xdp-type dynptrs */
+SEC("?raw_tp/sys_nanosleep")
+int xdp_invalid_ctx(void *ctx)
+{
+	struct bpf_dynptr ptr;
+
+	/* this should fail */
+	bpf_dynptr_from_xdp(ctx, 0, &ptr);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c
index a3a6103c8569..ffddd6ddc7fb 100644
--- a/tools/testing/selftests/bpf/progs/dynptr_success.c
+++ b/tools/testing/selftests/bpf/progs/dynptr_success.c
@@ -162,3 +162,35 @@  int test_ringbuf(void *ctx)
 	bpf_ringbuf_discard_dynptr(&ptr, 0);
 	return 0;
 }
+
+SEC("cgroup_skb/egress")
+int test_skb_readonly(void *ctx)
+{
+	__u8 write_data[2] = {1, 2};
+	struct bpf_dynptr ptr;
+	void *data;
+	int ret;
+
+	err = 1;
+
+	if (bpf_dynptr_from_skb(ctx, 0, &ptr))
+		return 0;
+	err++;
+
+	data = bpf_dynptr_data(&ptr, 0, 1);
+	if (data)
+		/* it's an error if data is not NULL since cgroup skbs
+		 * are read only
+		 */
+		return 0;
+	err++;
+
+	ret = bpf_dynptr_write(&ptr, 0, write_data, sizeof(write_data), 0);
+	/* since cgroup skbs are read only, writes should fail */
+	if (ret != -EINVAL)
+		return 0;
+
+	err = 0;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
new file mode 100644
index 000000000000..c879dfb6370a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_dynptr_xdp.c
@@ -0,0 +1,115 @@ 
+// SPDX-License-Identifier: GPL-2.0
+
+/* This logic is lifted from a real-world use case of packet parsing, used in
+ * the open source library katran, a layer 4 load balancer.
+ *
+ * This test demonstrates how to parse packet contents using dynptrs.
+ *
+ * https://github.com/facebookincubator/katran/blob/main/katran/lib/bpf/pckt_parsing.h
+ */
+
+#include <string.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <linux/tcp.h>
+#include <stdbool.h>
+#include <linux/ipv6.h>
+#include <linux/if_ether.h>
+#include "test_tcp_hdr_options.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Arbitrarily picked unused value from IANA TCP Option Kind Numbers */
+const __u32 tcp_hdr_opt_kind_tpr = 0xB7;
+/* Length of the tcp header option */
+const __u32 tcp_hdr_opt_len_tpr = 6;
+/* maximum number of header options to check to lookup server_id */
+const __u32 tcp_hdr_opt_max_opt_checks = 15;
+
+__u32 server_id;
+
+static int parse_hdr_opt(struct bpf_dynptr *ptr, __u32 *off, __u8 *hdr_bytes_remaining,
+			 __u32 *server_id)
+{
+	__u8 *tcp_opt, kind, hdr_len;
+	__u8 *data;
+
+	data = bpf_dynptr_data(ptr, *off, sizeof(kind) + sizeof(hdr_len) +
+			       sizeof(*server_id));
+	if (!data)
+		return -1;
+
+	kind = data[0];
+
+	if (kind == TCPOPT_EOL)
+		return -1;
+
+	if (kind == TCPOPT_NOP) {
+		*off += 1;
+		/* continue to the next option */
+		*hdr_bytes_remaining -= 1;
+
+		return 0;
+	}
+
+	if (*hdr_bytes_remaining < 2)
+		return -1;
+
+	hdr_len = data[1];
+	if (hdr_len > *hdr_bytes_remaining)
+		return -1;
+
+	if (kind == tcp_hdr_opt_kind_tpr) {
+		if (hdr_len != tcp_hdr_opt_len_tpr)
+			return -1;
+
+		memcpy(server_id, (__u32 *)(data + 2), sizeof(*server_id));
+		return 1;
+	}
+
+	*off += hdr_len;
+	*hdr_bytes_remaining -= hdr_len;
+
+	return 0;
+}
+
+SEC("xdp")
+int xdp_ingress_v6(struct xdp_md *xdp)
+{
+	__u8 hdr_bytes_remaining;
+	struct tcphdr *tcp_hdr;
+	__u8 tcp_hdr_opt_len;
+	int err = 0;
+	__u32 off;
+
+	struct bpf_dynptr ptr;
+
+	bpf_dynptr_from_xdp(xdp, 0, &ptr);
+
+	off = sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
+
+	tcp_hdr = bpf_dynptr_data(&ptr, off, sizeof(*tcp_hdr));
+	if (!tcp_hdr)
+		return XDP_DROP;
+
+	tcp_hdr_opt_len = (tcp_hdr->doff * 4) - sizeof(struct tcphdr);
+	if (tcp_hdr_opt_len < tcp_hdr_opt_len_tpr)
+		return XDP_DROP;
+
+	hdr_bytes_remaining = tcp_hdr_opt_len;
+
+	off += sizeof(struct tcphdr);
+
+	/* max number of bytes of options in tcp header is 40 bytes */
+	for (int i = 0; i < tcp_hdr_opt_max_opt_checks; i++) {
+		err = parse_hdr_opt(&ptr, &off, &hdr_bytes_remaining, &server_id);
+
+		if (err || !hdr_bytes_remaining)
+			break;
+	}
+
+	if (!server_id)
+		return XDP_DROP;
+
+	return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
index c8bc0c6947aa..1fef7868ea8b 100644
--- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
@@ -230,21 +230,18 @@  static __noinline bool get_packet_dst(struct real_definition **real,
 	return true;
 }
 
-static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
+static __noinline int parse_icmpv6(struct bpf_dynptr *skb_ptr, __u64 off,
 				   struct packet_description *pckt)
 {
 	struct icmp6hdr *icmp_hdr;
 	struct ipv6hdr *ip6h;
 
-	icmp_hdr = data + off;
-	if (icmp_hdr + 1 > data_end)
+	icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*ip6h));
+	if (!icmp_hdr)
 		return TC_ACT_SHOT;
 	if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG)
 		return TC_ACT_OK;
-	off += sizeof(struct icmp6hdr);
-	ip6h = data + off;
-	if (ip6h + 1 > data_end)
-		return TC_ACT_SHOT;
+	ip6h = (struct ipv6hdr *)(icmp_hdr + 1);
 	pckt->proto = ip6h->nexthdr;
 	pckt->flags |= F_ICMP;
 	memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16);
@@ -252,22 +249,19 @@  static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
 	return TC_ACT_UNSPEC;
 }
 
-static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
+static __noinline int parse_icmp(struct bpf_dynptr *skb_ptr, __u64 off,
 				 struct packet_description *pckt)
 {
 	struct icmphdr *icmp_hdr;
 	struct iphdr *iph;
 
-	icmp_hdr = data + off;
-	if (icmp_hdr + 1 > data_end)
+	icmp_hdr = bpf_dynptr_data(skb_ptr, off, sizeof(*icmp_hdr) + sizeof(*iph));
+	if (!icmp_hdr)
 		return TC_ACT_SHOT;
 	if (icmp_hdr->type != ICMP_DEST_UNREACH ||
 	    icmp_hdr->code != ICMP_FRAG_NEEDED)
 		return TC_ACT_OK;
-	off += sizeof(struct icmphdr);
-	iph = data + off;
-	if (iph + 1 > data_end)
-		return TC_ACT_SHOT;
+	iph = (struct iphdr *)(icmp_hdr + 1);
 	if (iph->ihl != 5)
 		return TC_ACT_SHOT;
 	pckt->proto = iph->protocol;
@@ -277,13 +271,13 @@  static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
 	return TC_ACT_UNSPEC;
 }
 
-static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
+static __noinline bool parse_udp(struct bpf_dynptr *skb_ptr, __u64 off,
 				 struct packet_description *pckt)
 {
 	struct udphdr *udp;
-	udp = data + off;
 
-	if (udp + 1 > data_end)
+	udp = bpf_dynptr_data(skb_ptr, off, sizeof(*udp));
+	if (!udp)
 		return false;
 
 	if (!(pckt->flags & F_ICMP)) {
@@ -296,13 +290,13 @@  static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
 	return true;
 }
 
-static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
+static __noinline bool parse_tcp(struct bpf_dynptr *skb_ptr, __u64 off,
 				 struct packet_description *pckt)
 {
 	struct tcphdr *tcp;
 
-	tcp = data + off;
-	if (tcp + 1 > data_end)
+	tcp = bpf_dynptr_data(skb_ptr, off, sizeof(*tcp));
+	if (!tcp)
 		return false;
 
 	if (tcp->syn)
@@ -318,12 +312,11 @@  static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
 	return true;
 }
 
-static __noinline int process_packet(void *data, __u64 off, void *data_end,
+static __noinline int process_packet(struct bpf_dynptr *skb_ptr,
+				     struct eth_hdr *eth, __u64 off,
 				     bool is_ipv6, struct __sk_buff *skb)
 {
-	void *pkt_start = (void *)(long)skb->data;
 	struct packet_description pckt = {};
-	struct eth_hdr *eth = pkt_start;
 	struct bpf_tunnel_key tkey = {};
 	struct vip_stats *data_stats;
 	struct real_definition *dst;
@@ -344,8 +337,8 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 
 	tkey.tunnel_ttl = 64;
 	if (is_ipv6) {
-		ip6h = data + off;
-		if (ip6h + 1 > data_end)
+		ip6h = bpf_dynptr_data(skb_ptr, off, sizeof(*ip6h));
+		if (!ip6h)
 			return TC_ACT_SHOT;
 
 		iph_len = sizeof(struct ipv6hdr);
@@ -356,7 +349,7 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 		if (protocol == IPPROTO_FRAGMENT) {
 			return TC_ACT_SHOT;
 		} else if (protocol == IPPROTO_ICMPV6) {
-			action = parse_icmpv6(data, data_end, off, &pckt);
+			action = parse_icmpv6(skb_ptr, off, &pckt);
 			if (action >= 0)
 				return action;
 			off += IPV6_PLUS_ICMP_HDR;
@@ -365,10 +358,8 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 			memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
 		}
 	} else {
-		iph = data + off;
-		if (iph + 1 > data_end)
-			return TC_ACT_SHOT;
-		if (iph->ihl != 5)
+		iph = bpf_dynptr_data(skb_ptr, off, sizeof(*iph));
+		if (!iph || iph->ihl != 5)
 			return TC_ACT_SHOT;
 
 		protocol = iph->protocol;
@@ -379,7 +370,7 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 		if (iph->frag_off & PCKT_FRAGMENTED)
 			return TC_ACT_SHOT;
 		if (protocol == IPPROTO_ICMP) {
-			action = parse_icmp(data, data_end, off, &pckt);
+			action = parse_icmp(skb_ptr, off, &pckt);
 			if (action >= 0)
 				return action;
 			off += IPV4_PLUS_ICMP_HDR;
@@ -391,10 +382,10 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 	protocol = pckt.proto;
 
 	if (protocol == IPPROTO_TCP) {
-		if (!parse_tcp(data, off, data_end, &pckt))
+		if (!parse_tcp(skb_ptr, off, &pckt))
 			return TC_ACT_SHOT;
 	} else if (protocol == IPPROTO_UDP) {
-		if (!parse_udp(data, off, data_end, &pckt))
+		if (!parse_udp(skb_ptr, off, &pckt))
 			return TC_ACT_SHOT;
 	} else {
 		return TC_ACT_SHOT;
@@ -450,20 +441,22 @@  static __noinline int process_packet(void *data, __u64 off, void *data_end,
 SEC("tc")
 int balancer_ingress(struct __sk_buff *ctx)
 {
-	void *data_end = (void *)(long)ctx->data_end;
-	void *data = (void *)(long)ctx->data;
-	struct eth_hdr *eth = data;
+	struct bpf_dynptr ptr;
+	struct eth_hdr *eth;
 	__u32 eth_proto;
 	__u32 nh_off;
 
 	nh_off = sizeof(struct eth_hdr);
-	if (data + nh_off > data_end)
+
+	bpf_dynptr_from_skb(ctx, 0, &ptr);
+	eth = bpf_dynptr_data(&ptr, 0, sizeof(*eth));
+	if (!eth)
 		return TC_ACT_SHOT;
 	eth_proto = eth->eth_proto;
 	if (eth_proto == bpf_htons(ETH_P_IP))
-		return process_packet(data, nh_off, data_end, false, ctx);
+		return process_packet(&ptr, eth, nh_off, false, ctx);
 	else if (eth_proto == bpf_htons(ETH_P_IPV6))
-		return process_packet(data, nh_off, data_end, true, ctx);
+		return process_packet(&ptr, eth, nh_off, true, ctx);
 	else
 		return TC_ACT_SHOT;
 }
diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c
index d7a9a74b7245..2272b56a8777 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp.c
@@ -20,6 +20,12 @@ 
 #include <bpf/bpf_endian.h>
 #include "test_iptunnel_common.h"
 
+const size_t tcphdr_sz = sizeof(struct tcphdr);
+const size_t udphdr_sz = sizeof(struct udphdr);
+const size_t ethhdr_sz = sizeof(struct ethhdr);
+const size_t iphdr_sz = sizeof(struct iphdr);
+const size_t ipv6hdr_sz = sizeof(struct ipv6hdr);
+
 struct {
 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
 	__uint(max_entries, 256);
@@ -43,8 +49,7 @@  static __always_inline void count_tx(__u32 protocol)
 		*rxcnt_count += 1;
 }
 
-static __always_inline int get_dport(void *trans_data, void *data_end,
-				     __u8 protocol)
+static __always_inline int get_dport(void *trans_data, __u8 protocol)
 {
 	struct tcphdr *th;
 	struct udphdr *uh;
@@ -52,13 +57,9 @@  static __always_inline int get_dport(void *trans_data, void *data_end,
 	switch (protocol) {
 	case IPPROTO_TCP:
 		th = (struct tcphdr *)trans_data;
-		if (th + 1 > data_end)
-			return -1;
 		return th->dest;
 	case IPPROTO_UDP:
 		uh = (struct udphdr *)trans_data;
-		if (uh + 1 > data_end)
-			return -1;
 		return uh->dest;
 	default:
 		return 0;
@@ -75,14 +76,13 @@  static __always_inline void set_ethhdr(struct ethhdr *new_eth,
 	new_eth->h_proto = h_proto;
 }
 
-static __always_inline int handle_ipv4(struct xdp_md *xdp)
+static __always_inline int handle_ipv4(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
 {
-	void *data_end = (void *)(long)xdp->data_end;
-	void *data = (void *)(long)xdp->data;
+	struct bpf_dynptr new_xdp_ptr;
 	struct iptnl_info *tnl;
 	struct ethhdr *new_eth;
 	struct ethhdr *old_eth;
-	struct iphdr *iph = data + sizeof(struct ethhdr);
+	struct iphdr *iph;
 	__u16 *next_iph;
 	__u16 payload_len;
 	struct vip vip = {};
@@ -90,10 +90,12 @@  static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	__u32 csum = 0;
 	int i;
 
-	if (iph + 1 > data_end)
+	iph = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
+			      iphdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
+	if (!iph)
 		return XDP_DROP;
 
-	dport = get_dport(iph + 1, data_end, iph->protocol);
+	dport = get_dport(iph + 1, iph->protocol);
 	if (dport == -1)
 		return XDP_DROP;
 
@@ -108,37 +110,33 @@  static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	if (!tnl || tnl->family != AF_INET)
 		return XDP_PASS;
 
-	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)iphdr_sz))
 		return XDP_DROP;
 
-	data = (void *)(long)xdp->data;
-	data_end = (void *)(long)xdp->data_end;
-
-	new_eth = data;
-	iph = data + sizeof(*new_eth);
-	old_eth = data + sizeof(*iph);
-
-	if (new_eth + 1 > data_end ||
-	    old_eth + 1 > data_end ||
-	    iph + 1 > data_end)
+	bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
+	new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + iphdr_sz + ethhdr_sz);
+	if (!new_eth)
 		return XDP_DROP;
 
+	iph = (struct iphdr *)(new_eth + 1);
+	old_eth = (struct ethhdr *)(iph + 1);
+
 	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
 
 	iph->version = 4;
-	iph->ihl = sizeof(*iph) >> 2;
+	iph->ihl = iphdr_sz >> 2;
 	iph->frag_off =	0;
 	iph->protocol = IPPROTO_IPIP;
 	iph->check = 0;
 	iph->tos = 0;
-	iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
+	iph->tot_len = bpf_htons(payload_len + iphdr_sz);
 	iph->daddr = tnl->daddr.v4;
 	iph->saddr = tnl->saddr.v4;
 	iph->ttl = 8;
 
 	next_iph = (__u16 *)iph;
 #pragma clang loop unroll(full)
-	for (i = 0; i < sizeof(*iph) >> 1; i++)
+	for (i = 0; i < iphdr_sz >> 1; i++)
 		csum += *next_iph++;
 
 	iph->check = ~((csum & 0xffff) + (csum >> 16));
@@ -148,22 +146,23 @@  static __always_inline int handle_ipv4(struct xdp_md *xdp)
 	return XDP_TX;
 }
 
-static __always_inline int handle_ipv6(struct xdp_md *xdp)
+static __always_inline int handle_ipv6(struct xdp_md *xdp, struct bpf_dynptr *xdp_ptr)
 {
-	void *data_end = (void *)(long)xdp->data_end;
-	void *data = (void *)(long)xdp->data;
+	struct bpf_dynptr new_xdp_ptr;
 	struct iptnl_info *tnl;
 	struct ethhdr *new_eth;
 	struct ethhdr *old_eth;
-	struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+	struct ipv6hdr *ip6h;
 	__u16 payload_len;
 	struct vip vip = {};
 	int dport;
 
-	if (ip6h + 1 > data_end)
+	ip6h = bpf_dynptr_data(xdp_ptr, ethhdr_sz,
+			       ipv6hdr_sz + (tcphdr_sz > udphdr_sz ? tcphdr_sz : udphdr_sz));
+	if (!ip6h)
 		return XDP_DROP;
 
-	dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+	dport = get_dport(ip6h + 1, ip6h->nexthdr);
 	if (dport == -1)
 		return XDP_DROP;
 
@@ -178,26 +177,23 @@  static __always_inline int handle_ipv6(struct xdp_md *xdp)
 	if (!tnl || tnl->family != AF_INET6)
 		return XDP_PASS;
 
-	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)ipv6hdr_sz))
 		return XDP_DROP;
 
-	data = (void *)(long)xdp->data;
-	data_end = (void *)(long)xdp->data_end;
-
-	new_eth = data;
-	ip6h = data + sizeof(*new_eth);
-	old_eth = data + sizeof(*ip6h);
-
-	if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
-	    ip6h + 1 > data_end)
+	bpf_dynptr_from_xdp(xdp, 0, &new_xdp_ptr);
+	new_eth = bpf_dynptr_data(&new_xdp_ptr, 0, ethhdr_sz + ipv6hdr_sz + ethhdr_sz);
+	if (!new_eth)
 		return XDP_DROP;
 
+	ip6h = (struct ipv6hdr *)(new_eth + 1);
+	old_eth = (struct ethhdr *)(ip6h + 1);
+
 	set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
 
 	ip6h->version = 6;
 	ip6h->priority = 0;
 	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
-	ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
+	ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + ipv6hdr_sz);
 	ip6h->nexthdr = IPPROTO_IPV6;
 	ip6h->hop_limit = 8;
 	memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
@@ -211,21 +207,22 @@  static __always_inline int handle_ipv6(struct xdp_md *xdp)
 SEC("xdp")
 int _xdp_tx_iptunnel(struct xdp_md *xdp)
 {
-	void *data_end = (void *)(long)xdp->data_end;
-	void *data = (void *)(long)xdp->data;
-	struct ethhdr *eth = data;
+	struct bpf_dynptr ptr;
+	struct ethhdr *eth;
 	__u16 h_proto;
 
-	if (eth + 1 > data_end)
+	bpf_dynptr_from_xdp(xdp, 0, &ptr);
+	eth = bpf_dynptr_data(&ptr, 0, ethhdr_sz);
+	if (!eth)
 		return XDP_DROP;
 
 	h_proto = eth->h_proto;
 
 	if (h_proto == bpf_htons(ETH_P_IP))
-		return handle_ipv4(xdp);
+		return handle_ipv4(xdp, &ptr);
 	else if (h_proto == bpf_htons(ETH_P_IPV6))
 
-		return handle_ipv6(xdp);
+		return handle_ipv6(xdp, &ptr);
 	else
 		return XDP_DROP;
 }
diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
index 6118e3ab61fc..56c9f8a3ad3d 100644
--- a/tools/testing/selftests/bpf/test_tcp_hdr_options.h
+++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
@@ -50,6 +50,7 @@  struct linum_err {
 
 #define TCPOPT_EOL		0
 #define TCPOPT_NOP		1
+#define TCPOPT_MSS		2
 #define TCPOPT_WINDOW		3
 #define TCPOPT_EXP		254