diff mbox series

[bpf-next,v3,2/3] libbpf: add low level TC-BPF API

Message ID 20210420193740.124285-3-memxor@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Add TC-BPF API | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch fail ERROR: code indent should use tabs where possible ERROR: space prohibited before that ':' (ctx:WxV)
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Kumar Kartikeya Dwivedi April 20, 2021, 7:37 p.m. UTC
This adds functions that wrap the netlink API used for adding,
manipulating, and removing traffic control filters. These functions
operate directly on the loaded prog's fd, and return a handle to the
filter using an out parameter named id.

The basic featureset is covered to allow for attaching and removal of
filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
the API have been omitted. These can added on top later by extending the
bpf_tc_opts struct.

Support for binding actions directly to a classifier by passing them in
during filter creation has also been omitted for now. These actions have
an auto clean up property because their lifetime is bound to the filter
they are attached to. This can be added later, but was omitted for now
as direct action mode is a better alternative to it, which is enabled by
default.

An API summary:

bpf_tc_attach may be used to attach, and replace SCHED_CLS bpf
classifier. The protocol is always set as ETH_P_ALL. The replace option
in bpf_tc_opts is used to control replacement behavior.  Attachment
fails if filter with existing attributes already exists.

bpf_tc_detach may be used to detach existing SCHED_CLS filter. The
bpf_tc_attach_id object filled in during attach must be passed in to the
detach functions for them to remove the filter and its attached
classififer correctly.

bpf_tc_get_info is a helper that can be used to obtain attributes
for the filter and classififer.

Examples:

	struct bpf_tc_attach_id id = {};
	struct bpf_object *obj;
	struct bpf_program *p;
	int fd, r;

	obj = bpf_object_open("foo.o");
	if (IS_ERR_OR_NULL(obj))
		return PTR_ERR(obj);

	p = bpf_object__find_program_by_title(obj, "classifier");
	if (IS_ERR_OR_NULL(p))
		return PTR_ERR(p);

	if (bpf_object__load(obj) < 0)
		return -1;

	fd = bpf_program__fd(p);

	r = bpf_tc_attach(fd, if_nametoindex("lo"),
			  BPF_TC_CLSACT_INGRESS,
			  NULL, &id);
	if (r < 0)
		return r;

... which is roughly equivalent to:
  # tc qdisc add dev lo clsact
  # tc filter add dev lo ingress bpf obj foo.o sec classifier da

... as direct action mode is always enabled.

To replace an existing filter:

	DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = id.handle,
			    .priority = id.priority, .replace = true);
	r = bpf_tc_attach(fd, if_nametoindex("lo"),
			  BPF_TC_CLSACT_INGRESS,
			  &opts, &id);
	if (r < 0)
		return r;

To obtain info of a particular filter, the example above can be extended
as follows:

	struct bpf_tc_info info = {};

	r = bpf_tc_get_info(if_nametoindex("lo"),
			    BPF_TC_CLSACT_INGRESS,
			    &id, &info);
	if (r < 0)
		return r;

... where id corresponds to the bpf_tc_attach_id filled in during an
attach operation.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 tools/lib/bpf/libbpf.h   |  44 ++++++
 tools/lib/bpf/libbpf.map |   3 +
 tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 360 insertions(+), 6 deletions(-)

Comments

Yonghong Song April 21, 2021, 6:58 a.m. UTC | #1
On 4/20/21 12:37 PM, Kumar Kartikeya Dwivedi wrote:
> This adds functions that wrap the netlink API used for adding,
> manipulating, and removing traffic control filters. These functions
> operate directly on the loaded prog's fd, and return a handle to the
> filter using an out parameter named id.
> 
> The basic featureset is covered to allow for attaching and removal of
> filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
> the API have been omitted. These can added on top later by extending the

"later" => "layer"?

> bpf_tc_opts struct.
> 
> Support for binding actions directly to a classifier by passing them in
> during filter creation has also been omitted for now. These actions have
> an auto clean up property because their lifetime is bound to the filter
> they are attached to. This can be added later, but was omitted for now
> as direct action mode is a better alternative to it, which is enabled by
> default.
> 
> An API summary:
> 
> bpf_tc_attach may be used to attach, and replace SCHED_CLS bpf
> classifier. The protocol is always set as ETH_P_ALL. The replace option
> in bpf_tc_opts is used to control replacement behavior.  Attachment
> fails if filter with existing attributes already exists.
> 
> bpf_tc_detach may be used to detach existing SCHED_CLS filter. The
> bpf_tc_attach_id object filled in during attach must be passed in to the
> detach functions for them to remove the filter and its attached
> classififer correctly.
> 
> bpf_tc_get_info is a helper that can be used to obtain attributes
> for the filter and classififer.
> 
> Examples:
> 
> 	struct bpf_tc_attach_id id = {};
> 	struct bpf_object *obj;
> 	struct bpf_program *p;
> 	int fd, r;
> 
> 	obj = bpf_object_open("foo.o");
> 	if (IS_ERR_OR_NULL(obj))
> 		return PTR_ERR(obj);
> 
> 	p = bpf_object__find_program_by_title(obj, "classifier");

Please use bpf_object__find_program_by_name() API.
bpf_object__find_program_by_title() is not recommended as now
libbpf supports multiple programs within the same section.

> 	if (IS_ERR_OR_NULL(p))
> 		return PTR_ERR(p);
> 
> 	if (bpf_object__load(obj) < 0)
> 		return -1;
> 
> 	fd = bpf_program__fd(p);
> 
> 	r = bpf_tc_attach(fd, if_nametoindex("lo"),
> 			  BPF_TC_CLSACT_INGRESS,
> 			  NULL, &id);
> 	if (r < 0)
> 		return r;
> 
> ... which is roughly equivalent to:
>    # tc qdisc add dev lo clsact
>    # tc filter add dev lo ingress bpf obj foo.o sec classifier da
> 
> ... as direct action mode is always enabled.
> 
> To replace an existing filter:
> 
> 	DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = id.handle,
> 			    .priority = id.priority, .replace = true);
> 	r = bpf_tc_attach(fd, if_nametoindex("lo"),
> 			  BPF_TC_CLSACT_INGRESS,
> 			  &opts, &id);
> 	if (r < 0)
> 		return r;
> 
> To obtain info of a particular filter, the example above can be extended
> as follows:
> 
> 	struct bpf_tc_info info = {};
> 
> 	r = bpf_tc_get_info(if_nametoindex("lo"),
> 			    BPF_TC_CLSACT_INGRESS,
> 			    &id, &info);
> 	if (r < 0)
> 		return r;
> 
> ... where id corresponds to the bpf_tc_attach_id filled in during an
> attach operation.
> 
> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>   tools/lib/bpf/libbpf.h   |  44 ++++++
>   tools/lib/bpf/libbpf.map |   3 +
>   tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>   3 files changed, 360 insertions(+), 6 deletions(-)
> 
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index bec4e6a6e31d..b4ed6a41ea70 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -16,6 +16,8 @@
>   #include <stdbool.h>
>   #include <sys/types.h>  // for size_t
>   #include <linux/bpf.h>
> +#include <linux/pkt_sched.h>
> +#include <linux/tc_act/tc_bpf.h>
>   
>   #include "libbpf_common.h"
>   
> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>   
> +/* Convenience macros for the clsact attach hooks */
> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
> +
> +struct bpf_tc_opts {
> +	size_t sz;
> +	__u32 handle;
> +	__u32 class_id;
> +	__u16 priority;
> +	bool replace;
> +	size_t :0;

Did you see any error without "size_t :0"?

> +};
> +
> +#define bpf_tc_opts__last_field replace
> +
> +/* Acts as a handle for an attached filter */
> +struct bpf_tc_attach_id {
> +	__u32 handle;
> +	__u16 priority;
> +};
> +
[...]
Kumar Kartikeya Dwivedi April 21, 2021, 5:06 p.m. UTC | #2
On Wed, Apr 21, 2021 at 12:28:01PM IST, Yonghong Song wrote:
>
>
> On 4/20/21 12:37 PM, Kumar Kartikeya Dwivedi wrote:
> > This adds functions that wrap the netlink API used for adding,
> > manipulating, and removing traffic control filters. These functions
> > operate directly on the loaded prog's fd, and return a handle to the
> > filter using an out parameter named id.
> >
> > The basic featureset is covered to allow for attaching and removal of
> > filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
> > the API have been omitted. These can added on top later by extending the
>
> "later" => "layer"?
>

No, I meant that other options can be added on top of this series by someone
later. I'll reword it.

> > bpf_tc_opts struct.
> >
> > Support for binding actions directly to a classifier by passing them in
> > during filter creation has also been omitted for now. These actions have
> > an auto clean up property because their lifetime is bound to the filter
> > they are attached to. This can be added later, but was omitted for now
> > as direct action mode is a better alternative to it, which is enabled by
> > default.
> >
> > An API summary:
> >
> > bpf_tc_attach may be used to attach, and replace SCHED_CLS bpf
> > classifier. The protocol is always set as ETH_P_ALL. The replace option
> > in bpf_tc_opts is used to control replacement behavior.  Attachment
> > fails if filter with existing attributes already exists.
> >
> > bpf_tc_detach may be used to detach existing SCHED_CLS filter. The
> > bpf_tc_attach_id object filled in during attach must be passed in to the
> > detach functions for them to remove the filter and its attached
> > classififer correctly.
> >
> > bpf_tc_get_info is a helper that can be used to obtain attributes
> > for the filter and classififer.
> >
> > Examples:
> >
> > 	struct bpf_tc_attach_id id = {};
> > 	struct bpf_object *obj;
> > 	struct bpf_program *p;
> > 	int fd, r;
> >
> > 	obj = bpf_object_open("foo.o");
> > 	if (IS_ERR_OR_NULL(obj))
> > 		return PTR_ERR(obj);
> >
> > 	p = bpf_object__find_program_by_title(obj, "classifier");
>
> Please use bpf_object__find_program_by_name() API.
> bpf_object__find_program_by_title() is not recommended as now
> libbpf supports multiple programs within the same section.

Thanks, will do.

>
> > 	if (IS_ERR_OR_NULL(p))
> > 		return PTR_ERR(p);
> >
> > 	if (bpf_object__load(obj) < 0)
> > 		return -1;
> >
> > 	fd = bpf_program__fd(p);
> >
> > 	r = bpf_tc_attach(fd, if_nametoindex("lo"),
> > 			  BPF_TC_CLSACT_INGRESS,
> > 			  NULL, &id);
> > 	if (r < 0)
> > 		return r;
> >
> > ... which is roughly equivalent to:
> >    # tc qdisc add dev lo clsact
> >    # tc filter add dev lo ingress bpf obj foo.o sec classifier da
> >
> > ... as direct action mode is always enabled.
> >
> > To replace an existing filter:
> >
> > 	DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = id.handle,
> > 			    .priority = id.priority, .replace = true);
> > 	r = bpf_tc_attach(fd, if_nametoindex("lo"),
> > 			  BPF_TC_CLSACT_INGRESS,
> > 			  &opts, &id);
> > 	if (r < 0)
> > 		return r;
> >
> > To obtain info of a particular filter, the example above can be extended
> > as follows:
> >
> > 	struct bpf_tc_info info = {};
> >
> > 	r = bpf_tc_get_info(if_nametoindex("lo"),
> > 			    BPF_TC_CLSACT_INGRESS,
> > 			    &id, &info);
> > 	if (r < 0)
> > 		return r;
> >
> > ... where id corresponds to the bpf_tc_attach_id filled in during an
> > attach operation.
> >
> > Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
> > Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> > ---
> >   tools/lib/bpf/libbpf.h   |  44 ++++++
> >   tools/lib/bpf/libbpf.map |   3 +
> >   tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
> >   3 files changed, 360 insertions(+), 6 deletions(-)
> >
> > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> > index bec4e6a6e31d..b4ed6a41ea70 100644
> > --- a/tools/lib/bpf/libbpf.h
> > +++ b/tools/lib/bpf/libbpf.h
> > @@ -16,6 +16,8 @@
> >   #include <stdbool.h>
> >   #include <sys/types.h>  // for size_t
> >   #include <linux/bpf.h>
> > +#include <linux/pkt_sched.h>
> > +#include <linux/tc_act/tc_bpf.h>
> >   #include "libbpf_common.h"
> > @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
> >   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
> >   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
> > +/* Convenience macros for the clsact attach hooks */
> > +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> > +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
> > +
> > +struct bpf_tc_opts {
> > +	size_t sz;
> > +	__u32 handle;
> > +	__u32 class_id;
> > +	__u16 priority;
> > +	bool replace;
> > +	size_t :0;
>
> Did you see any error without "size_t :0"?
>

Not really, I just added this considering this recent change:
dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")

> > +};
> > +
> > +#define bpf_tc_opts__last_field replace
> > +
> > +/* Acts as a handle for an attached filter */
> > +struct bpf_tc_attach_id {
> > +	__u32 handle;
> > +	__u16 priority;
> > +};
> > +
> [...]

--
Kartikeya
Andrii Nakryiko April 21, 2021, 6:19 p.m. UTC | #3
On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> This adds functions that wrap the netlink API used for adding,
> manipulating, and removing traffic control filters. These functions
> operate directly on the loaded prog's fd, and return a handle to the
> filter using an out parameter named id.
>
> The basic featureset is covered to allow for attaching and removal of
> filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
> the API have been omitted. These can added on top later by extending the
> bpf_tc_opts struct.
>
> Support for binding actions directly to a classifier by passing them in
> during filter creation has also been omitted for now. These actions have
> an auto clean up property because their lifetime is bound to the filter
> they are attached to. This can be added later, but was omitted for now
> as direct action mode is a better alternative to it, which is enabled by
> default.
>
> An API summary:
>
> bpf_tc_attach may be used to attach, and replace SCHED_CLS bpf
> classifier. The protocol is always set as ETH_P_ALL. The replace option
> in bpf_tc_opts is used to control replacement behavior.  Attachment
> fails if filter with existing attributes already exists.
>
> bpf_tc_detach may be used to detach existing SCHED_CLS filter. The
> bpf_tc_attach_id object filled in during attach must be passed in to the
> detach functions for them to remove the filter and its attached
> classififer correctly.
>
> bpf_tc_get_info is a helper that can be used to obtain attributes
> for the filter and classififer.
>
> Examples:
>
>         struct bpf_tc_attach_id id = {};
>         struct bpf_object *obj;
>         struct bpf_program *p;
>         int fd, r;
>
>         obj = bpf_object_open("foo.o");
>         if (IS_ERR_OR_NULL(obj))
>                 return PTR_ERR(obj);
>
>         p = bpf_object__find_program_by_title(obj, "classifier");
>         if (IS_ERR_OR_NULL(p))
>                 return PTR_ERR(p);
>
>         if (bpf_object__load(obj) < 0)
>                 return -1;
>
>         fd = bpf_program__fd(p);
>
>         r = bpf_tc_attach(fd, if_nametoindex("lo"),
>                           BPF_TC_CLSACT_INGRESS,
>                           NULL, &id);
>         if (r < 0)
>                 return r;
>
> ... which is roughly equivalent to:
>   # tc qdisc add dev lo clsact
>   # tc filter add dev lo ingress bpf obj foo.o sec classifier da
>
> ... as direct action mode is always enabled.
>
> To replace an existing filter:
>
>         DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = id.handle,
>                             .priority = id.priority, .replace = true);
>         r = bpf_tc_attach(fd, if_nametoindex("lo"),
>                           BPF_TC_CLSACT_INGRESS,
>                           &opts, &id);
>         if (r < 0)
>                 return r;
>
> To obtain info of a particular filter, the example above can be extended
> as follows:
>
>         struct bpf_tc_info info = {};
>
>         r = bpf_tc_get_info(if_nametoindex("lo"),
>                             BPF_TC_CLSACT_INGRESS,
>                             &id, &info);
>         if (r < 0)
>                 return r;
>
> ... where id corresponds to the bpf_tc_attach_id filled in during an
> attach operation.
>
> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>  tools/lib/bpf/libbpf.h   |  44 ++++++
>  tools/lib/bpf/libbpf.map |   3 +
>  tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 360 insertions(+), 6 deletions(-)
>
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index bec4e6a6e31d..b4ed6a41ea70 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -16,6 +16,8 @@
>  #include <stdbool.h>
>  #include <sys/types.h>  // for size_t
>  #include <linux/bpf.h>
> +#include <linux/pkt_sched.h>
> +#include <linux/tc_act/tc_bpf.h>

apart from those unused macros below, are these needed in public API header?

>
>  #include "libbpf_common.h"
>
> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>  LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>  LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>
> +/* Convenience macros for the clsact attach hooks */
> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)

these seem to be used only internally, why exposing them in public API?

> +
> +struct bpf_tc_opts {
> +       size_t sz;
> +       __u32 handle;
> +       __u32 class_id;
> +       __u16 priority;
> +       bool replace;
> +       size_t :0;
> +};
> +
> +#define bpf_tc_opts__last_field replace
> +
> +/* Acts as a handle for an attached filter */
> +struct bpf_tc_attach_id {
> +       __u32 handle;
> +       __u16 priority;
> +};

what are the chances that we'll need to grow this id struct? If that
happens, how do we do that in a backward/forward compatible manner?

if handle/prio are the only two ever necessary, we can actually use
bpf_tc_opts to return them back to user (we do that with
bpf_test_run_opts API). And then adjust detach/get_info methods to let
pass those values.

The whole idea of a struct for id just screams "compatibility problems
down the road" at me. Does anyone else has any other opinion on this?

> +
> +struct bpf_tc_info {
> +       struct bpf_tc_attach_id id;
> +       __u16 protocol;
> +       __u32 chain_index;
> +       __u32 prog_id;
> +       __u8 tag[BPF_TAG_SIZE];
> +       __u32 class_id;
> +       __u32 bpf_flags;
> +       __u32 bpf_flags_gen;
> +};
> +
> +/* id is out parameter that will be written to, it must not be NULL */
> +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,

so parent_id is INGRESS|EGRESS, right? Is that an obvious name for
this parameter? I had to look at the code to understand what's
expected. Is it possible that it will be anything other than INGRESS
or EGRESS? If not `bool ingress` might be an option. Or perhaps enum
bpf_tc_direction { BPF_TC_INGRESS, BPF_TC_EGRESS } is better still.

> +                            const struct bpf_tc_opts *opts,
> +                            struct bpf_tc_attach_id *id);
> +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
> +                            const struct bpf_tc_attach_id *id);
> +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,

bpf_tc_query() to be more in line with attach/detach single-word verbs?

> +                              const struct bpf_tc_attach_id *id,
> +                              struct bpf_tc_info *info);
> +
>  #ifdef __cplusplus
>  } /* extern "C" */
>  #endif
> diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
> index b9b29baf1df8..686444fbb838 100644
> --- a/tools/lib/bpf/libbpf.map
> +++ b/tools/lib/bpf/libbpf.map
> @@ -361,4 +361,7 @@ LIBBPF_0.4.0 {
>                 bpf_linker__new;
>                 bpf_map__inner_map;
>                 bpf_object__set_kversion;
> +               bpf_tc_attach;
> +               bpf_tc_detach;
> +               bpf_tc_get_info;
>  } LIBBPF_0.3.0;

[...]
Toke Høiland-Jørgensen April 21, 2021, 7:48 p.m. UTC | #4
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
> <memxor@gmail.com> wrote:
>>
>> This adds functions that wrap the netlink API used for adding,
>> manipulating, and removing traffic control filters. These functions
>> operate directly on the loaded prog's fd, and return a handle to the
>> filter using an out parameter named id.
>>
>> The basic featureset is covered to allow for attaching and removal of
>> filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
>> the API have been omitted. These can added on top later by extending the
>> bpf_tc_opts struct.
>>
>> Support for binding actions directly to a classifier by passing them in
>> during filter creation has also been omitted for now. These actions have
>> an auto clean up property because their lifetime is bound to the filter
>> they are attached to. This can be added later, but was omitted for now
>> as direct action mode is a better alternative to it, which is enabled by
>> default.
>>
>> An API summary:
>>
>> bpf_tc_attach may be used to attach, and replace SCHED_CLS bpf
>> classifier. The protocol is always set as ETH_P_ALL. The replace option
>> in bpf_tc_opts is used to control replacement behavior.  Attachment
>> fails if filter with existing attributes already exists.
>>
>> bpf_tc_detach may be used to detach existing SCHED_CLS filter. The
>> bpf_tc_attach_id object filled in during attach must be passed in to the
>> detach functions for them to remove the filter and its attached
>> classififer correctly.
>>
>> bpf_tc_get_info is a helper that can be used to obtain attributes
>> for the filter and classififer.
>>
>> Examples:
>>
>>         struct bpf_tc_attach_id id = {};
>>         struct bpf_object *obj;
>>         struct bpf_program *p;
>>         int fd, r;
>>
>>         obj = bpf_object_open("foo.o");
>>         if (IS_ERR_OR_NULL(obj))
>>                 return PTR_ERR(obj);
>>
>>         p = bpf_object__find_program_by_title(obj, "classifier");
>>         if (IS_ERR_OR_NULL(p))
>>                 return PTR_ERR(p);
>>
>>         if (bpf_object__load(obj) < 0)
>>                 return -1;
>>
>>         fd = bpf_program__fd(p);
>>
>>         r = bpf_tc_attach(fd, if_nametoindex("lo"),
>>                           BPF_TC_CLSACT_INGRESS,
>>                           NULL, &id);
>>         if (r < 0)
>>                 return r;
>>
>> ... which is roughly equivalent to:
>>   # tc qdisc add dev lo clsact
>>   # tc filter add dev lo ingress bpf obj foo.o sec classifier da
>>
>> ... as direct action mode is always enabled.
>>
>> To replace an existing filter:
>>
>>         DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, .handle = id.handle,
>>                             .priority = id.priority, .replace = true);
>>         r = bpf_tc_attach(fd, if_nametoindex("lo"),
>>                           BPF_TC_CLSACT_INGRESS,
>>                           &opts, &id);
>>         if (r < 0)
>>                 return r;
>>
>> To obtain info of a particular filter, the example above can be extended
>> as follows:
>>
>>         struct bpf_tc_info info = {};
>>
>>         r = bpf_tc_get_info(if_nametoindex("lo"),
>>                             BPF_TC_CLSACT_INGRESS,
>>                             &id, &info);
>>         if (r < 0)
>>                 return r;
>>
>> ... where id corresponds to the bpf_tc_attach_id filled in during an
>> attach operation.
>>
>> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
>> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>> ---
>>  tools/lib/bpf/libbpf.h   |  44 ++++++
>>  tools/lib/bpf/libbpf.map |   3 +
>>  tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 360 insertions(+), 6 deletions(-)
>>
>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>> index bec4e6a6e31d..b4ed6a41ea70 100644
>> --- a/tools/lib/bpf/libbpf.h
>> +++ b/tools/lib/bpf/libbpf.h
>> @@ -16,6 +16,8 @@
>>  #include <stdbool.h>
>>  #include <sys/types.h>  // for size_t
>>  #include <linux/bpf.h>
>> +#include <linux/pkt_sched.h>
>> +#include <linux/tc_act/tc_bpf.h>
>
> apart from those unused macros below, are these needed in public API header?
>
>>
>>  #include "libbpf_common.h"
>>
>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>  LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>  LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>
>> +/* Convenience macros for the clsact attach hooks */
>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>
> these seem to be used only internally, why exposing them in public
> API?

No they're "aliases" for when you want to attach the filter directly to
the interface (and thus install the clsact qdisc as the root). You can
also use the filter with an existing qdisc (most commonly HTB), in which
case you need to specify the qdisc handle as the root. We have a few
examples of this use case:

https://github.com/xdp-project/bpf-examples/tree/master/traffic-pacing-edt
and
https://github.com/xdp-project/xdp-cpumap-tc

>> +struct bpf_tc_opts {
>> +       size_t sz;
>> +       __u32 handle;
>> +       __u32 class_id;
>> +       __u16 priority;
>> +       bool replace;
>> +       size_t :0;
>> +};
>> +
>> +#define bpf_tc_opts__last_field replace
>> +
>> +/* Acts as a handle for an attached filter */
>> +struct bpf_tc_attach_id {
>> +       __u32 handle;
>> +       __u16 priority;
>> +};
>
> what are the chances that we'll need to grow this id struct? If that
> happens, how do we do that in a backward/forward compatible manner?
>
> if handle/prio are the only two ever necessary, we can actually use
> bpf_tc_opts to return them back to user (we do that with
> bpf_test_run_opts API). And then adjust detach/get_info methods to let
> pass those values.
>
> The whole idea of a struct for id just screams "compatibility problems
> down the road" at me. Does anyone else has any other opinion on this?

Well, *if* we ever want to extend them (e.g., to support other values of
the protocol field, if that ever becomes necessary), we'll probably also
want to make it possible to pass the same identifiers as options, so
just reusing the opts struct definitely makes sense!

>> +struct bpf_tc_info {
>> +       struct bpf_tc_attach_id id;
>> +       __u16 protocol;
>> +       __u32 chain_index;
>> +       __u32 prog_id;
>> +       __u8 tag[BPF_TAG_SIZE];
>> +       __u32 class_id;
>> +       __u32 bpf_flags;
>> +       __u32 bpf_flags_gen;
>> +};
>> +
>> +/* id is out parameter that will be written to, it must not be NULL */
>> +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
>
> so parent_id is INGRESS|EGRESS, right? Is that an obvious name for
> this parameter? I had to look at the code to understand what's
> expected. Is it possible that it will be anything other than INGRESS
> or EGRESS? If not `bool ingress` might be an option. Or perhaps enum
> bpf_tc_direction { BPF_TC_INGRESS, BPF_TC_EGRESS } is better still.

See above; the parent is the attach point, and you use the defines from
above if you just want to attach to the interface.

But maybe documenting this in a comment above the function signature
would be good (along with a bit of terminology from the TC world for
those coming from there) :)

>> +                            const struct bpf_tc_opts *opts,
>> +                            struct bpf_tc_attach_id *id);
>> +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
>> +                            const struct bpf_tc_attach_id *id);
>> +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
>
> bpf_tc_query() to be more in line with attach/detach single-word
> verbs?

OK by me!

-Toke
Daniel Borkmann April 21, 2021, 10:59 p.m. UTC | #5
On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
[...]
> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> index bec4e6a6e31d..b4ed6a41ea70 100644
> --- a/tools/lib/bpf/libbpf.h
> +++ b/tools/lib/bpf/libbpf.h
> @@ -16,6 +16,8 @@
>   #include <stdbool.h>
>   #include <sys/types.h>  // for size_t
>   #include <linux/bpf.h>
> +#include <linux/pkt_sched.h>
> +#include <linux/tc_act/tc_bpf.h>
>   
>   #include "libbpf_common.h"
>   
> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>   
> +/* Convenience macros for the clsact attach hooks */
> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)

I would abstract those away into an enum, plus avoid having to pull in
linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.

Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
concrete tc bits (TC_H_MAKE()) can be translated internally.

> +struct bpf_tc_opts {
> +	size_t sz;

Is this set anywhere?

> +	__u32 handle;
> +	__u32 class_id;

I'd remove class_id from here as well given in direct-action a BPF prog can
set it if needed.

> +	__u16 priority;
> +	bool replace;
> +	size_t :0;

What's the rationale for this padding?

> +};
> +
> +#define bpf_tc_opts__last_field replace
> +
> +/* Acts as a handle for an attached filter */
> +struct bpf_tc_attach_id {

nit: maybe bpf_tc_ctx

> +	__u32 handle;
> +	__u16 priority;
> +};
> +
> +struct bpf_tc_info {
> +	struct bpf_tc_attach_id id;
> +	__u16 protocol;
> +	__u32 chain_index;
> +	__u32 prog_id;
> +	__u8 tag[BPF_TAG_SIZE];
> +	__u32 class_id;
> +	__u32 bpf_flags;
> +	__u32 bpf_flags_gen;

Given we do not yet have any setters e.g. for offload, etc, the one thing
I'd see useful and crucial initially is prog_id.

The protocol, chain_index, and I would also include tag should be dropped.
Similarly class_id given my earlier statement, and flags I would extend once
this lib API would support offloading progs.

> +};
> +
> +/* id is out parameter that will be written to, it must not be NULL */
> +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
> +			     const struct bpf_tc_opts *opts,
> +			     struct bpf_tc_attach_id *id);
> +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
> +			     const struct bpf_tc_attach_id *id);
> +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
> +			       const struct bpf_tc_attach_id *id,
> +			       struct bpf_tc_info *info);

As per above, for parent_id I'd replace with dir enum.

> +
>   #ifdef __cplusplus
>   } /* extern "C" */
>   #endif
Kumar Kartikeya Dwivedi April 21, 2021, 11:08 p.m. UTC | #6
On Thu, Apr 22, 2021 at 04:29:28AM IST, Daniel Borkmann wrote:
> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
> [...]
> > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> > index bec4e6a6e31d..b4ed6a41ea70 100644
> > --- a/tools/lib/bpf/libbpf.h
> > +++ b/tools/lib/bpf/libbpf.h
> > @@ -16,6 +16,8 @@
> >   #include <stdbool.h>
> >   #include <sys/types.h>  // for size_t
> >   #include <linux/bpf.h>
> > +#include <linux/pkt_sched.h>
> > +#include <linux/tc_act/tc_bpf.h>
> >   #include "libbpf_common.h"
> > @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
> >   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
> >   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
> > +/* Convenience macros for the clsact attach hooks */
> > +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> > +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>
> I would abstract those away into an enum, plus avoid having to pull in
> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
>
> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
> concrete tc bits (TC_H_MAKE()) can be translated internally.
>

Ok, will do.

> > +struct bpf_tc_opts {
> > +	size_t sz;
>
> Is this set anywhere?
>

This is needed by the OPTS_* infrastructure.

> > +	__u32 handle;
> > +	__u32 class_id;
>
> I'd remove class_id from here as well given in direct-action a BPF prog can
> set it if needed.
>

Ok, makes sense.

> > +	__u16 priority;
> > +	bool replace;
> > +	size_t :0;
>
> What's the rationale for this padding?
>

dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")

> > +};
> > +
> > +#define bpf_tc_opts__last_field replace
> > +
> > +/* Acts as a handle for an attached filter */
> > +struct bpf_tc_attach_id {
>
> nit: maybe bpf_tc_ctx
>

Noted.

> > +	__u32 handle;
> > +	__u16 priority;
> > +};
> > +
> > +struct bpf_tc_info {
> > +	struct bpf_tc_attach_id id;
> > +	__u16 protocol;
> > +	__u32 chain_index;
> > +	__u32 prog_id;
> > +	__u8 tag[BPF_TAG_SIZE];
> > +	__u32 class_id;
> > +	__u32 bpf_flags;
> > +	__u32 bpf_flags_gen;
>
> Given we do not yet have any setters e.g. for offload, etc, the one thing
> I'd see useful and crucial initially is prog_id.
>
> The protocol, chain_index, and I would also include tag should be dropped.

A future user of this API needs to know the tag, so I would like to keep that.
The rest we can drop, and probably document the default values explicitly.

> Similarly class_id given my earlier statement, and flags I would extend once
> this lib API would support offloading progs.
>
> > +};
> > +
> > +/* id is out parameter that will be written to, it must not be NULL */
> > +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
> > +			     const struct bpf_tc_opts *opts,
> > +			     struct bpf_tc_attach_id *id);
> > +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
> > +			     const struct bpf_tc_attach_id *id);
> > +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
> > +			       const struct bpf_tc_attach_id *id,
> > +			       struct bpf_tc_info *info);
>
> As per above, for parent_id I'd replace with dir enum.
>
> > +
> >   #ifdef __cplusplus
> >   } /* extern "C" */
> >   #endif

--
Kartikeya
Daniel Borkmann April 21, 2021, 11:14 p.m. UTC | #7
On 4/21/21 9:48 PM, Toke Høiland-Jørgensen wrote:
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>> On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
>> <memxor@gmail.com> wrote:
[...]
>>> ---
>>>   tools/lib/bpf/libbpf.h   |  44 ++++++
>>>   tools/lib/bpf/libbpf.map |   3 +
>>>   tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>>   3 files changed, 360 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>> --- a/tools/lib/bpf/libbpf.h
>>> +++ b/tools/lib/bpf/libbpf.h
>>> @@ -16,6 +16,8 @@
>>>   #include <stdbool.h>
>>>   #include <sys/types.h>  // for size_t
>>>   #include <linux/bpf.h>
>>> +#include <linux/pkt_sched.h>
>>> +#include <linux/tc_act/tc_bpf.h>
>>
>> apart from those unused macros below, are these needed in public API header?
>>
>>>   #include "libbpf_common.h"
>>>
>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>
>>> +/* Convenience macros for the clsact attach hooks */
>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>
>> these seem to be used only internally, why exposing them in public
>> API?
> 
> No they're "aliases" for when you want to attach the filter directly to
> the interface (and thus install the clsact qdisc as the root). You can
> also use the filter with an existing qdisc (most commonly HTB), in which
> case you need to specify the qdisc handle as the root. We have a few
> examples of this use case:
> 
> https://github.com/xdp-project/bpf-examples/tree/master/traffic-pacing-edt
> and
> https://github.com/xdp-project/xdp-cpumap-tc

I'm a bit puzzled, could you elaborate on your use case on why you wouldn't
use the tc egress hook for those especially given it's guaranteed to run
outside of root qdisc lock?

Some pointers as well:

  - http://vger.kernel.org/lpc-bpf2018.html#session-1
  - https://netdevconf.info/0x14/session.html?talk-replacing-HTB-with-EDT-and-BPF
  - https://cilium.io/blog/2020/11/10/cilium-19#bwmanager

Thanks,
Daniel
Daniel Borkmann April 21, 2021, 11:21 p.m. UTC | #8
On 4/22/21 1:08 AM, Kumar Kartikeya Dwivedi wrote:
> On Thu, Apr 22, 2021 at 04:29:28AM IST, Daniel Borkmann wrote:
>> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
>> [...]
>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>> --- a/tools/lib/bpf/libbpf.h
>>> +++ b/tools/lib/bpf/libbpf.h
>>> @@ -16,6 +16,8 @@
>>>    #include <stdbool.h>
>>>    #include <sys/types.h>  // for size_t
>>>    #include <linux/bpf.h>
>>> +#include <linux/pkt_sched.h>
>>> +#include <linux/tc_act/tc_bpf.h>
>>>    #include "libbpf_common.h"
>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>> +/* Convenience macros for the clsact attach hooks */
>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>
>> I would abstract those away into an enum, plus avoid having to pull in
>> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
>>
>> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
>> concrete tc bits (TC_H_MAKE()) can be translated internally.
> 
> Ok, will do.
> 
>>> +struct bpf_tc_opts {
>>> +	size_t sz;
>>
>> Is this set anywhere?
> 
> This is needed by the OPTS_* infrastructure.
> 
>>> +	__u32 handle;
>>> +	__u32 class_id;
>>
>> I'd remove class_id from here as well given in direct-action a BPF prog can
>> set it if needed.
> 
> Ok, makes sense.
> 
>>> +	__u16 priority;
>>> +	bool replace;
>>> +	size_t :0;
>>
>> What's the rationale for this padding?
> 
> dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")

Hm, fair enough.

>>> +};
>>> +
>>> +#define bpf_tc_opts__last_field replace
>>> +
>>> +/* Acts as a handle for an attached filter */
>>> +struct bpf_tc_attach_id {
>>
>> nit: maybe bpf_tc_ctx
>>
> 
> Noted.
> 
>>> +	__u32 handle;
>>> +	__u16 priority;
>>> +};
>>> +
>>> +struct bpf_tc_info {
>>> +	struct bpf_tc_attach_id id;
>>> +	__u16 protocol;
>>> +	__u32 chain_index;
>>> +	__u32 prog_id;
>>> +	__u8 tag[BPF_TAG_SIZE];
>>> +	__u32 class_id;
>>> +	__u32 bpf_flags;
>>> +	__u32 bpf_flags_gen;
>>
>> Given we do not yet have any setters e.g. for offload, etc, the one thing
>> I'd see useful and crucial initially is prog_id.
>>
>> The protocol, chain_index, and I would also include tag should be dropped.
> 
> A future user of this API needs to know the tag, so I would like to keep that.
> The rest we can drop, and probably document the default values explicitly.

Couldn't this be added along with the future patch for the [future] user?

The tag should be the tag of the prog itself, so if you have prog_id, you
could also retrieve the same tag from the prog. The tag was basically from
the early days where we didn't have bpf_prog_get_info_by_fd().

What does that future user need to do different here?

Thanks,
Daniel
Kumar Kartikeya Dwivedi April 21, 2021, 11:30 p.m. UTC | #9
On Thu, Apr 22, 2021 at 04:51:55AM IST, Daniel Borkmann wrote:
> On 4/22/21 1:08 AM, Kumar Kartikeya Dwivedi wrote:
> > On Thu, Apr 22, 2021 at 04:29:28AM IST, Daniel Borkmann wrote:
> > > On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
> > > [...]
> > > > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> > > > index bec4e6a6e31d..b4ed6a41ea70 100644
> > > > --- a/tools/lib/bpf/libbpf.h
> > > > +++ b/tools/lib/bpf/libbpf.h
> > > > @@ -16,6 +16,8 @@
> > > >    #include <stdbool.h>
> > > >    #include <sys/types.h>  // for size_t
> > > >    #include <linux/bpf.h>
> > > > +#include <linux/pkt_sched.h>
> > > > +#include <linux/tc_act/tc_bpf.h>
> > > >    #include "libbpf_common.h"
> > > > @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
> > > >    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
> > > >    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
> > > > +/* Convenience macros for the clsact attach hooks */
> > > > +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> > > > +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
> > >
> > > I would abstract those away into an enum, plus avoid having to pull in
> > > linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
> > >
> > > Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
> > > concrete tc bits (TC_H_MAKE()) can be translated internally.
> >
> > Ok, will do.
> >
> > > > +struct bpf_tc_opts {
> > > > +	size_t sz;
> > >
> > > Is this set anywhere?
> >
> > This is needed by the OPTS_* infrastructure.
> >
> > > > +	__u32 handle;
> > > > +	__u32 class_id;
> > >
> > > I'd remove class_id from here as well given in direct-action a BPF prog can
> > > set it if needed.
> >
> > Ok, makes sense.
> >
> > > > +	__u16 priority;
> > > > +	bool replace;
> > > > +	size_t :0;
> > >
> > > What's the rationale for this padding?
> >
> > dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")
>
> Hm, fair enough.
>
> > > > +};
> > > > +
> > > > +#define bpf_tc_opts__last_field replace
> > > > +
> > > > +/* Acts as a handle for an attached filter */
> > > > +struct bpf_tc_attach_id {
> > >
> > > nit: maybe bpf_tc_ctx
> > >
> >
> > Noted.
> >
> > > > +	__u32 handle;
> > > > +	__u16 priority;
> > > > +};
> > > > +
> > > > +struct bpf_tc_info {
> > > > +	struct bpf_tc_attach_id id;
> > > > +	__u16 protocol;
> > > > +	__u32 chain_index;
> > > > +	__u32 prog_id;
> > > > +	__u8 tag[BPF_TAG_SIZE];
> > > > +	__u32 class_id;
> > > > +	__u32 bpf_flags;
> > > > +	__u32 bpf_flags_gen;
> > >
> > > Given we do not yet have any setters e.g. for offload, etc, the one thing
> > > I'd see useful and crucial initially is prog_id.
> > >
> > > The protocol, chain_index, and I would also include tag should be dropped.
> >
> > A future user of this API needs to know the tag, so I would like to keep that.
> > The rest we can drop, and probably document the default values explicitly.
>
> Couldn't this be added along with the future patch for the [future] user?
>

True.

> The tag should be the tag of the prog itself, so if you have prog_id, you
> could also retrieve the same tag from the prog. The tag was basically from
> the early days where we didn't have bpf_prog_get_info_by_fd().
>
> What does that future user need to do different here?
>

From Shaun Crampton:
"My particular use case is to load a program, link it with its maps and then
check if its tag matches the existing program on the interface (and if so, abort
the update)"

Also CC'd, they would be able to elaborate better, and whether or not dropping
it is ok.

> Thanks,
> Daniel

--
Kartikeya
Daniel Borkmann April 21, 2021, 11:41 p.m. UTC | #10
On 4/22/21 1:30 AM, Kumar Kartikeya Dwivedi wrote:
> On Thu, Apr 22, 2021 at 04:51:55AM IST, Daniel Borkmann wrote:
>> On 4/22/21 1:08 AM, Kumar Kartikeya Dwivedi wrote:
>>> On Thu, Apr 22, 2021 at 04:29:28AM IST, Daniel Borkmann wrote:
>>>> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
>>>> [...]
>>>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>>>> --- a/tools/lib/bpf/libbpf.h
>>>>> +++ b/tools/lib/bpf/libbpf.h
>>>>> @@ -16,6 +16,8 @@
>>>>>     #include <stdbool.h>
>>>>>     #include <sys/types.h>  // for size_t
>>>>>     #include <linux/bpf.h>
>>>>> +#include <linux/pkt_sched.h>
>>>>> +#include <linux/tc_act/tc_bpf.h>
>>>>>     #include "libbpf_common.h"
>>>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>>>     LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>>>     LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>>> +/* Convenience macros for the clsact attach hooks */
>>>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>>>
>>>> I would abstract those away into an enum, plus avoid having to pull in
>>>> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
>>>>
>>>> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
>>>> concrete tc bits (TC_H_MAKE()) can be translated internally.
>>>
>>> Ok, will do.
>>>
>>>>> +struct bpf_tc_opts {
>>>>> +	size_t sz;
>>>>
>>>> Is this set anywhere?
>>>
>>> This is needed by the OPTS_* infrastructure.
>>>
>>>>> +	__u32 handle;
>>>>> +	__u32 class_id;
>>>>
>>>> I'd remove class_id from here as well given in direct-action a BPF prog can
>>>> set it if needed.
>>>
>>> Ok, makes sense.
>>>
>>>>> +	__u16 priority;
>>>>> +	bool replace;
>>>>> +	size_t :0;
>>>>
>>>> What's the rationale for this padding?
>>>
>>> dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")
>>
>> Hm, fair enough.
>>
>>>>> +};
>>>>> +
>>>>> +#define bpf_tc_opts__last_field replace
>>>>> +
>>>>> +/* Acts as a handle for an attached filter */
>>>>> +struct bpf_tc_attach_id {
>>>>
>>>> nit: maybe bpf_tc_ctx
>>>
>>> Noted.
>>>
>>>>> +	__u32 handle;
>>>>> +	__u16 priority;
>>>>> +};
>>>>> +
>>>>> +struct bpf_tc_info {
>>>>> +	struct bpf_tc_attach_id id;
>>>>> +	__u16 protocol;
>>>>> +	__u32 chain_index;
>>>>> +	__u32 prog_id;
>>>>> +	__u8 tag[BPF_TAG_SIZE];
>>>>> +	__u32 class_id;
>>>>> +	__u32 bpf_flags;
>>>>> +	__u32 bpf_flags_gen;
>>>>
>>>> Given we do not yet have any setters e.g. for offload, etc, the one thing
>>>> I'd see useful and crucial initially is prog_id.
>>>>
>>>> The protocol, chain_index, and I would also include tag should be dropped.
>>>
>>> A future user of this API needs to know the tag, so I would like to keep that.
>>> The rest we can drop, and probably document the default values explicitly.
>>
>> Couldn't this be added along with the future patch for the [future] user?
> 
> True.
> 
>> The tag should be the tag of the prog itself, so if you have prog_id, you
>> could also retrieve the same tag from the prog. The tag was basically from
>> the early days where we didn't have bpf_prog_get_info_by_fd().
>>
>> What does that future user need to do different here?
> 
>  From Shaun Crampton:
> "My particular use case is to load a program, link it with its maps and then
> check if its tag matches the existing program on the interface (and if so, abort
> the update)"
> 
> Also CC'd, they would be able to elaborate better, and whether or not dropping
> it is ok.

Nope, just get it from the prog itself.
Yonghong Song April 22, 2021, 1:56 a.m. UTC | #11
On 4/21/21 10:06 AM, Kumar Kartikeya Dwivedi wrote:
> On Wed, Apr 21, 2021 at 12:28:01PM IST, Yonghong Song wrote:
>>
>>
>> On 4/20/21 12:37 PM, Kumar Kartikeya Dwivedi wrote:
>>> This adds functions that wrap the netlink API used for adding,
>>> manipulating, and removing traffic control filters. These functions
>>> operate directly on the loaded prog's fd, and return a handle to the
>>> filter using an out parameter named id.
>>>
>>> The basic featureset is covered to allow for attaching and removal of
>>> filters. Some additional features like TCA_BPF_POLICE and TCA_RATE for
>>> the API have been omitted. These can added on top later by extending the
>>
>> "later" => "layer"?
>>
> 
> No, I meant that other options can be added on top of this series by someone
> later. I'll reword it.
> 
[...]
>>>
>>> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
>>> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>>> ---
>>>    tools/lib/bpf/libbpf.h   |  44 ++++++
>>>    tools/lib/bpf/libbpf.map |   3 +
>>>    tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>>    3 files changed, 360 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>> --- a/tools/lib/bpf/libbpf.h
>>> +++ b/tools/lib/bpf/libbpf.h
>>> @@ -16,6 +16,8 @@
>>>    #include <stdbool.h>
>>>    #include <sys/types.h>  // for size_t
>>>    #include <linux/bpf.h>
>>> +#include <linux/pkt_sched.h>
>>> +#include <linux/tc_act/tc_bpf.h>
>>>    #include "libbpf_common.h"
>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>> +/* Convenience macros for the clsact attach hooks */
>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>> +
>>> +struct bpf_tc_opts {
>>> +	size_t sz;
>>> +	__u32 handle;
>>> +	__u32 class_id;
>>> +	__u16 priority;
>>> +	bool replace;
>>> +	size_t :0;
>>
>> Did you see any error without "size_t :0"?
>>
> 
> Not really, I just added this considering this recent change:
> dde7b3f5f2f4 ("libbpf: Add explicit padding to bpf_xdp_set_link_opts")

Thanks. That is fine with me then.

> 
>>> +};
>>> +
>>> +#define bpf_tc_opts__last_field replace
>>> +
>>> +/* Acts as a handle for an attached filter */
>>> +struct bpf_tc_attach_id {
>>> +	__u32 handle;
>>> +	__u16 priority;
>>> +};
>>> +
>> [...]
> 
> --
> Kartikeya
>
Andrii Nakryiko April 22, 2021, 3:43 a.m. UTC | #12
On Wed, Apr 21, 2021 at 3:59 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
> [...]
> > diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> > index bec4e6a6e31d..b4ed6a41ea70 100644
> > --- a/tools/lib/bpf/libbpf.h
> > +++ b/tools/lib/bpf/libbpf.h
> > @@ -16,6 +16,8 @@
> >   #include <stdbool.h>
> >   #include <sys/types.h>  // for size_t
> >   #include <linux/bpf.h>
> > +#include <linux/pkt_sched.h>
> > +#include <linux/tc_act/tc_bpf.h>
> >
> >   #include "libbpf_common.h"
> >
> > @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
> >   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
> >   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
> >
> > +/* Convenience macros for the clsact attach hooks */
> > +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> > +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>
> I would abstract those away into an enum, plus avoid having to pull in
> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
>
> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
> concrete tc bits (TC_H_MAKE()) can be translated internally.
>
> > +struct bpf_tc_opts {
> > +     size_t sz;
>
> Is this set anywhere?
>
> > +     __u32 handle;
> > +     __u32 class_id;
>
> I'd remove class_id from here as well given in direct-action a BPF prog can
> set it if needed.
>
> > +     __u16 priority;
> > +     bool replace;
> > +     size_t :0;
>
> What's the rationale for this padding?
>
> > +};
> > +
> > +#define bpf_tc_opts__last_field replace
> > +
> > +/* Acts as a handle for an attached filter */
> > +struct bpf_tc_attach_id {
>
> nit: maybe bpf_tc_ctx

ok, so wait. It seems like apart from INGRESS|EGRESS enum and ifindex,
everything else is optional and/or has some sane defaults, right? So
this bpf_tc_attach_id or bpf_tc_ctx seems a bit artificial construct
and it will cause problems for extending this.

So if my understanding is correct, I'd get rid of it completely. As I
said previously, opts allow returning parameters back, so if user
didn't specify handle and priority and kernel picks values on user's
behalf, we can return them in the same opts fields.

For detach, again, ifindex and INGRESS|EGRESS is sufficient, but if
user want to provide more detailed parameters, we should do that
through extensible opts. That way we can keep growing this easily,
plus simple cases will remain simple.

Similarly bpf_tc_info below, there is no need to have struct
bpf_tc_attach_id id; field, just have handle and priority right there.
And bpf_tc_info should use OPTS framework for extensibility (though
opts name doesn't fit it very well, but it is still nice for
extensibility and for doing optional input/output params).

Does this make sense? Am I missing something crucial here?

The general rule with any new structs added to libbpf APIs is to
either be 100% (ok, 99.99%) sure that they will never be changed, or
do forward/backward compatible OPTS. Any other thing is pain and calls
for symbol versioning, which we are trying really hard to avoid.

>
> > +     __u32 handle;
> > +     __u16 priority;
> > +};
> > +
> > +struct bpf_tc_info {
> > +     struct bpf_tc_attach_id id;
> > +     __u16 protocol;
> > +     __u32 chain_index;
> > +     __u32 prog_id;
> > +     __u8 tag[BPF_TAG_SIZE];
> > +     __u32 class_id;
> > +     __u32 bpf_flags;
> > +     __u32 bpf_flags_gen;
>
> Given we do not yet have any setters e.g. for offload, etc, the one thing
> I'd see useful and crucial initially is prog_id.
>
> The protocol, chain_index, and I would also include tag should be dropped.
> Similarly class_id given my earlier statement, and flags I would extend once
> this lib API would support offloading progs.
>
> > +};
> > +
> > +/* id is out parameter that will be written to, it must not be NULL */
> > +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
> > +                          const struct bpf_tc_opts *opts,
> > +                          struct bpf_tc_attach_id *id);
> > +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
> > +                          const struct bpf_tc_attach_id *id);
> > +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
> > +                            const struct bpf_tc_attach_id *id,
> > +                            struct bpf_tc_info *info);
>
> As per above, for parent_id I'd replace with dir enum.
>
> > +
> >   #ifdef __cplusplus
> >   } /* extern "C" */
> >   #endif
Toke Høiland-Jørgensen April 22, 2021, 9:08 a.m. UTC | #13
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 4/21/21 9:48 PM, Toke Høiland-Jørgensen wrote:
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>> On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
>>> <memxor@gmail.com> wrote:
> [...]
>>>> ---
>>>>   tools/lib/bpf/libbpf.h   |  44 ++++++
>>>>   tools/lib/bpf/libbpf.map |   3 +
>>>>   tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>>>   3 files changed, 360 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>>> --- a/tools/lib/bpf/libbpf.h
>>>> +++ b/tools/lib/bpf/libbpf.h
>>>> @@ -16,6 +16,8 @@
>>>>   #include <stdbool.h>
>>>>   #include <sys/types.h>  // for size_t
>>>>   #include <linux/bpf.h>
>>>> +#include <linux/pkt_sched.h>
>>>> +#include <linux/tc_act/tc_bpf.h>
>>>
>>> apart from those unused macros below, are these needed in public API header?
>>>
>>>>   #include "libbpf_common.h"
>>>>
>>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>>   LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>>   LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>>
>>>> +/* Convenience macros for the clsact attach hooks */
>>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>>
>>> these seem to be used only internally, why exposing them in public
>>> API?
>> 
>> No they're "aliases" for when you want to attach the filter directly to
>> the interface (and thus install the clsact qdisc as the root). You can
>> also use the filter with an existing qdisc (most commonly HTB), in which
>> case you need to specify the qdisc handle as the root. We have a few
>> examples of this use case:
>> 
>> https://github.com/xdp-project/bpf-examples/tree/master/traffic-pacing-edt
>> and
>> https://github.com/xdp-project/xdp-cpumap-tc
>
> I'm a bit puzzled, could you elaborate on your use case on why you wouldn't
> use the tc egress hook for those especially given it's guaranteed to run
> outside of root qdisc lock?

Jesper can correct me if I'm wrong, but I think the first one of the
links above is basically his implementation of just that EDT-based
shaper. And it works reasonably well, except you don't get the nice
per-flow scheduling and sparse flow prioritisation like in FQ-CoDel
unless you implement that yourself in BPF when you set the timestamps
(and that is by no means trivial to implement).

So if you want to use any of the features of the existing qdiscs (I have
also been suggesting to people that they use tc_bpf if they want to
customise sch_cake's notion of flows or shaping tiers), you need to be
able to attach the filter to an existing qdisc. Sure, this means you're
still stuck behind the qdisc lock, but for some applications that is
fine (not everything is a data centre, some devices don't have that many
CPUs anyway; and as the second example above shows, you can get around
the qdisc lock by some clever use of partitioning of flows using
cpumap).

So what this boils down to is, we should keep the 'parent' parameter not
just as an egress/ingress enum, but also as a field the user can fill
out. I'm fine with moving the latter into the opts struct, though, so
maybe the function parameter can be an enum like:

enum bpf_tc_attach_point {
  BPF_TC_CLSACT_INGRESS,
  BPF_TC_CLSACT_EGRESS,
  BPF_TC_QDISC_PARENT
};

where if you set the last one you have to fill in the parent in opts?

-Toke
Shaun Crampton April 22, 2021, 9:47 a.m. UTC | #14
> Nope, just get it from the prog itself.

Looks like the API returns the prog ID, so from that we can look up the prog
and then get its tag? If so that meets our needs.  Alternatively, if
the API allows
for atomic replacement of a BPF program with another, that'd also work for us.

The use case is that our daemon is restarted and it doesn't know what BPF
program was previously loaded. We want to make sure we end up with the
latest version loaded, either by doing a seamless replace with the
latest version
or by checking if the loaded version matches the latest.
Daniel Borkmann April 22, 2021, 11:26 a.m. UTC | #15
On 4/22/21 11:47 AM, Shaun Crampton wrote:
>> Nope, just get it from the prog itself.
> 
> Looks like the API returns the prog ID, so from that we can look up the prog
> and then get its tag? If so that meets our needs.  Alternatively, if
> the API allows
> for atomic replacement of a BPF program with another, that'd also work for us.

Both is the case: from prog ID you can already retrieve that same tag, and progs
can be atomically replaced with the current API code.

Exposing the tag in here otherwise feels just odd/wrong from a design PoV, explain
that to a user of this API on /why/ such field is in the tc API when it already
can be retrieved via bpf_prog_get_info_by_fd(), as in, what is so special on this
field in the tc API here (aside from legacy reasons when there was no mentioned
helper [which we don't need to support given it dates way too far back]) ... I
cannot. ;-) Hence lets drop it from there.

Thanks,
Daniel
Daniel Borkmann April 22, 2021, 11:51 a.m. UTC | #16
On 4/22/21 11:08 AM, Toke Høiland-Jørgensen wrote:
> Daniel Borkmann <daniel@iogearbox.net> writes:
>> On 4/21/21 9:48 PM, Toke Høiland-Jørgensen wrote:
>>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>>> On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
>>>> <memxor@gmail.com> wrote:
>> [...]
>>>>> ---
>>>>>    tools/lib/bpf/libbpf.h   |  44 ++++++
>>>>>    tools/lib/bpf/libbpf.map |   3 +
>>>>>    tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>>>>    3 files changed, 360 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>>>> --- a/tools/lib/bpf/libbpf.h
>>>>> +++ b/tools/lib/bpf/libbpf.h
>>>>> @@ -16,6 +16,8 @@
>>>>>    #include <stdbool.h>
>>>>>    #include <sys/types.h>  // for size_t
>>>>>    #include <linux/bpf.h>
>>>>> +#include <linux/pkt_sched.h>
>>>>> +#include <linux/tc_act/tc_bpf.h>
>>>>
>>>> apart from those unused macros below, are these needed in public API header?
>>>>
>>>>>    #include "libbpf_common.h"
>>>>>
>>>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>>>
>>>>> +/* Convenience macros for the clsact attach hooks */
>>>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>>>
>>>> these seem to be used only internally, why exposing them in public
>>>> API?
>>>
>>> No they're "aliases" for when you want to attach the filter directly to
>>> the interface (and thus install the clsact qdisc as the root). You can
>>> also use the filter with an existing qdisc (most commonly HTB), in which
>>> case you need to specify the qdisc handle as the root. We have a few
>>> examples of this use case:
>>>
>>> https://github.com/xdp-project/bpf-examples/tree/master/traffic-pacing-edt
>>> and
>>> https://github.com/xdp-project/xdp-cpumap-tc
>>
>> I'm a bit puzzled, could you elaborate on your use case on why you wouldn't
>> use the tc egress hook for those especially given it's guaranteed to run
>> outside of root qdisc lock?
> 
> Jesper can correct me if I'm wrong, but I think the first one of the
> links above is basically his implementation of just that EDT-based
> shaper. And it works reasonably well, except you don't get the nice
> per-flow scheduling and sparse flow prioritisation like in FQ-CoDel
> unless you implement that yourself in BPF when you set the timestamps
> (and that is by no means trivial to implement).
> 
> So if you want to use any of the features of the existing qdiscs (I have
> also been suggesting to people that they use tc_bpf if they want to
> customise sch_cake's notion of flows or shaping tiers), you need to be
> able to attach the filter to an existing qdisc. Sure, this means you're
> still stuck behind the qdisc lock, but for some applications that is
> fine (not everything is a data centre, some devices don't have that many
> CPUs anyway; and as the second example above shows, you can get around
> the qdisc lock by some clever use of partitioning of flows using
> cpumap).
> 
> So what this boils down to is, we should keep the 'parent' parameter not
> just as an egress/ingress enum, but also as a field the user can fill
> out. I'm fine with moving the latter into the opts struct, though, so
> maybe the function parameter can be an enum like:
> 
> enum bpf_tc_attach_point {
>    BPF_TC_CLSACT_INGRESS,
>    BPF_TC_CLSACT_EGRESS,
>    BPF_TC_QDISC_PARENT
> };
> 
> where if you set the last one you have to fill in the parent in opts?

Fair enough, I still think this is a bit backwards and should be discouraged
given the constraints, but if you have an actual need for it ... I'd rather
simplify API naming, the fact that it's clsact is an implementation detail
and shouldn't matter to a user, like:

enum bpf_tc_attach_point {
	BPF_TC_INGRESS,
	BPF_TC_EGRESS,
	BPF_TC_CUSTOM_PARENT,
};

For BPF_TC_INGRESS and BPF_TC_EGRESS, I would enforce opts parent parameter
to be /zero/ from the API.

Thanks,
Daniel
Toke Høiland-Jørgensen April 22, 2021, 12:46 p.m. UTC | #17
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 4/22/21 11:08 AM, Toke Høiland-Jørgensen wrote:
>> Daniel Borkmann <daniel@iogearbox.net> writes:
>>> On 4/21/21 9:48 PM, Toke Høiland-Jørgensen wrote:
>>>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>>>> On Tue, Apr 20, 2021 at 12:37 PM Kumar Kartikeya Dwivedi
>>>>> <memxor@gmail.com> wrote:
>>> [...]
>>>>>> ---
>>>>>>    tools/lib/bpf/libbpf.h   |  44 ++++++
>>>>>>    tools/lib/bpf/libbpf.map |   3 +
>>>>>>    tools/lib/bpf/netlink.c  | 319 ++++++++++++++++++++++++++++++++++++++-
>>>>>>    3 files changed, 360 insertions(+), 6 deletions(-)
>>>>>>
>>>>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>>>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>>>>> --- a/tools/lib/bpf/libbpf.h
>>>>>> +++ b/tools/lib/bpf/libbpf.h
>>>>>> @@ -16,6 +16,8 @@
>>>>>>    #include <stdbool.h>
>>>>>>    #include <sys/types.h>  // for size_t
>>>>>>    #include <linux/bpf.h>
>>>>>> +#include <linux/pkt_sched.h>
>>>>>> +#include <linux/tc_act/tc_bpf.h>
>>>>>
>>>>> apart from those unused macros below, are these needed in public API header?
>>>>>
>>>>>>    #include "libbpf_common.h"
>>>>>>
>>>>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>>>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>>>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>>>>
>>>>>> +/* Convenience macros for the clsact attach hooks */
>>>>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>>>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>>>>
>>>>> these seem to be used only internally, why exposing them in public
>>>>> API?
>>>>
>>>> No they're "aliases" for when you want to attach the filter directly to
>>>> the interface (and thus install the clsact qdisc as the root). You can
>>>> also use the filter with an existing qdisc (most commonly HTB), in which
>>>> case you need to specify the qdisc handle as the root. We have a few
>>>> examples of this use case:
>>>>
>>>> https://github.com/xdp-project/bpf-examples/tree/master/traffic-pacing-edt
>>>> and
>>>> https://github.com/xdp-project/xdp-cpumap-tc
>>>
>>> I'm a bit puzzled, could you elaborate on your use case on why you wouldn't
>>> use the tc egress hook for those especially given it's guaranteed to run
>>> outside of root qdisc lock?
>> 
>> Jesper can correct me if I'm wrong, but I think the first one of the
>> links above is basically his implementation of just that EDT-based
>> shaper. And it works reasonably well, except you don't get the nice
>> per-flow scheduling and sparse flow prioritisation like in FQ-CoDel
>> unless you implement that yourself in BPF when you set the timestamps
>> (and that is by no means trivial to implement).
>> 
>> So if you want to use any of the features of the existing qdiscs (I have
>> also been suggesting to people that they use tc_bpf if they want to
>> customise sch_cake's notion of flows or shaping tiers), you need to be
>> able to attach the filter to an existing qdisc. Sure, this means you're
>> still stuck behind the qdisc lock, but for some applications that is
>> fine (not everything is a data centre, some devices don't have that many
>> CPUs anyway; and as the second example above shows, you can get around
>> the qdisc lock by some clever use of partitioning of flows using
>> cpumap).
>> 
>> So what this boils down to is, we should keep the 'parent' parameter not
>> just as an egress/ingress enum, but also as a field the user can fill
>> out. I'm fine with moving the latter into the opts struct, though, so
>> maybe the function parameter can be an enum like:
>> 
>> enum bpf_tc_attach_point {
>>    BPF_TC_CLSACT_INGRESS,
>>    BPF_TC_CLSACT_EGRESS,
>>    BPF_TC_QDISC_PARENT
>> };
>> 
>> where if you set the last one you have to fill in the parent in opts?
>
> Fair enough, I still think this is a bit backwards and should be discouraged
> given the constraints, but if you have an actual need for it ... I'd rather
> simplify API naming, the fact that it's clsact is an implementation detail
> and shouldn't matter to a user, like:
>
> enum bpf_tc_attach_point {
> 	BPF_TC_INGRESS,
> 	BPF_TC_EGRESS,
> 	BPF_TC_CUSTOM_PARENT,
> };
>
> For BPF_TC_INGRESS and BPF_TC_EGRESS, I would enforce opts parent parameter
> to be /zero/ from the API.

OK, SGTM :)

-Toke
Daniel Borkmann April 22, 2021, 3:35 p.m. UTC | #18
On 4/22/21 5:43 AM, Andrii Nakryiko wrote:
> On Wed, Apr 21, 2021 at 3:59 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
>> [...]
>>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
>>> index bec4e6a6e31d..b4ed6a41ea70 100644
>>> --- a/tools/lib/bpf/libbpf.h
>>> +++ b/tools/lib/bpf/libbpf.h
>>> @@ -16,6 +16,8 @@
>>>    #include <stdbool.h>
>>>    #include <sys/types.h>  // for size_t
>>>    #include <linux/bpf.h>
>>> +#include <linux/pkt_sched.h>
>>> +#include <linux/tc_act/tc_bpf.h>
>>>
>>>    #include "libbpf_common.h"
>>>
>>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
>>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
>>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
>>>
>>> +/* Convenience macros for the clsact attach hooks */
>>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
>>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
>>
>> I would abstract those away into an enum, plus avoid having to pull in
>> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
>>
>> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
>> concrete tc bits (TC_H_MAKE()) can be translated internally.
>>
>>> +struct bpf_tc_opts {
>>> +     size_t sz;
>>
>> Is this set anywhere?
>>
>>> +     __u32 handle;
>>> +     __u32 class_id;
>>
>> I'd remove class_id from here as well given in direct-action a BPF prog can
>> set it if needed.
>>
>>> +     __u16 priority;
>>> +     bool replace;
>>> +     size_t :0;
>>
>> What's the rationale for this padding?
>>
>>> +};
>>> +
>>> +#define bpf_tc_opts__last_field replace
>>> +
>>> +/* Acts as a handle for an attached filter */
>>> +struct bpf_tc_attach_id {
>>
>> nit: maybe bpf_tc_ctx
> 
> ok, so wait. It seems like apart from INGRESS|EGRESS enum and ifindex,
> everything else is optional and/or has some sane defaults, right? So
> this bpf_tc_attach_id or bpf_tc_ctx seems a bit artificial construct
> and it will cause problems for extending this.
> 
> So if my understanding is correct, I'd get rid of it completely. As I
> said previously, opts allow returning parameters back, so if user
> didn't specify handle and priority and kernel picks values on user's
> behalf, we can return them in the same opts fields.
> 
> For detach, again, ifindex and INGRESS|EGRESS is sufficient, but if
> user want to provide more detailed parameters, we should do that
> through extensible opts. That way we can keep growing this easily,
> plus simple cases will remain simple.
> 
> Similarly bpf_tc_info below, there is no need to have struct
> bpf_tc_attach_id id; field, just have handle and priority right there.
> And bpf_tc_info should use OPTS framework for extensibility (though
> opts name doesn't fit it very well, but it is still nice for
> extensibility and for doing optional input/output params).
> 
> Does this make sense? Am I missing something crucial here?

I would probably keep the handle + priority in there; maybe if both are 0,
we could fix it to some default value internally, but without those it might
be a bit hard if people want to build a 'pipeline' of cls_bpf progs if they
need/want to.

Potentially, one could fixate the handle itself, and then allow to specify
different priorities for it such that when a BPF prog returns a TC_ACT_UNSPEC,
it will exec the next one inside that cls_bpf instance, every other TC_ACT_*
opcode will terminate the processing. Technically, only priority would really
be needed (unless you combine multiple different classifiers from tc side on
the ingress/egress hook which is not great to begin with, tbh).

> The general rule with any new structs added to libbpf APIs is to
> either be 100% (ok, 99.99%) sure that they will never be changed, or
> do forward/backward compatible OPTS. Any other thing is pain and calls
> for symbol versioning, which we are trying really hard to avoid.
> 
>>> +     __u32 handle;
>>> +     __u16 priority;
>>> +};
>>> +
>>> +struct bpf_tc_info {
>>> +     struct bpf_tc_attach_id id;
>>> +     __u16 protocol;
>>> +     __u32 chain_index;
>>> +     __u32 prog_id;
>>> +     __u8 tag[BPF_TAG_SIZE];
>>> +     __u32 class_id;
>>> +     __u32 bpf_flags;
>>> +     __u32 bpf_flags_gen;
>>
>> Given we do not yet have any setters e.g. for offload, etc, the one thing
>> I'd see useful and crucial initially is prog_id.
>>
>> The protocol, chain_index, and I would also include tag should be dropped.
>> Similarly class_id given my earlier statement, and flags I would extend once
>> this lib API would support offloading progs.
>>
>>> +};
>>> +
>>> +/* id is out parameter that will be written to, it must not be NULL */
>>> +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
>>> +                          const struct bpf_tc_opts *opts,
>>> +                          struct bpf_tc_attach_id *id);
>>> +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
>>> +                          const struct bpf_tc_attach_id *id);
>>> +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
>>> +                            const struct bpf_tc_attach_id *id,
>>> +                            struct bpf_tc_info *info);
>>
>> As per above, for parent_id I'd replace with dir enum.
>>
>>> +
>>>    #ifdef __cplusplus
>>>    } /* extern "C" */
>>>    #endif
Andrii Nakryiko April 22, 2021, 6:28 p.m. UTC | #19
On Thu, Apr 22, 2021 at 8:35 AM Daniel Borkmann <daniel@iogearbox.net> wrote:
>
> On 4/22/21 5:43 AM, Andrii Nakryiko wrote:
> > On Wed, Apr 21, 2021 at 3:59 PM Daniel Borkmann <daniel@iogearbox.net> wrote:
> >> On 4/20/21 9:37 PM, Kumar Kartikeya Dwivedi wrote:
> >> [...]
> >>> diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
> >>> index bec4e6a6e31d..b4ed6a41ea70 100644
> >>> --- a/tools/lib/bpf/libbpf.h
> >>> +++ b/tools/lib/bpf/libbpf.h
> >>> @@ -16,6 +16,8 @@
> >>>    #include <stdbool.h>
> >>>    #include <sys/types.h>  // for size_t
> >>>    #include <linux/bpf.h>
> >>> +#include <linux/pkt_sched.h>
> >>> +#include <linux/tc_act/tc_bpf.h>
> >>>
> >>>    #include "libbpf_common.h"
> >>>
> >>> @@ -775,6 +777,48 @@ LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
> >>>    LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
> >>>    LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
> >>>
> >>> +/* Convenience macros for the clsact attach hooks */
> >>> +#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
> >>> +#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
> >>
> >> I would abstract those away into an enum, plus avoid having to pull in
> >> linux/pkt_sched.h and linux/tc_act/tc_bpf.h from main libbpf.h header.
> >>
> >> Just add a enum { BPF_TC_DIR_INGRESS, BPF_TC_DIR_EGRESS, } and then the
> >> concrete tc bits (TC_H_MAKE()) can be translated internally.
> >>
> >>> +struct bpf_tc_opts {
> >>> +     size_t sz;
> >>
> >> Is this set anywhere?
> >>
> >>> +     __u32 handle;
> >>> +     __u32 class_id;
> >>
> >> I'd remove class_id from here as well given in direct-action a BPF prog can
> >> set it if needed.
> >>
> >>> +     __u16 priority;
> >>> +     bool replace;
> >>> +     size_t :0;
> >>
> >> What's the rationale for this padding?
> >>
> >>> +};
> >>> +
> >>> +#define bpf_tc_opts__last_field replace
> >>> +
> >>> +/* Acts as a handle for an attached filter */
> >>> +struct bpf_tc_attach_id {
> >>
> >> nit: maybe bpf_tc_ctx
> >
> > ok, so wait. It seems like apart from INGRESS|EGRESS enum and ifindex,
> > everything else is optional and/or has some sane defaults, right? So
> > this bpf_tc_attach_id or bpf_tc_ctx seems a bit artificial construct
> > and it will cause problems for extending this.
> >
> > So if my understanding is correct, I'd get rid of it completely. As I
> > said previously, opts allow returning parameters back, so if user
> > didn't specify handle and priority and kernel picks values on user's
> > behalf, we can return them in the same opts fields.
> >
> > For detach, again, ifindex and INGRESS|EGRESS is sufficient, but if
> > user want to provide more detailed parameters, we should do that
> > through extensible opts. That way we can keep growing this easily,
> > plus simple cases will remain simple.
> >
> > Similarly bpf_tc_info below, there is no need to have struct
> > bpf_tc_attach_id id; field, just have handle and priority right there.
> > And bpf_tc_info should use OPTS framework for extensibility (though
> > opts name doesn't fit it very well, but it is still nice for
> > extensibility and for doing optional input/output params).
> >
> > Does this make sense? Am I missing something crucial here?
>
> I would probably keep the handle + priority in there; maybe if both are 0,
> we could fix it to some default value internally, but without those it might
> be a bit hard if people want to build a 'pipeline' of cls_bpf progs if they
> need/want to.

Oh, I'm not proposing to drop support for specifying handle and prio.
I'm just saying having a fixed UAPI struct bpf_tc_attach_id as an "ID"
is problematic from API stability point of view. So instead of
pretending we know what "ID" will always be like, pass any extra
non-default fields in OPTS struct. And if those are not specified by
user (either opts is NULL or handle/prio is 0), use sane defaults, as
you are proposing.

>
> Potentially, one could fixate the handle itself, and then allow to specify
> different priorities for it such that when a BPF prog returns a TC_ACT_UNSPEC,
> it will exec the next one inside that cls_bpf instance, every other TC_ACT_*
> opcode will terminate the processing. Technically, only priority would really
> be needed (unless you combine multiple different classifiers from tc side on
> the ingress/egress hook which is not great to begin with, tbh).
>
> > The general rule with any new structs added to libbpf APIs is to
> > either be 100% (ok, 99.99%) sure that they will never be changed, or
> > do forward/backward compatible OPTS. Any other thing is pain and calls
> > for symbol versioning, which we are trying really hard to avoid.
> >
> >>> +     __u32 handle;
> >>> +     __u16 priority;
> >>> +};
> >>> +
> >>> +struct bpf_tc_info {
> >>> +     struct bpf_tc_attach_id id;
> >>> +     __u16 protocol;
> >>> +     __u32 chain_index;
> >>> +     __u32 prog_id;
> >>> +     __u8 tag[BPF_TAG_SIZE];
> >>> +     __u32 class_id;
> >>> +     __u32 bpf_flags;
> >>> +     __u32 bpf_flags_gen;
> >>
> >> Given we do not yet have any setters e.g. for offload, etc, the one thing
> >> I'd see useful and crucial initially is prog_id.
> >>
> >> The protocol, chain_index, and I would also include tag should be dropped.
> >> Similarly class_id given my earlier statement, and flags I would extend once
> >> this lib API would support offloading progs.
> >>
> >>> +};
> >>> +
> >>> +/* id is out parameter that will be written to, it must not be NULL */
> >>> +LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
> >>> +                          const struct bpf_tc_opts *opts,
> >>> +                          struct bpf_tc_attach_id *id);
> >>> +LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
> >>> +                          const struct bpf_tc_attach_id *id);
> >>> +LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
> >>> +                            const struct bpf_tc_attach_id *id,
> >>> +                            struct bpf_tc_info *info);
> >>
> >> As per above, for parent_id I'd replace with dir enum.
> >>
> >>> +
> >>>    #ifdef __cplusplus
> >>>    } /* extern "C" */
> >>>    #endif
>
diff mbox series

Patch

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bec4e6a6e31d..b4ed6a41ea70 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -16,6 +16,8 @@ 
 #include <stdbool.h>
 #include <sys/types.h>  // for size_t
 #include <linux/bpf.h>
+#include <linux/pkt_sched.h>
+#include <linux/tc_act/tc_bpf.h>
 
 #include "libbpf_common.h"
 
@@ -775,6 +777,48 @@  LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
 LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
 LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
 
+/* Convenience macros for the clsact attach hooks */
+#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
+#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
+
+struct bpf_tc_opts {
+	size_t sz;
+	__u32 handle;
+	__u32 class_id;
+	__u16 priority;
+	bool replace;
+	size_t :0;
+};
+
+#define bpf_tc_opts__last_field replace
+
+/* Acts as a handle for an attached filter */
+struct bpf_tc_attach_id {
+	__u32 handle;
+	__u16 priority;
+};
+
+struct bpf_tc_info {
+	struct bpf_tc_attach_id id;
+	__u16 protocol;
+	__u32 chain_index;
+	__u32 prog_id;
+	__u8 tag[BPF_TAG_SIZE];
+	__u32 class_id;
+	__u32 bpf_flags;
+	__u32 bpf_flags_gen;
+};
+
+/* id is out parameter that will be written to, it must not be NULL */
+LIBBPF_API int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
+			     const struct bpf_tc_opts *opts,
+			     struct bpf_tc_attach_id *id);
+LIBBPF_API int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
+			     const struct bpf_tc_attach_id *id);
+LIBBPF_API int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
+			       const struct bpf_tc_attach_id *id,
+			       struct bpf_tc_info *info);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b9b29baf1df8..686444fbb838 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -361,4 +361,7 @@  LIBBPF_0.4.0 {
 		bpf_linker__new;
 		bpf_map__inner_map;
 		bpf_object__set_kversion;
+		bpf_tc_attach;
+		bpf_tc_detach;
+		bpf_tc_get_info;
 } LIBBPF_0.3.0;
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index c79e30484e81..71109dcea9e4 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -4,7 +4,10 @@ 
 #include <stdlib.h>
 #include <memory.h>
 #include <unistd.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
 #include <linux/bpf.h>
+#include <linux/if_ether.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
 #include <errno.h>
@@ -344,6 +347,20 @@  int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags)
 	return ret;
 }
 
+static int bpf_nl_get_ext(struct nlmsghdr *nh, int sock, unsigned int nl_pid,
+			  __dump_nlmsg_t dump_link_nlmsg_p,
+			  libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
+{
+	int seq = time(NULL);
+
+	nh->nlmsg_seq = seq;
+	if (send(sock, nh, nh->nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, dump_link_nlmsg_p,
+				dump_link_nlmsg, cookie);
+}
+
 int libbpf_nl_get_link(int sock, unsigned int nl_pid,
 		       libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
 {
@@ -356,12 +373,302 @@  int libbpf_nl_get_link(int sock, unsigned int nl_pid,
 		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
 		.ifm.ifi_family = AF_PACKET,
 	};
-	int seq = time(NULL);
 
-	req.nlh.nlmsg_seq = seq;
-	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
-		return -errno;
+	return bpf_nl_get_ext(&req.nlh, sock, nl_pid, __dump_link_nlmsg,
+			      dump_link_nlmsg, cookie);
+}
 
-	return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
-				dump_link_nlmsg, cookie);
+static int tc_setup_clsact_excl(int sock, __u32 nl_pid, __u32 ifindex)
+{
+	int seq = 0, ret = 0;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.nh.nlmsg_flags =
+		NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK | NLM_F_EXCL;
+	req.nh.nlmsg_type = RTM_NEWQDISC;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.t.tcm_family = AF_UNSPEC;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_parent = TC_H_CLSACT;
+	req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "clsact",
+			 sizeof("clsact"));
+	if (ret < 0)
+		return ret;
+
+	ret = send(sock, &req.nh, req.nh.nlmsg_len, 0);
+	if (ret < 0)
+		return ret;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);
+}
+
+static int tc_bpf_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd)
+{
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	char name[256] = {};
+	int len, ret;
+
+	ret = bpf_obj_get_info_by_fd(fd, &info, &info_len);
+	if (ret < 0)
+		return ret;
+
+	ret = nlattr_add(nh, maxsz, TCA_BPF_FD, &fd, sizeof(fd));
+	if (ret < 0)
+		return ret;
+
+	len = snprintf(name, sizeof(name), "%s:[%" PRIu32 "]", info.name,
+		       info.id);
+	if (len < 0 || len >= sizeof(name))
+		return len < 0 ? -EINVAL : -ENAMETOOLONG;
+
+	return nlattr_add(nh, maxsz, TCA_BPF_NAME, name, len + 1);
+}
+
+static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+			void *cookie);
+
+static int tc_cls_bpf_modify(int fd, int cmd, unsigned int flags, __u32 ifindex,
+			     __u32 parent_id, const struct bpf_tc_opts *opts,
+			     __dump_nlmsg_t fn, struct bpf_tc_attach_id *id)
+{
+	__u32 nl_pid = 0, protocol, priority;
+	struct bpf_tc_info info = {};
+	unsigned int bpf_flags = 0;
+	int sock, seq = 0, ret;
+	struct nlattr *nla;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req;
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	if ((parent_id == BPF_TC_CLSACT_INGRESS ||
+	     parent_id == BPF_TC_CLSACT_EGRESS) &&
+	    flags & NLM_F_CREATE) {
+		ret = tc_setup_clsact_excl(sock, nl_pid, ifindex);
+		/* Attachment can still fail if ingress qdisc is installed, and
+		 * we're trying attach on egress as parent. Ignore in that case
+		 * as well.
+		 */
+		if (ret < 0 && ret != -EEXIST)
+			goto end;
+	}
+
+	priority = OPTS_GET(opts, priority, 0);
+	protocol = ETH_P_ALL;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
+	req.nh.nlmsg_type = cmd;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.t.tcm_family = AF_UNSPEC;
+	req.t.tcm_handle = OPTS_GET(opts, handle, 0);
+	req.t.tcm_parent = parent_id;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_info = TC_H_MAKE(priority << 16, htons(protocol));
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
+	if (ret < 0)
+		goto end;
+
+	nla = nlattr_begin_nested(&req.nh, sizeof(req), TCA_OPTIONS);
+	if (!nla) {
+		ret = -EMSGSIZE;
+		goto end;
+	}
+
+	if (OPTS_GET(opts, class_id, TC_H_UNSPEC)) {
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_CLASSID,
+				 &opts->class_id, sizeof(opts->class_id));
+		if (ret < 0)
+			goto end;
+	}
+
+	if (cmd != RTM_DELTFILTER) {
+		ret = tc_bpf_add_fd_and_name(&req.nh, sizeof(req), fd);
+		if (ret < 0)
+			goto end;
+
+		/* direct action is always set */
+		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_FLAGS,
+				 &bpf_flags, sizeof(bpf_flags));
+		if (ret < 0)
+			goto end;
+	}
+
+	nlattr_end_nested(&req.nh, nla);
+
+	ret = send(sock, &req.nh, req.nh.nlmsg_len, 0);
+	if (ret < 0)
+		goto end;
+
+	ret = bpf_netlink_recv(sock, nl_pid, seq, fn, NULL, &info);
+
+	if (fn) {
+		*id = info.id;
+		ret = ret < 0 ? ret : 0;
+	}
+
+end:
+	close(sock);
+	return ret;
+}
+
+int bpf_tc_attach(int fd, __u32 ifindex, __u32 parent_id,
+		  const struct bpf_tc_opts *opts,
+		  struct bpf_tc_attach_id *id)
+{
+	if (fd < 0 || !OPTS_VALID(opts, bpf_tc_opts) || !id)
+		return -EINVAL;
+
+	return tc_cls_bpf_modify(fd, RTM_NEWTFILTER,
+				 NLM_F_ECHO | NLM_F_CREATE |
+			         (OPTS_GET(opts, replace, false) ?
+					NLM_F_REPLACE : NLM_F_EXCL),
+				 ifindex, parent_id, opts, cls_get_info, id);
+}
+
+int bpf_tc_detach(__u32 ifindex, __u32 parent_id,
+		  const struct bpf_tc_attach_id *id)
+{
+	DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts, 0);
+
+	if (!id)
+		return -EINVAL;
+
+	opts.handle = id->handle;
+	opts.priority = id->priority;
+
+	return tc_cls_bpf_modify(-1, RTM_DELTFILTER, 0, ifindex, parent_id,
+				 &opts, NULL, NULL);
+}
+
+static int __cls_get_info(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct nlattr *tbb[TCA_BPF_MAX + 1];
+	struct bpf_tc_info *info = cookie;
+	struct tcmsg *t = msg;
+
+	if (!tb[TCA_OPTIONS])
+		return 0;
+
+	libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL);
+	if (!tbb[TCA_BPF_ID])
+		return 0;
+
+	info->protocol = ntohs(TC_H_MIN(t->tcm_info));
+	info->id.priority = TC_H_MAJ(t->tcm_info) >> 16;
+	info->id.handle = t->tcm_handle;
+
+	if (tb[TCA_CHAIN])
+		info->chain_index = libbpf_nla_getattr_u32(tb[TCA_CHAIN]);
+	else
+		info->chain_index = 0;
+
+	if (tbb[TCA_BPF_FLAGS])
+		info->bpf_flags = libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS]);
+
+	if (tbb[TCA_BPF_FLAGS_GEN])
+		info->bpf_flags_gen =
+			libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS_GEN]);
+
+	if (tbb[TCA_BPF_ID])
+		info->prog_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_ID]);
+
+	if (tbb[TCA_BPF_TAG])
+		memcpy(info->tag, libbpf_nla_getattr_str(tbb[TCA_BPF_TAG]),
+		       sizeof(info->tag));
+
+	if (tbb[TCA_BPF_CLASSID])
+		info->class_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_CLASSID]);
+
+	return 1;
+}
+
+static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+			void *cookie)
+{
+	struct tcmsg *t = NLMSG_DATA(nh);
+	struct nlattr *tb[TCA_MAX + 1];
+
+	libbpf_nla_parse(tb, TCA_MAX,
+			 (struct nlattr *)((char *)t + NLMSG_ALIGN(sizeof(*t))),
+			 NLMSG_PAYLOAD(nh, sizeof(*t)), NULL);
+	if (!tb[TCA_KIND])
+		return -EINVAL;
+
+	return __cls_get_info(cookie, t, tb);
+}
+
+static int tc_cls_get_info(__u32 ifindex, __u32 parent_id,
+			   const struct bpf_tc_attach_id *id,
+			   struct bpf_tc_info *info)
+{
+	__u32 nl_pid = 0, protocol;
+	__u32 priority;
+	int sock, ret;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req = {
+		.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nh.nlmsg_type = RTM_GETTFILTER,
+		.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+		.t.tcm_family = AF_UNSPEC,
+	};
+
+	priority = id->priority;
+	protocol = ETH_P_ALL;
+
+	req.t.tcm_parent = parent_id;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_handle = id->handle;
+	req.t.tcm_info = TC_H_MAKE(priority << 16, htons(protocol));
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
+	if (ret < 0)
+		goto end;
+
+	req.nh.nlmsg_seq = time(NULL);
+
+	ret = bpf_nl_get_ext(&req.nh, sock, nl_pid, cls_get_info, NULL, info);
+	if (ret < 0)
+		goto end;
+	/* 1 denotes a match */
+	ret = ret == 1 ? 0 : -ESRCH;
+end:
+	close(sock);
+	return ret;
+}
+
+int bpf_tc_get_info(__u32 ifindex, __u32 parent_id,
+		    const struct bpf_tc_attach_id *id,
+		    struct bpf_tc_info *info)
+{
+	if (!id || !info)
+		return -EINVAL;
+
+	return tc_cls_get_info(ifindex, parent_id, id, info);
 }