diff mbox series

[RFC,v2,bpf-next,1/3] bpf: add bpf_link support for BPF_NETFILTER programs

Message ID 20230302172757.9548-2-fw@strlen.de (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series bpf: add netfilter program type | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-3 success Logs for build for aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-5 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-7 success Logs for llvm-toolchain
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-4 success Logs for build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for test_maps on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-12 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-19 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 fail Logs for test_progs_no_alu32 on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 fail Logs for test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-24 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for test_progs_no_alu32_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for test_progs_parallel on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for test_progs_parallel on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-35 success Logs for test_verifier on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-37 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-38 success Logs for test_verifier on x86_64 with llvm-17
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1798 this patch: 1798
netdev/cc_maintainers warning 19 maintainers not CCed: pabeni@redhat.com coreteam@netfilter.org davem@davemloft.net jolsa@kernel.org ast@kernel.org yhs@fb.com kpsingh@kernel.org song@kernel.org haoluo@google.com kuba@kernel.org netdev@vger.kernel.org andrii@kernel.org john.fastabend@gmail.com daniel@iogearbox.net pablo@netfilter.org martin.lau@linux.dev kadlec@netfilter.org edumazet@google.com sdf@google.com
netdev/build_clang success Errors and warnings before: 161 this patch: 161
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1799 this patch: 1799
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 103 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 11 this patch: 11
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for test_progs on aarch64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for test_progs_parallel on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-11 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 fail Logs for test_progs on s390x with gcc

Commit Message

Florian Westphal March 2, 2023, 5:27 p.m. UTC
Add bpf_link support skeleton.  To keep this reviewable, no bpf program
can be invoked here, if a program would be attached only a c-stub is called.

This defaults to 'y' if both netfilter and bpf syscall are enabled in
kconfig.

Uapi example usage:
	union bpf_attr attr = { };

	attr.link_create.prog_fd = progfd;
	attr.link_create.attach_type = BPF_NETFILTER;
	attr.link_create.netfilter.pf = PF_INET;
	attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN;
	attr.link_create.netfilter.priority = -128;

	err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr));

... this would attach progfd to ipv4:input hook.

Such hook gets removed automatically if the calling program exits.

BPF_NETFILTER program invocation is added in followup change.

NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it
allows to tell userspace which program is attached at the given hook
when user runs 'nft hook list' command rather than just the priority
and not-very-helpful 'this hook runs a bpf prog but I can't tell which
one'.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h           |   1 +
 include/net/netfilter/nf_hook_bpf.h |   2 +
 include/uapi/linux/bpf.h            |  12 +++
 kernel/bpf/syscall.c                |   6 ++
 net/netfilter/Kconfig               |   3 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/nf_bpf_link.c         | 116 ++++++++++++++++++++++++++++
 7 files changed, 141 insertions(+)
 create mode 100644 include/net/netfilter/nf_hook_bpf.h
 create mode 100644 net/netfilter/nf_bpf_link.c

Comments

Stanislav Fomichev March 2, 2023, 8:28 p.m. UTC | #1
On 03/02, Florian Westphal wrote:
> Add bpf_link support skeleton.  To keep this reviewable, no bpf program
> can be invoked here, if a program would be attached only a c-stub is  
> called.

> This defaults to 'y' if both netfilter and bpf syscall are enabled in
> kconfig.

> Uapi example usage:
> 	union bpf_attr attr = { };

> 	attr.link_create.prog_fd = progfd;
> 	attr.link_create.attach_type = BPF_NETFILTER;
> 	attr.link_create.netfilter.pf = PF_INET;
> 	attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN;
> 	attr.link_create.netfilter.priority = -128;

> 	err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr));

> ... this would attach progfd to ipv4:input hook.

> Such hook gets removed automatically if the calling program exits.

> BPF_NETFILTER program invocation is added in followup change.

> NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it
> allows to tell userspace which program is attached at the given hook
> when user runs 'nft hook list' command rather than just the priority
> and not-very-helpful 'this hook runs a bpf prog but I can't tell which
> one'.

> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>   include/linux/netfilter.h           |   1 +
>   include/net/netfilter/nf_hook_bpf.h |   2 +
>   include/uapi/linux/bpf.h            |  12 +++
>   kernel/bpf/syscall.c                |   6 ++
>   net/netfilter/Kconfig               |   3 +
>   net/netfilter/Makefile              |   1 +
>   net/netfilter/nf_bpf_link.c         | 116 ++++++++++++++++++++++++++++
>   7 files changed, 141 insertions(+)
>   create mode 100644 include/net/netfilter/nf_hook_bpf.h
>   create mode 100644 net/netfilter/nf_bpf_link.c

> diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
> index 6863e271a9de..beec40ccbd79 100644
> --- a/include/linux/netfilter.h
> +++ b/include/linux/netfilter.h
> @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv,
>   enum nf_hook_ops_type {
>   	NF_HOOK_OP_UNDEFINED,
>   	NF_HOOK_OP_NF_TABLES,
> +	NF_HOOK_OP_BPF,
>   };

>   struct nf_hook_ops {
> diff --git a/include/net/netfilter/nf_hook_bpf.h  
> b/include/net/netfilter/nf_hook_bpf.h
> new file mode 100644
> index 000000000000..9d1b338e89d7
> --- /dev/null
> +++ b/include/net/netfilter/nf_hook_bpf.h
> @@ -0,0 +1,2 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog  
> *prog);
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index c9699304aed2..b063c6985769 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -986,6 +986,7 @@ enum bpf_prog_type {
>   	BPF_PROG_TYPE_LSM,
>   	BPF_PROG_TYPE_SK_LOOKUP,
>   	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> +	BPF_PROG_TYPE_NETFILTER,
>   };

>   enum bpf_attach_type {
> @@ -1049,6 +1050,7 @@ enum bpf_link_type {
>   	BPF_LINK_TYPE_PERF_EVENT = 7,
>   	BPF_LINK_TYPE_KPROBE_MULTI = 8,
>   	BPF_LINK_TYPE_STRUCT_OPS = 9,
> +	BPF_LINK_TYPE_NETFILTER = 10,

>   	MAX_BPF_LINK_TYPE,
>   };
> @@ -1543,6 +1545,11 @@ union bpf_attr {
>   				 */
>   				__u64		cookie;
>   			} tracing;
> +			struct {
> +				__u32		pf;
> +				__u32		hooknum;
> +				__s32		prio;
> +			} netfilter;

For recent tc BPF program extensions, we've discussed that it might be  
better
to have an option to attach program before/after another one in the chain.
So the API essentially would receive a before/after flag + fd/id of the
program where we want to be.

Should we do something similar here? See [0] for the original
discussion.

0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/

>   		};
>   	} link_create;

> @@ -6373,6 +6380,11 @@ struct bpf_link_info {
>   		struct {
>   			__u32 ifindex;
>   		} xdp;
> +		struct {
> +			__u32 pf;
> +			__u32 hooknum;
> +			__s32 priority;
> +		} netfilter;
>   	};
>   } __attribute__((aligned(8)));

> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index e3fcdc9836a6..8f5cd1fb83ee 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -35,6 +35,7 @@
>   #include <linux/rcupdate_trace.h>
>   #include <linux/memcontrol.h>
>   #include <linux/trace_events.h>
> +#include <net/netfilter/nf_hook_bpf.h>

>   #define IS_FD_ARRAY(map) ((map)->map_type ==  
> BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
>   			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
> @@ -2448,6 +2449,7 @@ static bool is_net_admin_prog_type(enum  
> bpf_prog_type prog_type)
>   	case BPF_PROG_TYPE_CGROUP_SYSCTL:
>   	case BPF_PROG_TYPE_SOCK_OPS:
>   	case BPF_PROG_TYPE_EXT: /* extends any prog */
> +	case BPF_PROG_TYPE_NETFILTER:
>   		return true;
>   	case BPF_PROG_TYPE_CGROUP_SKB:
>   		/* always unpriv */
> @@ -4562,6 +4564,7 @@ static int link_create(union bpf_attr *attr,  
> bpfptr_t uattr)

>   	switch (prog->type) {
>   	case BPF_PROG_TYPE_EXT:
> +	case BPF_PROG_TYPE_NETFILTER:
>   		break;
>   	case BPF_PROG_TYPE_PERF_EVENT:
>   	case BPF_PROG_TYPE_TRACEPOINT:
> @@ -4628,6 +4631,9 @@ static int link_create(union bpf_attr *attr,  
> bpfptr_t uattr)
>   	case BPF_PROG_TYPE_XDP:
>   		ret = bpf_xdp_link_attach(attr, prog);
>   		break;
> +	case BPF_PROG_TYPE_NETFILTER:
> +		ret = bpf_nf_link_attach(attr, prog);
> +		break;
>   #endif
>   	case BPF_PROG_TYPE_PERF_EVENT:
>   	case BPF_PROG_TYPE_TRACEPOINT:
> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 4d6737160857..bea06f62a30e 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE
>   config NETFILTER_FAMILY_ARP
>   	bool

> +config NETFILTER_BPF_LINK
> +	def_bool BPF_SYSCALL
> +
>   config NETFILTER_NETLINK_HOOK
>   	tristate "Netfilter base hook dump support"
>   	depends on NETFILTER_ADVANCED
> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
> index 5ffef1cd6143..d4958e7e7631 100644
> --- a/net/netfilter/Makefile
> +++ b/net/netfilter/Makefile
> @@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) +=  
> nf_conntrack_bpf.o
>   endif

>   obj-$(CONFIG_NETFILTER) = netfilter.o
> +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o

>   obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
>   obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
> diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
> new file mode 100644
> index 000000000000..fa4fae5cc669
> --- /dev/null
> +++ b/net/netfilter/nf_bpf_link.c
> @@ -0,0 +1,116 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/bpf.h>
> +#include <linux/netfilter.h>
> +
> +#include <net/netfilter/nf_hook_bpf.h>
> +
> +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb,  
> const struct nf_hook_state *s)
> +{
> +	return NF_ACCEPT;
> +}
> +
> +struct bpf_nf_link {
> +	struct bpf_link link;
> +	struct nf_hook_ops hook_ops;
> +	struct net *net;
> +};
> +
> +static void bpf_nf_link_release(struct bpf_link *link)
> +{
> +	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link,  
> link);
> +
> +	nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
> +}
> +
> +static void bpf_nf_link_dealloc(struct bpf_link *link)
> +{
> +	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link,  
> link);
> +
> +	kfree(nf_link);
> +}
> +
> +static int bpf_nf_link_detach(struct bpf_link *link)
> +{
> +	bpf_nf_link_release(link);
> +	return 0;
> +}
> +
> +static void bpf_nf_link_show_info(const struct bpf_link *link,
> +				  struct seq_file *seq)
> +{
> +	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link,  
> link);
> +
> +	seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
> +		  nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
> +		  nf_link->hook_ops.priority);
> +}
> +
> +static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
> +				      struct bpf_link_info *info)
> +{
> +	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link,  
> link);
> +
> +	info->netfilter.pf = nf_link->hook_ops.pf;
> +	info->netfilter.hooknum = nf_link->hook_ops.hooknum;
> +	info->netfilter.priority = nf_link->hook_ops.priority;
> +
> +	return 0;
> +}
> +
> +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog  
> *new_prog,
> +			      struct bpf_prog *old_prog)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static const struct bpf_link_ops bpf_nf_link_lops = {
> +	.release = bpf_nf_link_release,
> +	.dealloc = bpf_nf_link_dealloc,
> +	.detach = bpf_nf_link_detach,
> +	.show_fdinfo = bpf_nf_link_show_info,
> +	.fill_link_info = bpf_nf_link_fill_link_info,
> +	.update_prog = bpf_nf_link_update,
> +};
> +
> +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
> +{
> +	struct net *net = current->nsproxy->net_ns;
> +	struct bpf_link_primer link_primer;
> +	struct bpf_nf_link *link;
> +	int err;
> +
> +	if (attr->link_create.flags)
> +		return -EINVAL;
> +
> +	link = kzalloc(sizeof(*link), GFP_USER);
> +	if (!link)
> +		return -ENOMEM;
> +
> +	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops,  
> prog);
> +
> +	link->hook_ops.hook = nf_hook_run_bpf;
> +	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
> +	link->hook_ops.priv = prog;
> +
> +	link->hook_ops.pf = attr->link_create.netfilter.pf;
> +	link->hook_ops.priority = attr->link_create.netfilter.prio;
> +	link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
> +
> +	link->net = net;
> +
> +	err = bpf_link_prime(&link->link, &link_primer);
> +	if (err)
> +		goto out_free;
> +
> +	err = nf_register_net_hook(net, &link->hook_ops);
> +	if (err) {
> +		bpf_link_cleanup(&link_primer);
> +		goto out_free;
> +	}
> +
> +	return bpf_link_settle(&link_primer);
> +
> +out_free:
> +	kfree(link);
> +	return err;
> +}
> --
> 2.39.2
Florian Westphal March 3, 2023, 12:27 a.m. UTC | #2
Stanislav Fomichev <sdf@google.com> wrote:
> On 03/02, Florian Westphal wrote:
> > +			struct {
> > +				__u32		pf;
> > +				__u32		hooknum;
> > +				__s32		prio;
> > +			} netfilter;
> 
> For recent tc BPF program extensions, we've discussed that it might be
> better
> to have an option to attach program before/after another one in the chain.
> So the API essentially would receive a before/after flag + fd/id of the
>
> Should we do something similar here? See [0] for the original
> discussion.
> 
> 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/

Thanks for the pointer, I will have a look.

The above exposes the "prio" of netfilter hooks, so someone
that needs their hook to run early on, say, before netfilters
nat engine, could just use INT_MIN.

We could -- for nf bpf -- make the bpf_link fail if a hook
with the same priority already exists to avoid the "undefined
behaviour" here (same prio means register order decides what
hook function runs first ...).

This could be relevant if you have e.g. one bpf program collecting
statistics vs. one doing drops.

I'll dig though the thread and would try to mimic the tc link
mechanism as close as possible.
Daniel Xu March 23, 2023, 12:41 a.m. UTC | #3
Hi Florian, Stan,

On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote:
> Stanislav Fomichev <sdf@google.com> wrote:
> > On 03/02, Florian Westphal wrote:
> > > +			struct {
> > > +				__u32		pf;
> > > +				__u32		hooknum;
> > > +				__s32		prio;
> > > +			} netfilter;
> > 
> > For recent tc BPF program extensions, we've discussed that it might be
> > better
> > to have an option to attach program before/after another one in the chain.
> > So the API essentially would receive a before/after flag + fd/id of the
> >
> > Should we do something similar here? See [0] for the original
> > discussion.
> > 
> > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/
> 
> Thanks for the pointer, I will have a look.
> 
> The above exposes the "prio" of netfilter hooks, so someone
> that needs their hook to run early on, say, before netfilters
> nat engine, could just use INT_MIN.
> 
> We could -- for nf bpf -- make the bpf_link fail if a hook
> with the same priority already exists to avoid the "undefined
> behaviour" here (same prio means register order decides what
> hook function runs first ...).
> 
> This could be relevant if you have e.g. one bpf program collecting
> statistics vs. one doing drops.
> 
> I'll dig though the thread and would try to mimic the tc link
> mechanism as close as possible.

While I think the direction the TC link discussion took is totally fine,
TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning
it does not make sense for a user to mix priority values && before/after
semantics.

Netfilter is different in that there is by default modules active with
fixed priority values. So mixing in before/after semantics here could
get confusing.

Thanks,
Daniel
Stanislav Fomichev March 23, 2023, 6:31 p.m. UTC | #4
On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> Hi Florian, Stan,
>
> On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote:
> > Stanislav Fomichev <sdf@google.com> wrote:
> > > On 03/02, Florian Westphal wrote:
> > > > +                 struct {
> > > > +                         __u32           pf;
> > > > +                         __u32           hooknum;
> > > > +                         __s32           prio;
> > > > +                 } netfilter;
> > >
> > > For recent tc BPF program extensions, we've discussed that it might be
> > > better
> > > to have an option to attach program before/after another one in the chain.
> > > So the API essentially would receive a before/after flag + fd/id of the
> > >
> > > Should we do something similar here? See [0] for the original
> > > discussion.
> > >
> > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/
> >
> > Thanks for the pointer, I will have a look.
> >
> > The above exposes the "prio" of netfilter hooks, so someone
> > that needs their hook to run early on, say, before netfilters
> > nat engine, could just use INT_MIN.
> >
> > We could -- for nf bpf -- make the bpf_link fail if a hook
> > with the same priority already exists to avoid the "undefined
> > behaviour" here (same prio means register order decides what
> > hook function runs first ...).
> >
> > This could be relevant if you have e.g. one bpf program collecting
> > statistics vs. one doing drops.
> >
> > I'll dig though the thread and would try to mimic the tc link
> > mechanism as close as possible.
>
> While I think the direction the TC link discussion took is totally fine,
> TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning
> it does not make sense for a user to mix priority values && before/after
> semantics.
>
> Netfilter is different in that there is by default modules active with
> fixed priority values. So mixing in before/after semantics here could
> get confusing.

I don't remember the details, so pls correct me, but last time I
looked, this priority was basically an ordering within a hook?
And there were a bunch of kernel-hardcoded values. So either that
whole story has to become a UAPI (so the bpf program knows
before/after which kernel hook it has to run), or we need some other
ordering mechanism. (I'm not sure what's the story with bpf vs kernel
hooks interop, so maybe it's all moot?)
Am I missing something? Can you share more about why those fixed
priorities are fine?

> Thanks,
> Daniel
Daniel Xu March 24, 2023, 5:33 p.m. UTC | #5
Hi Stan,

On Thu, Mar 23, 2023 at 11:31:14AM -0700, Stanislav Fomichev wrote:
> On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> >
> > Hi Florian, Stan,
> >
> > On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote:
> > > Stanislav Fomichev <sdf@google.com> wrote:
> > > > On 03/02, Florian Westphal wrote:
> > > > > +                 struct {
> > > > > +                         __u32           pf;
> > > > > +                         __u32           hooknum;
> > > > > +                         __s32           prio;
> > > > > +                 } netfilter;
> > > >
> > > > For recent tc BPF program extensions, we've discussed that it might be
> > > > better
> > > > to have an option to attach program before/after another one in the chain.
> > > > So the API essentially would receive a before/after flag + fd/id of the
> > > >
> > > > Should we do something similar here? See [0] for the original
> > > > discussion.
> > > >
> > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/
> > >
> > > Thanks for the pointer, I will have a look.
> > >
> > > The above exposes the "prio" of netfilter hooks, so someone
> > > that needs their hook to run early on, say, before netfilters
> > > nat engine, could just use INT_MIN.
> > >
> > > We could -- for nf bpf -- make the bpf_link fail if a hook
> > > with the same priority already exists to avoid the "undefined
> > > behaviour" here (same prio means register order decides what
> > > hook function runs first ...).
> > >
> > > This could be relevant if you have e.g. one bpf program collecting
> > > statistics vs. one doing drops.
> > >
> > > I'll dig though the thread and would try to mimic the tc link
> > > mechanism as close as possible.
> >
> > While I think the direction the TC link discussion took is totally fine,
> > TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning
> > it does not make sense for a user to mix priority values && before/after
> > semantics.
> >
> > Netfilter is different in that there is by default modules active with
> > fixed priority values. So mixing in before/after semantics here could
> > get confusing.
> 
> I don't remember the details, so pls correct me, but last time I
> looked, this priority was basically an ordering within a hook?

Yeah, that is my understanding as well.

> And there were a bunch of kernel-hardcoded values. So either that
> whole story has to become a UAPI (so the bpf program knows
> before/after which kernel hook it has to run), or we need some other
> ordering mechanism.

I'm not sure what you mean by "whole story" but netfilter kernel modules
register via a priority value as well. As well as the modules the kernel
ships. So there's that to consider.

(I'm not sure what's the story with bpf vs kernel
> hooks interop, so maybe it's all moot?)
> Am I missing something? Can you share more about why those fixed
> priorities are fine?

I guess I wouldn't say it's ideal (for all the reasons brought up in the
previous discussion), but trying to use before/after semantics here
would necessarily pull in a netfilter rework too, no? Or maybe there's
some clever way to merge the two worlds and get both subsystems what
they want.

Thanks,
Daniel
Stanislav Fomichev March 24, 2023, 5:58 p.m. UTC | #6
On Fri, Mar 24, 2023 at 10:33 AM Daniel Xu <dxu@dxuuu.xyz> wrote:
>
> Hi Stan,
>
> On Thu, Mar 23, 2023 at 11:31:14AM -0700, Stanislav Fomichev wrote:
> > On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote:
> > >
> > > Hi Florian, Stan,
> > >
> > > On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote:
> > > > Stanislav Fomichev <sdf@google.com> wrote:
> > > > > On 03/02, Florian Westphal wrote:
> > > > > > +                 struct {
> > > > > > +                         __u32           pf;
> > > > > > +                         __u32           hooknum;
> > > > > > +                         __s32           prio;
> > > > > > +                 } netfilter;
> > > > >
> > > > > For recent tc BPF program extensions, we've discussed that it might be
> > > > > better
> > > > > to have an option to attach program before/after another one in the chain.
> > > > > So the API essentially would receive a before/after flag + fd/id of the
> > > > >
> > > > > Should we do something similar here? See [0] for the original
> > > > > discussion.
> > > > >
> > > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/
> > > >
> > > > Thanks for the pointer, I will have a look.
> > > >
> > > > The above exposes the "prio" of netfilter hooks, so someone
> > > > that needs their hook to run early on, say, before netfilters
> > > > nat engine, could just use INT_MIN.
> > > >
> > > > We could -- for nf bpf -- make the bpf_link fail if a hook
> > > > with the same priority already exists to avoid the "undefined
> > > > behaviour" here (same prio means register order decides what
> > > > hook function runs first ...).
> > > >
> > > > This could be relevant if you have e.g. one bpf program collecting
> > > > statistics vs. one doing drops.
> > > >
> > > > I'll dig though the thread and would try to mimic the tc link
> > > > mechanism as close as possible.
> > >
> > > While I think the direction the TC link discussion took is totally fine,
> > > TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning
> > > it does not make sense for a user to mix priority values && before/after
> > > semantics.
> > >
> > > Netfilter is different in that there is by default modules active with
> > > fixed priority values. So mixing in before/after semantics here could
> > > get confusing.
> >
> > I don't remember the details, so pls correct me, but last time I
> > looked, this priority was basically an ordering within a hook?
>
> Yeah, that is my understanding as well.
>
> > And there were a bunch of kernel-hardcoded values. So either that
> > whole story has to become a UAPI (so the bpf program knows
> > before/after which kernel hook it has to run), or we need some other
> > ordering mechanism.
>
> I'm not sure what you mean by "whole story" but netfilter kernel modules
> register via a priority value as well. As well as the modules the kernel
> ships. So there's that to consider.

Sorry for not being clear. What I meant here is that we'd have to
export those existing priorities in the UAPI headers and keep those
numbers stable. Otherwise it seems impossible to have a proper interop
between those fixed existing priorities and new bpf modules?
(idk if that's a real problem or I'm overthinking)

Because the problem as I see it is as follows:
Let's say I want to trigger before/after kernel module X. I need to
know its priority and it has to be stable.
Alternatively, there should be some way to query the priority of
module X so I can do +1/-1 (which is essentially "before X/after X"
semantics).

> (I'm not sure what's the story with bpf vs kernel
> > hooks interop, so maybe it's all moot?)
> > Am I missing something? Can you share more about why those fixed
> > priorities are fine?
>
> I guess I wouldn't say it's ideal (for all the reasons brought up in the
> previous discussion), but trying to use before/after semantics here
> would necessarily pull in a netfilter rework too, no? Or maybe there's
> some clever way to merge the two worlds and get both subsystems what
> they want.

Right, I don't have an answer here, just trying to understand whether
(as a side effect of those patches) we're really making those existing
priorities a UAPI or not :-)

> Thanks,
> Daniel
Florian Westphal March 24, 2023, 6:22 p.m. UTC | #7
Stanislav Fomichev <sdf@google.com> wrote:
> > I'm not sure what you mean by "whole story" but netfilter kernel modules
> > register via a priority value as well. As well as the modules the kernel
> > ships. So there's that to consider.
> 
> Sorry for not being clear. What I meant here is that we'd have to
> export those existing priorities in the UAPI headers and keep those
> numbers stable. Otherwise it seems impossible to have a proper interop
> between those fixed existing priorities and new bpf modules?
> (idk if that's a real problem or I'm overthinking)

They are already in uapi and exported.
Stanislav Fomichev March 24, 2023, 7:22 p.m. UTC | #8
On Fri, Mar 24, 2023 at 11:22 AM Florian Westphal <fw@strlen.de> wrote:
>
> Stanislav Fomichev <sdf@google.com> wrote:
> > > I'm not sure what you mean by "whole story" but netfilter kernel modules
> > > register via a priority value as well. As well as the modules the kernel
> > > ships. So there's that to consider.
> >
> > Sorry for not being clear. What I meant here is that we'd have to
> > export those existing priorities in the UAPI headers and keep those
> > numbers stable. Otherwise it seems impossible to have a proper interop
> > between those fixed existing priorities and new bpf modules?
> > (idk if that's a real problem or I'm overthinking)
>
> They are already in uapi and exported.

Oh, nice, then probably keeping those prios is the way to go. Up to
you on whether to explore the alternative (before/after) or not. Agree
with Daniel that it probably requires reworking netfilter internals
and it's not really justified here.
diff mbox series

Patch

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 6863e271a9de..beec40ccbd79 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -80,6 +80,7 @@  typedef unsigned int nf_hookfn(void *priv,
 enum nf_hook_ops_type {
 	NF_HOOK_OP_UNDEFINED,
 	NF_HOOK_OP_NF_TABLES,
+	NF_HOOK_OP_BPF,
 };
 
 struct nf_hook_ops {
diff --git a/include/net/netfilter/nf_hook_bpf.h b/include/net/netfilter/nf_hook_bpf.h
new file mode 100644
index 000000000000..9d1b338e89d7
--- /dev/null
+++ b/include/net/netfilter/nf_hook_bpf.h
@@ -0,0 +1,2 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c9699304aed2..b063c6985769 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -986,6 +986,7 @@  enum bpf_prog_type {
 	BPF_PROG_TYPE_LSM,
 	BPF_PROG_TYPE_SK_LOOKUP,
 	BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
+	BPF_PROG_TYPE_NETFILTER,
 };
 
 enum bpf_attach_type {
@@ -1049,6 +1050,7 @@  enum bpf_link_type {
 	BPF_LINK_TYPE_PERF_EVENT = 7,
 	BPF_LINK_TYPE_KPROBE_MULTI = 8,
 	BPF_LINK_TYPE_STRUCT_OPS = 9,
+	BPF_LINK_TYPE_NETFILTER = 10,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -1543,6 +1545,11 @@  union bpf_attr {
 				 */
 				__u64		cookie;
 			} tracing;
+			struct {
+				__u32		pf;
+				__u32		hooknum;
+				__s32		prio;
+			} netfilter;
 		};
 	} link_create;
 
@@ -6373,6 +6380,11 @@  struct bpf_link_info {
 		struct {
 			__u32 ifindex;
 		} xdp;
+		struct {
+			__u32 pf;
+			__u32 hooknum;
+			__s32 priority;
+		} netfilter;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e3fcdc9836a6..8f5cd1fb83ee 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@ 
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
 #include <linux/trace_events.h>
+#include <net/netfilter/nf_hook_bpf.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
 			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -2448,6 +2449,7 @@  static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
 	case BPF_PROG_TYPE_CGROUP_SYSCTL:
 	case BPF_PROG_TYPE_SOCK_OPS:
 	case BPF_PROG_TYPE_EXT: /* extends any prog */
+	case BPF_PROG_TYPE_NETFILTER:
 		return true;
 	case BPF_PROG_TYPE_CGROUP_SKB:
 		/* always unpriv */
@@ -4562,6 +4564,7 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 
 	switch (prog->type) {
 	case BPF_PROG_TYPE_EXT:
+	case BPF_PROG_TYPE_NETFILTER:
 		break;
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
@@ -4628,6 +4631,9 @@  static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
 		break;
+	case BPF_PROG_TYPE_NETFILTER:
+		ret = bpf_nf_link_attach(attr, prog);
+		break;
 #endif
 	case BPF_PROG_TYPE_PERF_EVENT:
 	case BPF_PROG_TYPE_TRACEPOINT:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4d6737160857..bea06f62a30e 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,9 @@  config NETFILTER_FAMILY_BRIDGE
 config NETFILTER_FAMILY_ARP
 	bool
 
+config NETFILTER_BPF_LINK
+	def_bool BPF_SYSCALL
+
 config NETFILTER_NETLINK_HOOK
 	tristate "Netfilter base hook dump support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5ffef1cd6143..d4958e7e7631 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -22,6 +22,7 @@  nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
 endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c
new file mode 100644
index 000000000000..fa4fae5cc669
--- /dev/null
+++ b/net/netfilter/nf_bpf_link.c
@@ -0,0 +1,116 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_hook_bpf.h>
+
+static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, const struct nf_hook_state *s)
+{
+	return NF_ACCEPT;
+}
+
+struct bpf_nf_link {
+	struct bpf_link link;
+	struct nf_hook_ops hook_ops;
+	struct net *net;
+};
+
+static void bpf_nf_link_release(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops);
+}
+
+static void bpf_nf_link_dealloc(struct bpf_link *link)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	kfree(nf_link);
+}
+
+static int bpf_nf_link_detach(struct bpf_link *link)
+{
+	bpf_nf_link_release(link);
+	return 0;
+}
+
+static void bpf_nf_link_show_info(const struct bpf_link *link,
+				  struct seq_file *seq)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n",
+		  nf_link->hook_ops.pf, nf_link->hook_ops.hooknum,
+		  nf_link->hook_ops.priority);
+}
+
+static int bpf_nf_link_fill_link_info(const struct bpf_link *link,
+				      struct bpf_link_info *info)
+{
+	struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link);
+
+	info->netfilter.pf = nf_link->hook_ops.pf;
+	info->netfilter.hooknum = nf_link->hook_ops.hooknum;
+	info->netfilter.priority = nf_link->hook_ops.priority;
+
+	return 0;
+}
+
+static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+			      struct bpf_prog *old_prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static const struct bpf_link_ops bpf_nf_link_lops = {
+	.release = bpf_nf_link_release,
+	.dealloc = bpf_nf_link_dealloc,
+	.detach = bpf_nf_link_detach,
+	.show_fdinfo = bpf_nf_link_show_info,
+	.fill_link_info = bpf_nf_link_fill_link_info,
+	.update_prog = bpf_nf_link_update,
+};
+
+int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_link_primer link_primer;
+	struct bpf_nf_link *link;
+	int err;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	link = kzalloc(sizeof(*link), GFP_USER);
+	if (!link)
+		return -ENOMEM;
+
+	bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog);
+
+	link->hook_ops.hook = nf_hook_run_bpf;
+	link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF;
+	link->hook_ops.priv = prog;
+
+	link->hook_ops.pf = attr->link_create.netfilter.pf;
+	link->hook_ops.priority = attr->link_create.netfilter.prio;
+	link->hook_ops.hooknum = attr->link_create.netfilter.hooknum;
+
+	link->net = net;
+
+	err = bpf_link_prime(&link->link, &link_primer);
+	if (err)
+		goto out_free;
+
+	err = nf_register_net_hook(net, &link->hook_ops);
+	if (err) {
+		bpf_link_cleanup(&link_primer);
+		goto out_free;
+	}
+
+	return bpf_link_settle(&link_primer);
+
+out_free:
+	kfree(link);
+	return err;
+}