Message ID | 20230302172757.9548-2-fw@strlen.de (mailing list archive) |
---|---|
State | RFC |
Delegated to: | BPF |
Headers | show |
Series | bpf: add netfilter program type | expand |
On 03/02, Florian Westphal wrote: > Add bpf_link support skeleton. To keep this reviewable, no bpf program > can be invoked here, if a program would be attached only a c-stub is > called. > This defaults to 'y' if both netfilter and bpf syscall are enabled in > kconfig. > Uapi example usage: > union bpf_attr attr = { }; > attr.link_create.prog_fd = progfd; > attr.link_create.attach_type = BPF_NETFILTER; > attr.link_create.netfilter.pf = PF_INET; > attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; > attr.link_create.netfilter.priority = -128; > err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); > ... this would attach progfd to ipv4:input hook. > Such hook gets removed automatically if the calling program exits. > BPF_NETFILTER program invocation is added in followup change. > NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it > allows to tell userspace which program is attached at the given hook > when user runs 'nft hook list' command rather than just the priority > and not-very-helpful 'this hook runs a bpf prog but I can't tell which > one'. > Signed-off-by: Florian Westphal <fw@strlen.de> > --- > include/linux/netfilter.h | 1 + > include/net/netfilter/nf_hook_bpf.h | 2 + > include/uapi/linux/bpf.h | 12 +++ > kernel/bpf/syscall.c | 6 ++ > net/netfilter/Kconfig | 3 + > net/netfilter/Makefile | 1 + > net/netfilter/nf_bpf_link.c | 116 ++++++++++++++++++++++++++++ > 7 files changed, 141 insertions(+) > create mode 100644 include/net/netfilter/nf_hook_bpf.h > create mode 100644 net/netfilter/nf_bpf_link.c > diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h > index 6863e271a9de..beec40ccbd79 100644 > --- a/include/linux/netfilter.h > +++ b/include/linux/netfilter.h > @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv, > enum nf_hook_ops_type { > NF_HOOK_OP_UNDEFINED, > NF_HOOK_OP_NF_TABLES, > + NF_HOOK_OP_BPF, > }; > struct nf_hook_ops { > diff --git a/include/net/netfilter/nf_hook_bpf.h > b/include/net/netfilter/nf_hook_bpf.h > new file mode 100644 > index 000000000000..9d1b338e89d7 > --- /dev/null > +++ b/include/net/netfilter/nf_hook_bpf.h > @@ -0,0 +1,2 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog > *prog); > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > index c9699304aed2..b063c6985769 100644 > --- a/include/uapi/linux/bpf.h > +++ b/include/uapi/linux/bpf.h > @@ -986,6 +986,7 @@ enum bpf_prog_type { > BPF_PROG_TYPE_LSM, > BPF_PROG_TYPE_SK_LOOKUP, > BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ > + BPF_PROG_TYPE_NETFILTER, > }; > enum bpf_attach_type { > @@ -1049,6 +1050,7 @@ enum bpf_link_type { > BPF_LINK_TYPE_PERF_EVENT = 7, > BPF_LINK_TYPE_KPROBE_MULTI = 8, > BPF_LINK_TYPE_STRUCT_OPS = 9, > + BPF_LINK_TYPE_NETFILTER = 10, > MAX_BPF_LINK_TYPE, > }; > @@ -1543,6 +1545,11 @@ union bpf_attr { > */ > __u64 cookie; > } tracing; > + struct { > + __u32 pf; > + __u32 hooknum; > + __s32 prio; > + } netfilter; For recent tc BPF program extensions, we've discussed that it might be better to have an option to attach program before/after another one in the chain. So the API essentially would receive a before/after flag + fd/id of the program where we want to be. Should we do something similar here? See [0] for the original discussion. 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ > }; > } link_create; > @@ -6373,6 +6380,11 @@ struct bpf_link_info { > struct { > __u32 ifindex; > } xdp; > + struct { > + __u32 pf; > + __u32 hooknum; > + __s32 priority; > + } netfilter; > }; > } __attribute__((aligned(8))); > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index e3fcdc9836a6..8f5cd1fb83ee 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -35,6 +35,7 @@ > #include <linux/rcupdate_trace.h> > #include <linux/memcontrol.h> > #include <linux/trace_events.h> > +#include <net/netfilter/nf_hook_bpf.h> > #define IS_FD_ARRAY(map) ((map)->map_type == > BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ > (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ > @@ -2448,6 +2449,7 @@ static bool is_net_admin_prog_type(enum > bpf_prog_type prog_type) > case BPF_PROG_TYPE_CGROUP_SYSCTL: > case BPF_PROG_TYPE_SOCK_OPS: > case BPF_PROG_TYPE_EXT: /* extends any prog */ > + case BPF_PROG_TYPE_NETFILTER: > return true; > case BPF_PROG_TYPE_CGROUP_SKB: > /* always unpriv */ > @@ -4562,6 +4564,7 @@ static int link_create(union bpf_attr *attr, > bpfptr_t uattr) > switch (prog->type) { > case BPF_PROG_TYPE_EXT: > + case BPF_PROG_TYPE_NETFILTER: > break; > case BPF_PROG_TYPE_PERF_EVENT: > case BPF_PROG_TYPE_TRACEPOINT: > @@ -4628,6 +4631,9 @@ static int link_create(union bpf_attr *attr, > bpfptr_t uattr) > case BPF_PROG_TYPE_XDP: > ret = bpf_xdp_link_attach(attr, prog); > break; > + case BPF_PROG_TYPE_NETFILTER: > + ret = bpf_nf_link_attach(attr, prog); > + break; > #endif > case BPF_PROG_TYPE_PERF_EVENT: > case BPF_PROG_TYPE_TRACEPOINT: > diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig > index 4d6737160857..bea06f62a30e 100644 > --- a/net/netfilter/Kconfig > +++ b/net/netfilter/Kconfig > @@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE > config NETFILTER_FAMILY_ARP > bool > +config NETFILTER_BPF_LINK > + def_bool BPF_SYSCALL > + > config NETFILTER_NETLINK_HOOK > tristate "Netfilter base hook dump support" > depends on NETFILTER_ADVANCED > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 5ffef1cd6143..d4958e7e7631 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += > nf_conntrack_bpf.o > endif > obj-$(CONFIG_NETFILTER) = netfilter.o > +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o > obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o > obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o > diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c > new file mode 100644 > index 000000000000..fa4fae5cc669 > --- /dev/null > +++ b/net/netfilter/nf_bpf_link.c > @@ -0,0 +1,116 @@ > +// SPDX-License-Identifier: GPL-2.0 > +#include <linux/bpf.h> > +#include <linux/netfilter.h> > + > +#include <net/netfilter/nf_hook_bpf.h> > + > +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, > const struct nf_hook_state *s) > +{ > + return NF_ACCEPT; > +} > + > +struct bpf_nf_link { > + struct bpf_link link; > + struct nf_hook_ops hook_ops; > + struct net *net; > +}; > + > +static void bpf_nf_link_release(struct bpf_link *link) > +{ > + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, > link); > + > + nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); > +} > + > +static void bpf_nf_link_dealloc(struct bpf_link *link) > +{ > + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, > link); > + > + kfree(nf_link); > +} > + > +static int bpf_nf_link_detach(struct bpf_link *link) > +{ > + bpf_nf_link_release(link); > + return 0; > +} > + > +static void bpf_nf_link_show_info(const struct bpf_link *link, > + struct seq_file *seq) > +{ > + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, > link); > + > + seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", > + nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, > + nf_link->hook_ops.priority); > +} > + > +static int bpf_nf_link_fill_link_info(const struct bpf_link *link, > + struct bpf_link_info *info) > +{ > + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, > link); > + > + info->netfilter.pf = nf_link->hook_ops.pf; > + info->netfilter.hooknum = nf_link->hook_ops.hooknum; > + info->netfilter.priority = nf_link->hook_ops.priority; > + > + return 0; > +} > + > +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog > *new_prog, > + struct bpf_prog *old_prog) > +{ > + return -EOPNOTSUPP; > +} > + > +static const struct bpf_link_ops bpf_nf_link_lops = { > + .release = bpf_nf_link_release, > + .dealloc = bpf_nf_link_dealloc, > + .detach = bpf_nf_link_detach, > + .show_fdinfo = bpf_nf_link_show_info, > + .fill_link_info = bpf_nf_link_fill_link_info, > + .update_prog = bpf_nf_link_update, > +}; > + > +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) > +{ > + struct net *net = current->nsproxy->net_ns; > + struct bpf_link_primer link_primer; > + struct bpf_nf_link *link; > + int err; > + > + if (attr->link_create.flags) > + return -EINVAL; > + > + link = kzalloc(sizeof(*link), GFP_USER); > + if (!link) > + return -ENOMEM; > + > + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, > prog); > + > + link->hook_ops.hook = nf_hook_run_bpf; > + link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; > + link->hook_ops.priv = prog; > + > + link->hook_ops.pf = attr->link_create.netfilter.pf; > + link->hook_ops.priority = attr->link_create.netfilter.prio; > + link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; > + > + link->net = net; > + > + err = bpf_link_prime(&link->link, &link_primer); > + if (err) > + goto out_free; > + > + err = nf_register_net_hook(net, &link->hook_ops); > + if (err) { > + bpf_link_cleanup(&link_primer); > + goto out_free; > + } > + > + return bpf_link_settle(&link_primer); > + > +out_free: > + kfree(link); > + return err; > +} > -- > 2.39.2
Stanislav Fomichev <sdf@google.com> wrote: > On 03/02, Florian Westphal wrote: > > + struct { > > + __u32 pf; > > + __u32 hooknum; > > + __s32 prio; > > + } netfilter; > > For recent tc BPF program extensions, we've discussed that it might be > better > to have an option to attach program before/after another one in the chain. > So the API essentially would receive a before/after flag + fd/id of the > > Should we do something similar here? See [0] for the original > discussion. > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ Thanks for the pointer, I will have a look. The above exposes the "prio" of netfilter hooks, so someone that needs their hook to run early on, say, before netfilters nat engine, could just use INT_MIN. We could -- for nf bpf -- make the bpf_link fail if a hook with the same priority already exists to avoid the "undefined behaviour" here (same prio means register order decides what hook function runs first ...). This could be relevant if you have e.g. one bpf program collecting statistics vs. one doing drops. I'll dig though the thread and would try to mimic the tc link mechanism as close as possible.
Hi Florian, Stan, On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote: > Stanislav Fomichev <sdf@google.com> wrote: > > On 03/02, Florian Westphal wrote: > > > + struct { > > > + __u32 pf; > > > + __u32 hooknum; > > > + __s32 prio; > > > + } netfilter; > > > > For recent tc BPF program extensions, we've discussed that it might be > > better > > to have an option to attach program before/after another one in the chain. > > So the API essentially would receive a before/after flag + fd/id of the > > > > Should we do something similar here? See [0] for the original > > discussion. > > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ > > Thanks for the pointer, I will have a look. > > The above exposes the "prio" of netfilter hooks, so someone > that needs their hook to run early on, say, before netfilters > nat engine, could just use INT_MIN. > > We could -- for nf bpf -- make the bpf_link fail if a hook > with the same priority already exists to avoid the "undefined > behaviour" here (same prio means register order decides what > hook function runs first ...). > > This could be relevant if you have e.g. one bpf program collecting > statistics vs. one doing drops. > > I'll dig though the thread and would try to mimic the tc link > mechanism as close as possible. While I think the direction the TC link discussion took is totally fine, TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning it does not make sense for a user to mix priority values && before/after semantics. Netfilter is different in that there is by default modules active with fixed priority values. So mixing in before/after semantics here could get confusing. Thanks, Daniel
On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote: > > Hi Florian, Stan, > > On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote: > > Stanislav Fomichev <sdf@google.com> wrote: > > > On 03/02, Florian Westphal wrote: > > > > + struct { > > > > + __u32 pf; > > > > + __u32 hooknum; > > > > + __s32 prio; > > > > + } netfilter; > > > > > > For recent tc BPF program extensions, we've discussed that it might be > > > better > > > to have an option to attach program before/after another one in the chain. > > > So the API essentially would receive a before/after flag + fd/id of the > > > > > > Should we do something similar here? See [0] for the original > > > discussion. > > > > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ > > > > Thanks for the pointer, I will have a look. > > > > The above exposes the "prio" of netfilter hooks, so someone > > that needs their hook to run early on, say, before netfilters > > nat engine, could just use INT_MIN. > > > > We could -- for nf bpf -- make the bpf_link fail if a hook > > with the same priority already exists to avoid the "undefined > > behaviour" here (same prio means register order decides what > > hook function runs first ...). > > > > This could be relevant if you have e.g. one bpf program collecting > > statistics vs. one doing drops. > > > > I'll dig though the thread and would try to mimic the tc link > > mechanism as close as possible. > > While I think the direction the TC link discussion took is totally fine, > TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning > it does not make sense for a user to mix priority values && before/after > semantics. > > Netfilter is different in that there is by default modules active with > fixed priority values. So mixing in before/after semantics here could > get confusing. I don't remember the details, so pls correct me, but last time I looked, this priority was basically an ordering within a hook? And there were a bunch of kernel-hardcoded values. So either that whole story has to become a UAPI (so the bpf program knows before/after which kernel hook it has to run), or we need some other ordering mechanism. (I'm not sure what's the story with bpf vs kernel hooks interop, so maybe it's all moot?) Am I missing something? Can you share more about why those fixed priorities are fine? > Thanks, > Daniel
Hi Stan, On Thu, Mar 23, 2023 at 11:31:14AM -0700, Stanislav Fomichev wrote: > On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote: > > > > Hi Florian, Stan, > > > > On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote: > > > Stanislav Fomichev <sdf@google.com> wrote: > > > > On 03/02, Florian Westphal wrote: > > > > > + struct { > > > > > + __u32 pf; > > > > > + __u32 hooknum; > > > > > + __s32 prio; > > > > > + } netfilter; > > > > > > > > For recent tc BPF program extensions, we've discussed that it might be > > > > better > > > > to have an option to attach program before/after another one in the chain. > > > > So the API essentially would receive a before/after flag + fd/id of the > > > > > > > > Should we do something similar here? See [0] for the original > > > > discussion. > > > > > > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ > > > > > > Thanks for the pointer, I will have a look. > > > > > > The above exposes the "prio" of netfilter hooks, so someone > > > that needs their hook to run early on, say, before netfilters > > > nat engine, could just use INT_MIN. > > > > > > We could -- for nf bpf -- make the bpf_link fail if a hook > > > with the same priority already exists to avoid the "undefined > > > behaviour" here (same prio means register order decides what > > > hook function runs first ...). > > > > > > This could be relevant if you have e.g. one bpf program collecting > > > statistics vs. one doing drops. > > > > > > I'll dig though the thread and would try to mimic the tc link > > > mechanism as close as possible. > > > > While I think the direction the TC link discussion took is totally fine, > > TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning > > it does not make sense for a user to mix priority values && before/after > > semantics. > > > > Netfilter is different in that there is by default modules active with > > fixed priority values. So mixing in before/after semantics here could > > get confusing. > > I don't remember the details, so pls correct me, but last time I > looked, this priority was basically an ordering within a hook? Yeah, that is my understanding as well. > And there were a bunch of kernel-hardcoded values. So either that > whole story has to become a UAPI (so the bpf program knows > before/after which kernel hook it has to run), or we need some other > ordering mechanism. I'm not sure what you mean by "whole story" but netfilter kernel modules register via a priority value as well. As well as the modules the kernel ships. So there's that to consider. (I'm not sure what's the story with bpf vs kernel > hooks interop, so maybe it's all moot?) > Am I missing something? Can you share more about why those fixed > priorities are fine? I guess I wouldn't say it's ideal (for all the reasons brought up in the previous discussion), but trying to use before/after semantics here would necessarily pull in a netfilter rework too, no? Or maybe there's some clever way to merge the two worlds and get both subsystems what they want. Thanks, Daniel
On Fri, Mar 24, 2023 at 10:33 AM Daniel Xu <dxu@dxuuu.xyz> wrote: > > Hi Stan, > > On Thu, Mar 23, 2023 at 11:31:14AM -0700, Stanislav Fomichev wrote: > > On Wed, Mar 22, 2023 at 5:41 PM Daniel Xu <dxu@dxuuu.xyz> wrote: > > > > > > Hi Florian, Stan, > > > > > > On Fri, Mar 03, 2023 at 01:27:52AM +0100, Florian Westphal wrote: > > > > Stanislav Fomichev <sdf@google.com> wrote: > > > > > On 03/02, Florian Westphal wrote: > > > > > > + struct { > > > > > > + __u32 pf; > > > > > > + __u32 hooknum; > > > > > > + __s32 prio; > > > > > > + } netfilter; > > > > > > > > > > For recent tc BPF program extensions, we've discussed that it might be > > > > > better > > > > > to have an option to attach program before/after another one in the chain. > > > > > So the API essentially would receive a before/after flag + fd/id of the > > > > > > > > > > Should we do something similar here? See [0] for the original > > > > > discussion. > > > > > > > > > > 0: https://lore.kernel.org/bpf/YzzWDqAmN5DRTupQ@google.com/ > > > > > > > > Thanks for the pointer, I will have a look. > > > > > > > > The above exposes the "prio" of netfilter hooks, so someone > > > > that needs their hook to run early on, say, before netfilters > > > > nat engine, could just use INT_MIN. > > > > > > > > We could -- for nf bpf -- make the bpf_link fail if a hook > > > > with the same priority already exists to avoid the "undefined > > > > behaviour" here (same prio means register order decides what > > > > hook function runs first ...). > > > > > > > > This could be relevant if you have e.g. one bpf program collecting > > > > statistics vs. one doing drops. > > > > > > > > I'll dig though the thread and would try to mimic the tc link > > > > mechanism as close as possible. > > > > > > While I think the direction the TC link discussion took is totally fine, > > > TC has the advantage (IIUC) of being a somewhat isolated hook. Meaning > > > it does not make sense for a user to mix priority values && before/after > > > semantics. > > > > > > Netfilter is different in that there is by default modules active with > > > fixed priority values. So mixing in before/after semantics here could > > > get confusing. > > > > I don't remember the details, so pls correct me, but last time I > > looked, this priority was basically an ordering within a hook? > > Yeah, that is my understanding as well. > > > And there were a bunch of kernel-hardcoded values. So either that > > whole story has to become a UAPI (so the bpf program knows > > before/after which kernel hook it has to run), or we need some other > > ordering mechanism. > > I'm not sure what you mean by "whole story" but netfilter kernel modules > register via a priority value as well. As well as the modules the kernel > ships. So there's that to consider. Sorry for not being clear. What I meant here is that we'd have to export those existing priorities in the UAPI headers and keep those numbers stable. Otherwise it seems impossible to have a proper interop between those fixed existing priorities and new bpf modules? (idk if that's a real problem or I'm overthinking) Because the problem as I see it is as follows: Let's say I want to trigger before/after kernel module X. I need to know its priority and it has to be stable. Alternatively, there should be some way to query the priority of module X so I can do +1/-1 (which is essentially "before X/after X" semantics). > (I'm not sure what's the story with bpf vs kernel > > hooks interop, so maybe it's all moot?) > > Am I missing something? Can you share more about why those fixed > > priorities are fine? > > I guess I wouldn't say it's ideal (for all the reasons brought up in the > previous discussion), but trying to use before/after semantics here > would necessarily pull in a netfilter rework too, no? Or maybe there's > some clever way to merge the two worlds and get both subsystems what > they want. Right, I don't have an answer here, just trying to understand whether (as a side effect of those patches) we're really making those existing priorities a UAPI or not :-) > Thanks, > Daniel
Stanislav Fomichev <sdf@google.com> wrote: > > I'm not sure what you mean by "whole story" but netfilter kernel modules > > register via a priority value as well. As well as the modules the kernel > > ships. So there's that to consider. > > Sorry for not being clear. What I meant here is that we'd have to > export those existing priorities in the UAPI headers and keep those > numbers stable. Otherwise it seems impossible to have a proper interop > between those fixed existing priorities and new bpf modules? > (idk if that's a real problem or I'm overthinking) They are already in uapi and exported.
On Fri, Mar 24, 2023 at 11:22 AM Florian Westphal <fw@strlen.de> wrote: > > Stanislav Fomichev <sdf@google.com> wrote: > > > I'm not sure what you mean by "whole story" but netfilter kernel modules > > > register via a priority value as well. As well as the modules the kernel > > > ships. So there's that to consider. > > > > Sorry for not being clear. What I meant here is that we'd have to > > export those existing priorities in the UAPI headers and keep those > > numbers stable. Otherwise it seems impossible to have a proper interop > > between those fixed existing priorities and new bpf modules? > > (idk if that's a real problem or I'm overthinking) > > They are already in uapi and exported. Oh, nice, then probably keeping those prios is the way to go. Up to you on whether to explore the alternative (before/after) or not. Agree with Daniel that it probably requires reworking netfilter internals and it's not really justified here.
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 6863e271a9de..beec40ccbd79 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv, enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, + NF_HOOK_OP_BPF, }; struct nf_hook_ops { diff --git a/include/net/netfilter/nf_hook_bpf.h b/include/net/netfilter/nf_hook_bpf.h new file mode 100644 index 000000000000..9d1b338e89d7 --- /dev/null +++ b/include/net/netfilter/nf_hook_bpf.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c9699304aed2..b063c6985769 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -986,6 +986,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LSM, BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, }; enum bpf_attach_type { @@ -1049,6 +1050,7 @@ enum bpf_link_type { BPF_LINK_TYPE_PERF_EVENT = 7, BPF_LINK_TYPE_KPROBE_MULTI = 8, BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, MAX_BPF_LINK_TYPE, }; @@ -1543,6 +1545,11 @@ union bpf_attr { */ __u64 cookie; } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 prio; + } netfilter; }; } link_create; @@ -6373,6 +6380,11 @@ struct bpf_link_info { struct { __u32 ifindex; } xdp; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + } netfilter; }; } __attribute__((aligned(8))); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e3fcdc9836a6..8f5cd1fb83ee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -35,6 +35,7 @@ #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> #include <linux/trace_events.h> +#include <net/netfilter/nf_hook_bpf.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2448,6 +2449,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) case BPF_PROG_TYPE_CGROUP_SYSCTL: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_EXT: /* extends any prog */ + case BPF_PROG_TYPE_NETFILTER: return true; case BPF_PROG_TYPE_CGROUP_SKB: /* always unpriv */ @@ -4562,6 +4564,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) switch (prog->type) { case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_NETFILTER: break; case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: @@ -4628,6 +4631,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; + case BPF_PROG_TYPE_NETFILTER: + ret = bpf_nf_link_attach(attr, prog); + break; #endif case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4d6737160857..bea06f62a30e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -30,6 +30,9 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config NETFILTER_BPF_LINK + def_bool BPF_SYSCALL + config NETFILTER_NETLINK_HOOK tristate "Netfilter base hook dump support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 5ffef1cd6143..d4958e7e7631 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -22,6 +22,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o endif obj-$(CONFIG_NETFILTER) = netfilter.o +obj-$(CONFIG_NETFILTER_BPF_LINK) += nf_bpf_link.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c new file mode 100644 index 000000000000..fa4fae5cc669 --- /dev/null +++ b/net/netfilter/nf_bpf_link.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_hook_bpf.h> + +static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, const struct nf_hook_state *s) +{ + return NF_ACCEPT; +} + +struct bpf_nf_link { + struct bpf_link link; + struct nf_hook_ops hook_ops; + struct net *net; +}; + +static void bpf_nf_link_release(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); +} + +static void bpf_nf_link_dealloc(struct bpf_link *link) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + kfree(nf_link); +} + +static int bpf_nf_link_detach(struct bpf_link *link) +{ + bpf_nf_link_release(link); + return 0; +} + +static void bpf_nf_link_show_info(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + seq_printf(seq, "pf:\t%u\thooknum:\t%u\tprio:\t%d\n", + nf_link->hook_ops.pf, nf_link->hook_ops.hooknum, + nf_link->hook_ops.priority); +} + +static int bpf_nf_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_nf_link *nf_link = container_of(link, struct bpf_nf_link, link); + + info->netfilter.pf = nf_link->hook_ops.pf; + info->netfilter.hooknum = nf_link->hook_ops.hooknum; + info->netfilter.priority = nf_link->hook_ops.priority; + + return 0; +} + +static int bpf_nf_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + return -EOPNOTSUPP; +} + +static const struct bpf_link_ops bpf_nf_link_lops = { + .release = bpf_nf_link_release, + .dealloc = bpf_nf_link_dealloc, + .detach = bpf_nf_link_detach, + .show_fdinfo = bpf_nf_link_show_info, + .fill_link_info = bpf_nf_link_fill_link_info, + .update_prog = bpf_nf_link_update, +}; + +int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_nf_link *link; + int err; + + if (attr->link_create.flags) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_NETFILTER, &bpf_nf_link_lops, prog); + + link->hook_ops.hook = nf_hook_run_bpf; + link->hook_ops.hook_ops_type = NF_HOOK_OP_BPF; + link->hook_ops.priv = prog; + + link->hook_ops.pf = attr->link_create.netfilter.pf; + link->hook_ops.priority = attr->link_create.netfilter.prio; + link->hook_ops.hooknum = attr->link_create.netfilter.hooknum; + + link->net = net; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) + goto out_free; + + err = nf_register_net_hook(net, &link->hook_ops); + if (err) { + bpf_link_cleanup(&link_primer); + goto out_free; + } + + return bpf_link_settle(&link_primer); + +out_free: + kfree(link); + return err; +}
Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked here, if a program would be attached only a c-stub is called. This defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = BPF_NETFILTER; attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Signed-off-by: Florian Westphal <fw@strlen.de> --- include/linux/netfilter.h | 1 + include/net/netfilter/nf_hook_bpf.h | 2 + include/uapi/linux/bpf.h | 12 +++ kernel/bpf/syscall.c | 6 ++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 1 + net/netfilter/nf_bpf_link.c | 116 ++++++++++++++++++++++++++++ 7 files changed, 141 insertions(+) create mode 100644 include/net/netfilter/nf_hook_bpf.h create mode 100644 net/netfilter/nf_bpf_link.c