[RFC,v2,6/9] netfilter: add bpf base hook program generator

Message ID 20221005141309.31758-7-fw@strlen.de (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series netfilter: bpf base hook program generator

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 224 this patch: 224
netdev/cc_maintainers warning 9 maintainers not CCed: kuba@kernel.org davem@davemloft.net pablo@netfilter.org netfilter-devel@vger.kernel.org kadlec@netfilter.org netdev@vger.kernel.org coreteam@netfilter.org edumazet@google.com pabeni@redhat.com
netdev/build_clang fail Errors and warnings before: 60 this patch: 66
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 228 this patch: 228
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns WARNING: please write a help paragraph that fully describes the config symbol
netdev/kdoc success Errors and warnings before: 11 this patch: 11
netdev/source_inline success Was 0 now: 0

Commit Message

Florian Westphal Oct. 5, 2022, 2:13 p.m. UTC
Add a kernel bpf program generator for netfilter base hooks.

Currently netfilter hooks are invoked by nf_hook_slow:

for i in hooks; do
  verdict = hooks[i].indirect_func(hooks[i].hook_arg, skb, state);

  switch (verdict) { ....

The autogenerator unrolls the loop, so we get:

state->priv = hooks[0].hook_arg;
v = first_hook_function(state);
if (v != ACCEPT) goto done;
state->priv = hooks[1].hook_arg;
v = second_hook_function(state); ...

Indirections are replaced by direct calls. Invocation of the
autogenerated programs is done via bpf dispatcher from nf_hook().

The autogenerated program has the same return value scheme as
nf_hook_slow(). NF_HOOK() call sites are converted to call the
autogenerated bpf program instead of nf_hook_slow().

Purpose of this is to eventually add a 'netfilter prog type' to bpf and
permit attachment of (userspace generated) bpf programs to the netfilter
machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.

This will require exposing the context structure (the program argument,
'__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.

NAT hooks are still handled via indirect calls, but they are only called
once per connection.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h           |  66 ++++-
 include/net/netfilter/nf_hook_bpf.h |  21 ++
 net/netfilter/Kconfig               |  10 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/core.c                |  92 +++++-
 net/netfilter/nf_hook_bpf.c         | 424 ++++++++++++++++++++++++++++
 6 files changed, 605 insertions(+), 9 deletions(-)
 create mode 100644 include/net/netfilter/nf_hook_bpf.h
 create mode 100644 net/netfilter/nf_hook_bpf.c

Comments

Alexei Starovoitov Oct. 6, 2022, 2:52 a.m. UTC | #1
On Wed, Oct 05, 2022 at 04:13:06PM +0200, Florian Westphal wrote:
>  
> @@ -254,11 +269,24 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
>  
>  	if (hook_head) {
>  		struct nf_hook_state state;
> +#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
> +		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
> +
> +		nf_hook_state_init(&state, hook, pf, indev, outdev,
> +				   sk, net, okfn);
> +
> +		state.priv = (void *)hook_head;
> +		state.skb = skb;
>  
> +		migrate_disable();
> +		ret = bpf_prog_run_nf(p, &state);
> +		migrate_enable();

Since generated prog doesn't do any per-cpu work and not using any maps
there is no need for migrate_disable.
There is cant_migrate() in __bpf_prog_run(), but it's probably better
to silence that instead of adding migrate_disable/enable overhead.
I guess it's ok for now.

> +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
> +{
> +	if (sizeof(void *) == sizeof(u64))
> +		return emit(p, BPF_MOV64_REG(dreg, sreg));
> +	if (sizeof(void *) == sizeof(u32))
> +		return emit(p, BPF_MOV32_REG(dreg, sreg));

I bet that was never tested :) because... see below.

> +
> +	return false;
> +}
> +
> +static bool do_prologue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */
> +	if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1))
> +		return false;
> +
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* could load state->hook_index, but we don't support index > 0 for bpf call. */
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0)))
> +		return false;
> +
> +	return true;
> +}
> +
> +static void patch_hook_jumps(struct nf_hook_prog *p)
> +{
> +	unsigned int i;
> +
> +	if (!p->insns)
> +		return;
> +
> +	for (i = 0; i < p->pos; i++) {
> +		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
> +			continue;
> +
> +		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
> +			continue;
> +		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
> +			continue;
> +
> +		if (p->insns[i].off != JMP_INVALID)
> +			continue;
> +		p->insns[i].off = p->pos - i - 1;

Pls add a check that it fits in 16-bits.

> +	}
> +}
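
Something like this as a sketch -- the bool return type and the S16_MAX
bound are assumptions about how the real patch would want to handle an
oversized program:

static bool patch_hook_jumps(struct nf_hook_prog *p)
{
	unsigned int i;

	if (!p->insns)
		return true;

	for (i = 0; i < p->pos; i++) {
		u32 off;

		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
			continue;
		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
			continue;
		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
			continue;
		if (p->insns[i].off != JMP_INVALID)
			continue;

		off = p->pos - i - 1;

		/* insn->off is s16: refuse to emit a jump that cannot
		 * be encoded, caller then falls back to nf_hook_slow().
		 */
		if (WARN_ON_ONCE(off > S16_MAX))
			return false;

		p->insns[i].off = off;
	}

	return true;
}
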
> +
> +static bool emit_retval(struct nf_hook_prog *p, int retval)
> +{
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval)))
> +		return false;
> +
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_hook_slow(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	/* restore the original state->priv. */
> +	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* arg1 is state->skb */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +
> +	/* arg2 is "struct nf_hook_state *" */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +
> +	/* arg3 is nf_hook_entries (original state->priv) */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7)))
> +		return false;
> +
> +	if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> +		return false;
> +
> +	/* No further action needed, return retval provided by nf_hook_slow */
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_queue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (width < 0) {
> +		WARN_ON_ONCE(1);
> +		return false;
> +	}
> +
> +	/* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +	/* arg2: struct nf_hook_state * */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> +		return false;

here and other CALL work by accident on x86-64.
You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
On x86-64 it will be a nop.
On x86-32 it will do quite a bit of work.

> +
> +	/* Check nf_queue return value.  Abnormal case: nf_queue returned != 0.
> +	 *
> +	 * Fall back to nf_hook_slow().
> +	 */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2)))
> +		return false;
> +
> +	/* Normal case: skb was stolen. Return 0. */
> +	return emit_retval(p, 0);
> +}
> +
> +static bool do_epilogue_base_hooks(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* last 'hook'. We arrive here if previous hook returned ACCEPT,
> +	 * i.e. all hooks passed -- we are done.
> +	 *
> +	 * Return 1, skb can continue traversing network stack.
> +	 */
> +	if (!emit_retval(p, 1))
> +		return false;
> +
> +	/* Patch all hook jumps, in case any of these are taken
> +	 * we need to jump to this location.
> +	 *
> +	 * This happens when verdict is != ACCEPT.
> +	 */
> +	patch_hook_jumps(p);
> +
> +	/* need to ignore upper 24 bits, might contain errno or queue number */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff)))
> +		return false;
> +
> +	/* ACCEPT handled, check STOLEN. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2)))
> +		return false;
> +
> +	if (!emit_retval(p, 0))
> +		return false;
> +
> +	/* ACCEPT and STOLEN handled.  Check DROP next */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2)))
> +		return false;
> +
> +	/* First step. Extract the errno number. 1 insn. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS)))
> +		return false;
> +
> +	/* Second step: replace errno with EPERM if it was 0. 2 insns. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM)))
> +		return false;
> +
> +	/* Third step: negate reg0: Caller expects -EFOO and stash the result.  2 insns. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0)))
> +		return false;
> +
> +	/* Fourth step: free the skb. 2 insns. */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(kfree_skb)))
> +		return false;

ditto.

> +
> +	/* Last step: return. 2 insns. */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8)))
> +		return false;
> +	if (!emit(p, BPF_EXIT_INSN()))
> +		return false;
> +
> +	/* ACCEPT, STOLEN and DROP have been handled.
> +	 * REPEAT and STOP are not allowed anymore for individual hook functions.
> +	 * This leaves NFQUEUE as the only remaining return value.
> +	 *
> +	 * In this case BPF_REG_0 still contains the original verdict of
> +	 * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is.
> +	 */
> +	if (!emit_nf_queue(p))
> +		return false;
> +
> +	/* Increment hook index and store it in nf_hook_state so nf_hook_slow will
> +	 * start at the next hook, if any.
> +	 */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +
> +	return emit_nf_hook_slow(p);
> +}
> +
> +static int nf_hook_prog_init(struct nf_hook_prog *p)
> +{
> +	memset(p, 0, sizeof(*p));
> +
> +	p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL);
> +	if (!p->insns)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void nf_hook_prog_free(struct nf_hook_prog *p)
> +{
> +	kfree(p->insns);
> +}
> +
> +static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e)
> +{
> +	unsigned int i, len;
> +
> +	len = e->num_hook_entries;
> +
> +	if (!do_prologue(p))
> +		goto out;
> +
> +	for (i = 0; i < len; i++) {
> +		if (!xlate_one_hook(p, e, &e->hooks[i]))
> +			goto out;
> +
> +		if (i + 1 < len) {
> +			if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6)))
> +				goto out;
> +
> +			if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +				goto out;
> +		}
> +	}
> +
> +	if (!do_epilogue_base_hooks(p))
> +		goto out;
> +
> +	return 0;
> +out:
> +	return -EINVAL;
> +}
> +
> +static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
> +{
> +	struct bpf_prog *prog;
> +	int err = 0;
> +
> +	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
> +	if (!prog)
> +		return NULL;
> +
> +	prog->len = len;
> +	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;

lol. Just say BPF_PROG_TYPE_UNSPEC ?

> +	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
> +
> +	prog = bpf_prog_select_runtime(prog, &err);
> +	if (err) {
> +		bpf_prog_free(prog);
> +		return NULL;
> +	}

Would be good to do bpf_prog_alloc_id() so it can be seen in
bpftool prog show.
and bpf_prog_kallsyms_add() to make 'perf report' and
stack traces readable.
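
Something along these lines at the tail of nf_hook_jit_compile(), as a
sketch (bpf_prog_alloc_id() is currently static in kernel/bpf/syscall.c,
so making it callable from netfilter is an assumption):

	/* after bpf_prog_select_runtime() succeeded: make the generated
	 * program visible to tooling -- an id for 'bpftool prog show',
	 * a kallsyms entry so perf and stack traces can resolve the
	 * jited image by name.
	 */
	err = bpf_prog_alloc_id(prog);
	if (err) {
		bpf_prog_free(prog);
		return NULL;
	}
	bpf_prog_kallsyms_add(prog);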

Overall I don't hate it, but don't like it either.
Please provide performance numbers.
It's a lot of tricky code and not clear what the benefits are.
Who will maintain this body of code long term?
How are we going to deal with refactoring that will touch generic bpf bits
and this generated prog?

> Purpose of this is to eventually add a 'netfilter prog type' to bpf and
> permit attachment of (userspace generated) bpf programs to the netfilter
> machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.
> 
> This will require exposing the context structure (the program argument,
> '__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.

This part is orthogonal, right? I don't see how this work is connected
to above idea.
I'm still convinced that xt_bpf was a bad choice for many reasons.
"Add a 'netfilter prog type' to bpf" would repeat the same mistakes.
Let's evaluate this set independently.
Florian Westphal Oct. 6, 2022, 1:51 p.m. UTC | #2
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > +#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
> > +		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
> > +
> > +		nf_hook_state_init(&state, hook, pf, indev, outdev,
> > +				   sk, net, okfn);
> > +
> > +		state.priv = (void *)hook_head;
> > +		state.skb = skb;
> >  
> > +		migrate_disable();
> > +		ret = bpf_prog_run_nf(p, &state);
> > +		migrate_enable();
> 
> Since generated prog doesn't do any per-cpu work and not using any maps
> there is no need for migrate_disable.
> There is cant_migrate() in __bpf_prog_run(), but it's probably better
> to silence that instead of adding migrate_disable/enable overhead.

Ah, thanks -- noted.

> > +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
> > +{
> > +	if (sizeof(void *) == sizeof(u64))
> > +		return emit(p, BPF_MOV64_REG(dreg, sreg));
> > +	if (sizeof(void *) == sizeof(u32))
> > +		return emit(p, BPF_MOV32_REG(dreg, sreg));
> 
> I bet that was never tested :) because... see below.

Right, never tested, only on amd64 arch.

I suspect that real 32bit support won't reduce readability too much,
else I can either remove it or add it in a different patch.

> > +static void patch_hook_jumps(struct nf_hook_prog *p)
> > +{
> > +	unsigned int i;
> > +
> > +	if (!p->insns)
> > +		return;
> > +
> > +	for (i = 0; i < p->pos; i++) {
> > +		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
> > +			continue;
> > +
> > +		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
> > +			continue;
> > +		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
> > +			continue;
> > +
> > +		if (p->insns[i].off != JMP_INVALID)
> > +			continue;
> > +		p->insns[i].off = p->pos - i - 1;
> 
> Pls add a check that it fits in 16-bits.

Makes sense.

> > +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > +		return false;
> 
> here and other CALL work by accident on x86-64.
> You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
> On x86-64 it will be a nop.
> On x86-32 it will do quite a bit of work.

I see. thanks.

> > +	prog->len = len;
> > +	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;
> 
> lol. Just say BPF_PROG_TYPE_UNSPEC ?

Right, will do that.

> > +	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
> > +
> > +	prog = bpf_prog_select_runtime(prog, &err);
> > +	if (err) {
> > +		bpf_prog_free(prog);
> > +		return NULL;
> > +	}
> 
> Would be good to do bpf_prog_alloc_id() so it can be seen in
> bpftool prog show.

Agree.

> and bpf_prog_kallsyms_add() to make 'perf report' and
> stack traces readable.

Good to know, will check that this works.

> Overall I don't hate it, but don't like it either.
> Please provide performance numbers.

Oh, right, I should have included those in the cover letter.
Tests were done on 5.19-rc3 on a 56-core Intel machine using pktgen
(based off pktgen_bench_xmit_mode_netif_receive.sh), i.e.
64-byte UDP packets that get forwarded to a dummy device.

Ruleset had single 'ct state new accept' rule in forward chain.

Baseline, with 56-rx queues: 682006 pps, 348 Mb/s
with this patchset:          696743 pps, 356 Mb/s

Averaged over 10 runs each, also reboot after each run.
irqbalance was off, scaling_governor set to 'performance'.

I would redo those tests for future patch submission.
If there is a particular test i should do please let me know.

I also did a test via iperf3 forwarding
(netns -> veth1 -> netns -> veth -> netns), but 'improvement'
was in noise range, too much overhead for the indirection avoidance
to be noticeable.

> It's a lot of tricky code and not clear what the benefits are.
> Who will maintain this body of code long term?
> How are we going to deal with refactoring that will touch generic bpf bits
> and this generated prog?

Good questions.  The only 'good' answer is that it could always be
marked BROKEN and then reverted if needed as it doesn't add new
functionality per se.

Furthermore (I have NOT looked at this at all) this opens the door for
more complexity/trickery.  For example the bpf prog could check (during
code generation) if $indirect_hook is the ipv4 or ipv6 defrag hook and
then insert extra code that avoids the function call for the common
case.  There are probably more hack^W tricks that could be done.

So yes, maintainability is a good question, plus whether other users in the
tree might want something similar (selinux hook invocation, for
example...).

I guess it depends on whether the perf numbers are decent enough.
If they are, then I'd suggest to just do a live experiment and give
it a try -- if it turns out to be a big pain point
(maintenance, frequent crashes, hard-to-debug correctness bugs, e.g.
 'generator failed to re-jit and now it skips my iptables filter
 table', ...) or whatever, mark it as BROKEN in Kconfig and, if
everything fails, just rip it out again.

Does that sound ok?

> > Purpose of this is to eventually add a 'netfilter prog type' to bpf and
> > permit attachment of (userspace generated) bpf programs to the netfilter
> > machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.
> > 
> > This will require exposing the context structure (the program argument,
> > '__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.
> 
> This part is orthogonal, right? I don't see how this work is connected
> to above idea.

Yes, orthogonal from technical pov.

> I'm still convinced that xt_bpf was a bad choice for many reasons.

Hmmm, ok -- there is nothing I can say, it looks reasonably
innocent/harmless to me wrt. backwards kludge risk etc.

> "Add a 'netfilter prog type' to bpf" would repeat the same mistakes.

Hmm, to me it would be more like the 'xtc/tcx' stuff rather than
cls/act_bpf/xt_bpf etc. pp.  but perhaps I'm missing something.

> Let's evaluate this set independently.

Ok, sure.
Florian Westphal Oct. 7, 2022, 11:45 a.m. UTC | #3
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> > +				 offsetof(struct nf_hook_state, hook_index))))
> > +		return false;
> > +	/* arg2: struct nf_hook_state * */
> > +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> > +		return false;
> > +	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> > +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> > +		return false;
> > +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > +		return false;
> 
> here and other CALL work by accident on x86-64.
> You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.

Do you mean this? :

BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *,
           state, unsigned int, verdict)
{
     return nf_queue(skb, state, verdict);
}

-       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
+       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))

?

If yes, I don't see how this will work for the case where I only have an
address, i.e.:

if (!emit(p, BPF_EMIT_CALL(h->hook))) ....

(Also, the address might be in a kernel module)

> On x86-64 it will be a nop.
> On x86-32 it will do quite a bit of work.

If this is only a problem for 32bit arches, I could also make this
'depends on CONFIG_64BIT'.
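
As a Kconfig sketch (the symbol is 64BIT, the CONFIG_ prefix is only the
C-side spelling):

config NF_HOOK_BPF
	bool "netfilter base hook bpf translator"
	depends on BPF_JIT
	depends on 64BIT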

But perhaps I am on the wrong track, I see existing code doing:
        *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);

(kernel/bpf/hashtab.c).

> > +	prog = bpf_prog_select_runtime(prog, &err);
> > +	if (err) {
> > +		bpf_prog_free(prog);
> > +		return NULL;
> > +	}
> 
> Would be good to do bpf_prog_alloc_id() so it can be seen in
> bpftool prog show.

Thanks a lot for the hint:

39: unspec  tag 0000000000000000
xlated 416B  jited 221B  memlock 4096B

bpftool prog  dump xlated id 39
   0: (bf) r6 = r1
   1: (79) r7 = *(u64 *)(r1 +8)
   2: (b4) w8 = 0
   3: (85) call ipv6_defrag#526144928
   4: (55) if r0 != 0x1 goto pc+24
   5: (bf) r1 = r6
   6: (04) w8 += 1
   7: (85) call ipv6_conntrack_in#526206096
   [..]
Alexei Starovoitov Oct. 7, 2022, 7:08 p.m. UTC | #4
On Fri, Oct 7, 2022 at 4:45 AM Florian Westphal <fw@strlen.de> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > +   if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> > > +                            offsetof(struct nf_hook_state, hook_index))))
> > > +           return false;
> > > +   /* arg2: struct nf_hook_state * */
> > > +   if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> > > +           return false;
> > > +   /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> > > +   if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> > > +           return false;
> > > +   if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > > +           return false;
> >
> > here and other CALL work by accident on x86-64.
> > You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
>
> Do you mean this? :
>
> BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *,
>            state, unsigned int, verdict)
> {
>      return nf_queue(skb, state, verdict);
> }

yep.

>
> -       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> +       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))
>
> ?
>
> If yes, I don't see how this will work for the case where I only have an
> address, i.e.:
>
> if (!emit(p, BPF_EMIT_CALL(h->hook))) ....
>
> (Also, the address might be in a kernel module)
>
> > On x86-64 it will be a nop.
> > On x86-32 it will do quite a bit of work.
>
> > If this is only a problem for 32bit arches, I could also make this
> 'depends on CONFIG_64BIT'.

If that's acceptable, sure.

> But perhaps I am on the wrong track, I see existing code doing:
>         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);

Yes, because we do:
                /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
                 * and other inlining handlers are currently limited to 64 bit
                 * only.
                 */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&


I think you already gate this feature with jit_requested?
Otherwise it's going to be slow in the interpreter.

> (kernel/bpf/hashtab.c).
>
> > > +   prog = bpf_prog_select_runtime(prog, &err);
> > > +   if (err) {
> > > +           bpf_prog_free(prog);
> > > +           return NULL;
> > > +   }
> >
> > Would be good to do bpf_prog_alloc_id() so it can be seen in
> > bpftool prog show.
>
> Thanks a lot for the hint:
>
> 39: unspec  tag 0000000000000000
> xlated 416B  jited 221B  memlock 4096B

Probably should do bpf_prog_calc_tag() too.
And please give it some meaningful name.
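
E.g. as a sketch, before bpf_prog_select_runtime() in
nf_hook_jit_compile() -- the "nf_hook_base" name is only a placeholder:

	/* tag + name so the program shows up sensibly in bpftool/perf */
	if (bpf_prog_calc_tag(prog)) {
		bpf_prog_free(prog);
		return NULL;
	}
	snprintf(prog->aux->name, sizeof(prog->aux->name), "nf_hook_base");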

> bpftool prog  dump xlated id 39
>    0: (bf) r6 = r1
>    1: (79) r7 = *(u64 *)(r1 +8)
>    2: (b4) w8 = 0
>    3: (85) call ipv6_defrag#526144928
>    4: (55) if r0 != 0x1 goto pc+24
>    5: (bf) r1 = r6
>    6: (04) w8 += 1
>    7: (85) call ipv6_conntrack_in#526206096
>    [..]

Nice.
bpftool prog profile
should work too.
Florian Westphal Oct. 7, 2022, 7:35 p.m. UTC | #5
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > -       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> > +       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))
> >
> > ?
> >
> > If yes, I don't see how this will work for the case where I only have an
> > address, i.e.:
> >
> > if (!emit(p, BPF_EMIT_CALL(h->hook))) ....
> >
> > (Also, the address might be in a kernel module)
> >
> > > On x86-64 it will be a nop.
> > > On x86-32 it will do quite a bit of work.
> >
> > If this is only a problem for 32bit arches, I could also make this
> > 'depends on CONFIG_64BIT'.
> 
> If that's acceptable, sure.

Good, thanks!

> > But perhaps I am on the wrong track, I see existing code doing:
> >         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
> 
> Yes, because we do:
>                 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
>                  * and other inlining handlers are currently limited to 64 bit
>                  * only.
>                  */
>                 if (prog->jit_requested && BITS_PER_LONG == 64 &&

Ah, thanks, makes sense.

> I think you already gate this feature with jit_requested?
> Otherwise it's going to be slow in the interpreter.

Right, use of bpf interpreter is silly for this.
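
A sketch of such a gate in nf_hook_jit_compile(), after
bpf_prog_select_runtime() succeeds (where exactly it should live is an
assumption here):

	/* no point in running the unrolled program in the interpreter,
	 * let nf_hook() keep using the nf_hook_slow() fallback instead.
	 */
	if (!prog->jited) {
		bpf_prog_free(prog);
		return NULL;
	}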

> > 39: unspec  tag 0000000000000000
> > xlated 416B  jited 221B  memlock 4096B
> 
> Probably should do bpf_prog_calc_tag() too.
> And please give it some meaningful name.

Agree, will add this.

Patch

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 7c604ef8e8cb..b7874b772dd1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -2,6 +2,7 @@ 
 #ifndef __LINUX_NETFILTER_H
 #define __LINUX_NETFILTER_H
 
+#include <linux/filter.h>
 #include <linux/init.h>
 #include <linux/skbuff.h>
 #include <linux/net.h>
@@ -106,6 +107,9 @@  struct nf_hook_entries_rcu_head {
 };
 
 struct nf_hook_entries {
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	struct bpf_prog			*hook_prog;
+#endif
 	u16				num_hook_entries;
 	/* padding */
 	struct nf_hook_entry		hooks[];
@@ -205,6 +209,17 @@  int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 
 void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
 		       const struct nf_hook_entries *e);
+
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+DECLARE_BPF_DISPATCHER(nf_hook_base);
+
+static __always_inline int bpf_prog_run_nf(const struct bpf_prog *prog,
+					   struct nf_hook_state *state)
+{
+	return __bpf_prog_run(prog, state, BPF_DISPATCHER_FUNC(nf_hook_base));
+}
+#endif
+
 /**
  *	nf_hook - call a netfilter hook
  *
@@ -213,17 +228,17 @@  void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
  *	value indicates the packet has been consumed by the hook.
  */
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
-			  struct sock *sk, struct sk_buff *skb,
-			  struct net_device *indev, struct net_device *outdev,
-			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+		struct sock *sk, struct sk_buff *skb,
+		struct net_device *indev, struct net_device *outdev,
+		int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct nf_hook_entries *hook_head = NULL;
 	int ret = 1;
 
 #ifdef CONFIG_JUMP_LABEL
 	if (__builtin_constant_p(pf) &&
-	    __builtin_constant_p(hook) &&
-	    !static_key_false(&nf_hooks_needed[pf][hook]))
+			__builtin_constant_p(hook) &&
+			!static_key_false(&nf_hooks_needed[pf][hook]))
 		return 1;
 #endif
 
@@ -254,11 +269,24 @@  static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 
 	if (hook_head) {
 		struct nf_hook_state state;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
+
+		nf_hook_state_init(&state, hook, pf, indev, outdev,
+				   sk, net, okfn);
+
+		state.priv = (void *)hook_head;
+		state.skb = skb;
 
+		migrate_disable();
+		ret = bpf_prog_run_nf(p, &state);
+		migrate_enable();
+#else
 		nf_hook_state_init(&state, hook, pf, indev, outdev,
 				   sk, net, okfn);
 
 		ret = nf_hook_slow(skb, &state, hook_head);
+#endif
 	}
 	rcu_read_unlock();
 
@@ -336,10 +364,38 @@  NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 
 	if (hook_head) {
 		struct nf_hook_state state;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+		const struct bpf_prog *p = hook_head->hook_prog;
+		struct sk_buff *skb, *next;
+		struct list_head sublist;
+		int ret;
+
+		nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
+
+		INIT_LIST_HEAD(&sublist);
 
+		migrate_disable();
+
+		list_for_each_entry_safe(skb, next, head, list) {
+			skb_list_del_init(skb);
+
+			state.priv = (void *)hook_head;
+			state.skb = skb;
+
+			ret = bpf_prog_run_nf(p, &state);
+			if (ret == 1)
+				list_add_tail(&skb->list, &sublist);
+		}
+
+		migrate_enable();
+
+		/* Put passed packets back on main list */
+		list_splice(&sublist, head);
+#else
 		nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
 
 		nf_hook_slow_list(head, &state, hook_head);
+#endif
 	}
 	rcu_read_unlock();
 }
diff --git a/include/net/netfilter/nf_hook_bpf.h b/include/net/netfilter/nf_hook_bpf.h
new file mode 100644
index 000000000000..1792f97a806d
--- /dev/null
+++ b/include/net/netfilter/nf_hook_bpf.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+struct bpf_dispatcher;
+struct bpf_prog;
+
+struct bpf_prog *nf_hook_bpf_create_fb(void);
+
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n);
+
+void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to);
+#else
+static inline void
+nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *f, struct bpf_prog *t)
+{
+}
+
+static inline struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n)
+{
+	return NULL;
+}
+#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4b8d04640ff3..2610786b6ad8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,16 @@  config NETFILTER_FAMILY_BRIDGE
 config NETFILTER_FAMILY_ARP
 	bool
 
+config HAVE_NF_HOOK_BPF
+	bool
+
+config NF_HOOK_BPF
+	bool "netfilter base hook bpf translator"
+	depends on BPF_JIT
+	help
+	  This unrolls the nf_hook_slow interpreter loop with
+	  auto-generated BPF program.
+
 config NETFILTER_NETLINK_HOOK
 	tristate "Netfilter base hook dump support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 06df49ea6329..e465659e87ad 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -21,6 +21,7 @@  nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
 endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NF_HOOK_BPF) += nf_hook_bpf.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 17165f9cf4a1..6888c7fd5aeb 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -24,6 +24,7 @@ 
 #include <linux/rcupdate.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_hook_bpf.h>
 #include <net/sock.h>
 
 #include "nf_internals.h"
@@ -47,6 +48,33 @@  static DEFINE_MUTEX(nf_hook_mutex);
 #define nf_entry_dereference(e) \
 	rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
 
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+DEFINE_BPF_DISPATCHER(nf_hook_base);
+
+#define NF_DISPATCHER_PTR	BPF_DISPATCHER_PTR(nf_hook_base)
+#else
+#define NF_DISPATCHER_PTR	NULL
+#endif
+
+static struct bpf_prog *fallback_nf_hook_slow;
+
+static void nf_hook_bpf_prog_set(struct nf_hook_entries *e,
+				 struct bpf_prog *p)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	WRITE_ONCE(e->hook_prog, p);
+#endif
+}
+
+static struct bpf_prog *nf_hook_bpf_prog_get(struct nf_hook_entries *e)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	if (e)
+		return e->hook_prog;
+#endif
+	return NULL;
+}
+
 static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 {
 	struct nf_hook_entries *e;
@@ -58,9 +86,23 @@  static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 	if (num == 0)
 		return NULL;
 
-	e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
-	if (e)
-		e->num_hook_entries = num;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	if (!fallback_nf_hook_slow) {
+		/* never free'd */
+		fallback_nf_hook_slow = nf_hook_bpf_create_fb();
+
+		if (!fallback_nf_hook_slow)
+			return NULL;
+	}
+#endif
+
+	e = kvzalloc(alloc, GFP_KERNEL);
+	if (!e)
+		return NULL;
+
+	e->num_hook_entries = num;
+	nf_hook_bpf_prog_set(e, fallback_nf_hook_slow);
+
 	return e;
 }
 
@@ -98,6 +140,29 @@  static const struct nf_hook_ops dummy_ops = {
 	.priority = INT_MIN,
 };
 
+static void nf_hook_entries_grow_bpf(const struct nf_hook_entries *old,
+				     struct nf_hook_entries *new)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	struct bpf_prog *hook_bpf_prog = nf_hook_bpf_create(new);
+
+	/* allocate_hook_entries_size() pre-inits new->hook_prog
+	 * to a fallback program that calls nf_hook_slow().
+	 */
+	if (hook_bpf_prog) {
+		struct bpf_prog *old_prog = NULL;
+
+		new->hook_prog = hook_bpf_prog;
+
+		if (old)
+			old_prog = old->hook_prog;
+
+		nf_hook_bpf_change_prog(BPF_DISPATCHER_PTR(nf_hook_base),
+					old_prog, hook_bpf_prog);
+	}
+#endif
+}
+
 static struct nf_hook_entries *
 nf_hook_entries_grow(const struct nf_hook_entries *old,
 		     const struct nf_hook_ops *reg)
@@ -156,6 +221,7 @@  nf_hook_entries_grow(const struct nf_hook_entries *old,
 		new->hooks[nhooks].priv = reg->priv;
 	}
 
+	nf_hook_entries_grow_bpf(old, new);
 	return new;
 }
 
@@ -221,6 +287,7 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 					  struct nf_hook_entries __rcu **pp)
 {
 	unsigned int i, j, skip = 0, hook_entries;
+	struct bpf_prog *hook_bpf_prog = NULL;
 	struct nf_hook_entries *new = NULL;
 	struct nf_hook_ops **orig_ops;
 	struct nf_hook_ops **new_ops;
@@ -244,8 +311,13 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 
 	hook_entries -= skip;
 	new = allocate_hook_entries_size(hook_entries);
-	if (!new)
+	if (!new) {
+		struct bpf_prog *old_prog = nf_hook_bpf_prog_get(old);
+
+		nf_hook_bpf_prog_set(old, fallback_nf_hook_slow);
+		nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, old_prog, NULL);
 		return NULL;
+	}
 
 	new_ops = nf_hook_entries_get_hook_ops(new);
 	for (i = 0, j = 0; i < old->num_hook_entries; i++) {
@@ -256,7 +328,13 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 		j++;
 	}
 	hooks_validate(new);
+
+	/* if this fails fallback prog calls nf_hook_slow. */
+	hook_bpf_prog = nf_hook_bpf_create(new);
+	if (hook_bpf_prog)
+		nf_hook_bpf_prog_set(new, hook_bpf_prog);
 out_assign:
+	nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, nf_hook_bpf_prog_get(old), hook_bpf_prog);
 	rcu_assign_pointer(*pp, new);
 	return old;
 }
@@ -609,6 +687,7 @@  int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 	int ret;
 
 	state->skb = skb;
+
 	for (; s < e->num_hook_entries; s++) {
 		verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
 		switch (verdict & NF_VERDICT_MASK) {
@@ -783,6 +862,11 @@  int __init netfilter_init(void)
 	if (ret < 0)
 		goto err_pernet;
 
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	fallback_nf_hook_slow = nf_hook_bpf_create_fb();
+	WARN_ON_ONCE(!fallback_nf_hook_slow);
+#endif
+
 	return 0;
 err_pernet:
 	unregister_pernet_subsys(&netfilter_net_ops);
diff --git a/net/netfilter/nf_hook_bpf.c b/net/netfilter/nf_hook_bpf.c
new file mode 100644
index 000000000000..dab13b803801
--- /dev/null
+++ b/net/netfilter/nf_hook_bpf.c
@@ -0,0 +1,424 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_hook_bpf.h>
+#include <net/netfilter/nf_queue.h>
+
+#define JMP_INVALID 0
+#define JIT_SIZE_MAX 0xffff
+
+/* BPF translator for netfilter hooks.
+ *
+ * Create a bpf program that can be called *instead* of nf_hook_slow().
+ * This program thus has same return value as nf_hook_slow and
+ * handles nfqueue and packet drops internally.
+ * Call nf_hook_bpf_create(struct nf_hook_entries *e, NF_HOOK_BPF_TYPE_BASE)
+ * to unroll the functions described by nf_hook_entries into such
+ * a bpf program.
+ *
+ * These bpf programs are called/run from nf_hook() inline function.
+ *
+ * Register usage is:
+ *
+ * BPF_REG_0: verdict.
+ * BPF_REG_1: struct nf_hook_state *
+ * BPF_REG_2: reserved as arg to nf_queue()
+ * BPF_REG_3: reserved as arg to nf_queue()
+ *
+ * Prologue storage:
+ * BPF_REG_6: copy of REG_1 (original struct nf_hook_state *)
+ * BPF_REG_7: copy of original state->priv value
+ * BPF_REG_8: copy of state->hook_index
+ */
+struct nf_hook_prog {
+	struct bpf_insn *insns;
+	unsigned int pos;
+};
+
+static bool emit(struct nf_hook_prog *p, struct bpf_insn insn)
+{
+	if (WARN_ON_ONCE(p->pos >= BPF_MAXINSNS))
+		return false;
+
+	p->insns[p->pos] = insn;
+	p->pos++;
+	return true;
+}
+
+static bool xlate_one_hook(struct nf_hook_prog *p, const struct nf_hook_entries *e,
+			   const struct nf_hook_entry *h)
+{
+	int width = bytes_to_bpf_size(sizeof(h->priv));
+
+	/* if priv is NULL, the called hookfn does not use the priv member. */
+	if (!h->priv)
+		goto emit_hook_call;
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* x = entries[s]->priv; */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_2, BPF_REG_7,
+				 (unsigned long)&h->priv - (unsigned long)e)))
+		return false;
+
+	/* state->priv = x */
+	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_2,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+emit_hook_call:
+	if (!emit(p, BPF_EMIT_CALL(h->hook)))
+		return false;
+
+	/* Only advance to next hook on ACCEPT verdict.
+	 * Else, skip rest and move to tail.
+	 *
+	 * Postprocessing patches the jump offset to the
+	 * correct position, after last hook.
+	 */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, NF_ACCEPT, JMP_INVALID)))
+		return false;
+
+	return true;
+}
+
+static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
+{
+	if (sizeof(void *) == sizeof(u64))
+		return emit(p, BPF_MOV64_REG(dreg, sreg));
+	if (sizeof(void *) == sizeof(u32))
+		return emit(p, BPF_MOV32_REG(dreg, sreg));
+
+	return false;
+}
+
+static bool do_prologue(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */
+	if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1))
+		return false;
+
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+	/* could load state->hook_index, but we don't support index > 0 for bpf call. */
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0)))
+		return false;
+
+	return true;
+}
+
+static void patch_hook_jumps(struct nf_hook_prog *p)
+{
+	unsigned int i;
+
+	if (!p->insns)
+		return;
+
+	for (i = 0; i < p->pos; i++) {
+		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
+			continue;
+
+		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
+			continue;
+		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
+			continue;
+
+		if (p->insns[i].off != JMP_INVALID)
+			continue;
+		p->insns[i].off = p->pos - i - 1;
+	}
+}
+
+static bool emit_retval(struct nf_hook_prog *p, int retval)
+{
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval)))
+		return false;
+
+	return emit(p, BPF_EXIT_INSN());
+}
+
+static bool emit_nf_hook_slow(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	/* restore the original state->priv. */
+	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+	/* arg1 is state->skb */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+
+	/* arg2 is "struct nf_hook_state *" */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
+		return false;
+
+	/* arg3 is nf_hook_entries (original state->priv) */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7)))
+		return false;
+
+	if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
+		return false;
+
+	/* No further action needed, return retval provided by nf_hook_slow */
+	return emit(p, BPF_EXIT_INSN());
+}
+
+static bool emit_nf_queue(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (width < 0) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
+				 offsetof(struct nf_hook_state, hook_index))))
+		return false;
+	/* arg2: struct nf_hook_state * */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
+		return false;
+	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
+		return false;
+	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
+		return false;
+
+	/* Check nf_queue return value.  Abnormal case: nf_queue returned != 0.
+	 *
+	 * Fall back to nf_hook_slow().
+	 */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2)))
+		return false;
+
+	/* Normal case: skb was stolen. Return 0. */
+	return emit_retval(p, 0);
+}
+
+static bool do_epilogue_base_hooks(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* last 'hook'. We arrive here if previous hook returned ACCEPT,
+	 * i.e. all hooks passed -- we are done.
+	 *
+	 * Return 1, skb can continue traversing network stack.
+	 */
+	if (!emit_retval(p, 1))
+		return false;
+
+	/* Patch all hook jumps, in case any of these are taken
+	 * we need to jump to this location.
+	 *
+	 * This happens when verdict is != ACCEPT.
+	 */
+	patch_hook_jumps(p);
+
+	/* need to ignore upper 24 bits, might contain errno or queue number */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
+		return false;
+	if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff)))
+		return false;
+
+	/* ACCEPT handled, check STOLEN. */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2)))
+		return false;
+
+	if (!emit_retval(p, 0))
+		return false;
+
+	/* ACCEPT and STOLEN handled.  Check DROP next */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2)))
+		return false;
+
+	/* First step. Extract the errno number. 1 insn. */
+	if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS)))
+		return false;
+
+	/* Second step: replace errno with EPERM if it was 0. 2 insns. */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1)))
+		return false;
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM)))
+		return false;
+
+	/* Third step: negate reg0: Caller expects -EFOO and stash the result.  2 insns. */
+	if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0)))
+		return false;
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0)))
+		return false;
+
+	/* Fourth step: free the skb. 2 insns. */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+	if (!emit(p, BPF_EMIT_CALL(kfree_skb)))
+		return false;
+
+	/* Last step: return. 2 insns. */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8)))
+		return false;
+	if (!emit(p, BPF_EXIT_INSN()))
+		return false;
+
+	/* ACCEPT, STOLEN and DROP have been handled.
+	 * REPEAT and STOP are not allowed anymore for individual hook functions.
+	 * This leaves NFQUEUE as the only remaining return value.
+	 *
+	 * In this case BPF_REG_0 still contains the original verdict of
+	 * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is.
+	 */
+	if (!emit_nf_queue(p))
+		return false;
+
+	/* Increment hook index and store it in nf_hook_state so nf_hook_slow will
+	 * start at the next hook, if any.
+	 */
+	if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
+		return false;
+	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
+				 offsetof(struct nf_hook_state, hook_index))))
+		return false;
+
+	return emit_nf_hook_slow(p);
+}
+
+static int nf_hook_prog_init(struct nf_hook_prog *p)
+{
+	memset(p, 0, sizeof(*p));
+
+	p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL);
+	if (!p->insns)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void nf_hook_prog_free(struct nf_hook_prog *p)
+{
+	kfree(p->insns);
+}
+
+static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e)
+{
+	unsigned int i, len;
+
+	len = e->num_hook_entries;
+
+	if (!do_prologue(p))
+		goto out;
+
+	for (i = 0; i < len; i++) {
+		if (!xlate_one_hook(p, e, &e->hooks[i]))
+			goto out;
+
+		if (i + 1 < len) {
+			if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6)))
+				goto out;
+
+			if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
+				goto out;
+		}
+	}
+
+	if (!do_epilogue_base_hooks(p))
+		goto out;
+
+	return 0;
+out:
+	return -EINVAL;
+}
+
+static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
+{
+	struct bpf_prog *prog;
+	int err = 0;
+
+	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
+	if (!prog)
+		return NULL;
+
+	prog->len = len;
+	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;
+	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
+
+	prog = bpf_prog_select_runtime(prog, &err);
+	if (err) {
+		bpf_prog_free(prog);
+		return NULL;
+	}
+
+	return prog;
+}
+
+/* fallback program, invokes nf_hook_slow interpreter.
+ *
+ * Used when a hook is unregistered and new/replacement program cannot
+ * be compiled for some reason.
+ */
+struct bpf_prog *nf_hook_bpf_create_fb(void)
+{
+	struct bpf_prog *prog = NULL;
+	struct nf_hook_prog p;
+	int err;
+
+	err = nf_hook_prog_init(&p);
+	if (err)
+		return NULL;
+
+	if (!do_prologue(&p))
+		goto err;
+
+	if (!emit_nf_hook_slow(&p))
+		goto err;
+
+	prog = nf_hook_jit_compile(p.insns, p.pos);
+err:
+	nf_hook_prog_free(&p);
+	return prog;
+}
+
+struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *new)
+{
+	struct bpf_prog *prog = NULL;
+	struct nf_hook_prog p;
+	int err;
+
+	err = nf_hook_prog_init(&p);
+	if (err)
+		return NULL;
+
+	err = xlate_base_hooks(&p, new);
+	if (err)
+		goto err;
+
+	prog = nf_hook_jit_compile(p.insns, p.pos);
+err:
+	nf_hook_prog_free(&p);
+	return prog;
+}
+
+void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to)
+{
+	bpf_dispatcher_change_prog(d, from, to);
+}