[RFC,v2,6/9] netfilter: add bpf base hook program generator

Message ID 20221005141309.31758-7-fw@strlen.de (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Series netfilter: bpf base hook program generator

Checks

Context Check Description
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success
netdev/cover_letter success Series has a cover letter
netdev/patch_count success
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 224 this patch: 224
netdev/cc_maintainers warning 9 maintainers not CCed: kuba@kernel.org davem@davemloft.net pablo@netfilter.org netfilter-devel@vger.kernel.org kadlec@netfilter.org netdev@vger.kernel.org coreteam@netfilter.org edumazet@google.com pabeni@redhat.com
netdev/build_clang fail Errors and warnings before: 60 this patch: 66
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 228 this patch: 228
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 81 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 88 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 98 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns WARNING: please write a help paragraph that fully describes the config symbol
netdev/kdoc success Errors and warnings before: 11 this patch: 11
netdev/source_inline success Was 0 now: 0

Commit Message

Florian Westphal Oct. 5, 2022, 2:13 p.m. UTC
Add a kernel bpf program generator for netfilter base hooks.

Currently netfilter hooks are invoked by nf_hook_slow:

for i in hooks; do
  verdict = hooks[i].indirect_func(hooks[i].hook_arg, skb, state);

  switch (verdict) { ....

The autogenerator unrolls the loop, so we get:

state->priv = hooks[0].hook_arg;
v = first_hook_function(state);
if (v != ACCEPT) goto done;
state->priv = hooks[1].hook_arg;
v = second_hook_function(state); ...

Indirections are replaced by direct calls. Invocation of the
autogenerated programs is done via bpf dispatcher from nf_hook().

The autogenerated program has the same return value scheme as
nf_hook_slow(). NF_HOOK() call sites are converted to call the
autogenerated bpf program instead of nf_hook_slow().

Purpose of this is to eventually add a 'netfilter prog type' to bpf and
permit attachment of (userspace generated) bpf programs to the netfilter
machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.

This will require exposing the context structure (the program argument,
'__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.

NAT hooks are still handled via indirect calls, but they are only called
once per connection.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h           |  66 ++++-
 include/net/netfilter/nf_hook_bpf.h |  21 ++
 net/netfilter/Kconfig               |  10 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/core.c                |  92 +++++-
 net/netfilter/nf_hook_bpf.c         | 424 ++++++++++++++++++++++++++++
 6 files changed, 605 insertions(+), 9 deletions(-)
 create mode 100644 include/net/netfilter/nf_hook_bpf.h
 create mode 100644 net/netfilter/nf_hook_bpf.c

Comments

Alexei Starovoitov Oct. 6, 2022, 2:52 a.m. UTC | #1
On Wed, Oct 05, 2022 at 04:13:06PM +0200, Florian Westphal wrote:
>  
> @@ -254,11 +269,24 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
>  
>  	if (hook_head) {
>  		struct nf_hook_state state;
> +#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
> +		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
> +
> +		nf_hook_state_init(&state, hook, pf, indev, outdev,
> +				   sk, net, okfn);
> +
> +		state.priv = (void *)hook_head;
> +		state.skb = skb;
>  
> +		migrate_disable();
> +		ret = bpf_prog_run_nf(p, &state);
> +		migrate_enable();

Since generated prog doesn't do any per-cpu work and not using any maps
there is no need for migrate_disable.
There is cant_migrate() in __bpf_prog_run(), but it's probably better
to silence that instead of adding migrate_disable/enable overhead.
I guess it's ok for now.

> +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
> +{
> +	if (sizeof(void *) == sizeof(u64))
> +		return emit(p, BPF_MOV64_REG(dreg, sreg));
> +	if (sizeof(void *) == sizeof(u32))
> +		return emit(p, BPF_MOV32_REG(dreg, sreg));

I bet that was never tested :) because... see below.

> +
> +	return false;
> +}
> +
> +static bool do_prologue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */
> +	if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1))
> +		return false;
> +
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* could load state->hook_index, but we don't support index > 0 for bpf call. */
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0)))
> +		return false;
> +
> +	return true;
> +}
> +
> +static void patch_hook_jumps(struct nf_hook_prog *p)
> +{
> +	unsigned int i;
> +
> +	if (!p->insns)
> +		return;
> +
> +	for (i = 0; i < p->pos; i++) {
> +		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
> +			continue;
> +
> +		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
> +			continue;
> +		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
> +			continue;
> +
> +		if (p->insns[i].off != JMP_INVALID)
> +			continue;
> +		p->insns[i].off = p->pos - i - 1;

Pls add a check that it fits in 16-bits.

> +	}
> +}
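
Something like this as a sketch -- the bool return type and the S16_MAX
bound are assumptions about how the real patch would want to handle an
oversized program:

static bool patch_hook_jumps(struct nf_hook_prog *p)
{
	unsigned int i;

	if (!p->insns)
		return true;

	for (i = 0; i < p->pos; i++) {
		u32 off;

		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
			continue;
		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
			continue;
		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
			continue;
		if (p->insns[i].off != JMP_INVALID)
			continue;

		off = p->pos - i - 1;

		/* insn->off is s16: refuse to emit a jump that cannot
		 * be encoded, caller then falls back to nf_hook_slow().
		 */
		if (WARN_ON_ONCE(off > S16_MAX))
			return false;

		p->insns[i].off = off;
	}

	return true;
}
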
> +
> +static bool emit_retval(struct nf_hook_prog *p, int retval)
> +{
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval)))
> +		return false;
> +
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_hook_slow(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	/* restore the original state->priv. */
> +	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* arg1 is state->skb */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +
> +	/* arg2 is "struct nf_hook_state *" */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +
> +	/* arg3 is nf_hook_entries (original state->priv) */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7)))
> +		return false;
> +
> +	if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> +		return false;
> +
> +	/* No further action needed, return retval provided by nf_hook_slow */
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_queue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (width < 0) {
> +		WARN_ON_ONCE(1);
> +		return false;
> +	}
> +
> +	/* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +	/* arg2: struct nf_hook_state * */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> +		return false;

here and other CALL work by accident on x86-64.
You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
On x86-64 it will be a nop.
On x86-32 it will do quite a bit of work.

> +
> +	/* Check nf_queue return value.  Abnormal case: nf_queue returned != 0.
> +	 *
> +	 * Fall back to nf_hook_slow().
> +	 */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2)))
> +		return false;
> +
> +	/* Normal case: skb was stolen. Return 0. */
> +	return emit_retval(p, 0);
> +}
> +
> +static bool do_epilogue_base_hooks(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* last 'hook'. We arrive here if previous hook returned ACCEPT,
> +	 * i.e. all hooks passed -- we are done.
> +	 *
> +	 * Return 1, skb can continue traversing network stack.
> +	 */
> +	if (!emit_retval(p, 1))
> +		return false;
> +
> +	/* Patch all hook jumps, in case any of these are taken
> +	 * we need to jump to this location.
> +	 *
> +	 * This happens when verdict is != ACCEPT.
> +	 */
> +	patch_hook_jumps(p);
> +
> +	/* need to ignore upper 24 bits, might contain errno or queue number */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff)))
> +		return false;
> +
> +	/* ACCEPT handled, check STOLEN. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2)))
> +		return false;
> +
> +	if (!emit_retval(p, 0))
> +		return false;
> +
> +	/* ACCEPT and STOLEN handled.  Check DROP next */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2)))
> +		return false;
> +
> +	/* First step. Extract the errno number. 1 insn. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS)))
> +		return false;
> +
> +	/* Second step: replace errno with EPERM if it was 0. 2 insns. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM)))
> +		return false;
> +
> +	/* Third step: negate reg0: Caller expects -EFOO and stash the result.  2 insns. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0)))
> +		return false;
> +
> +	/* Fourth step: free the skb. 2 insns. */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(kfree_skb)))
> +		return false;

ditto.

> +
> +	/* Last step: return. 2 insns. */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8)))
> +		return false;
> +	if (!emit(p, BPF_EXIT_INSN()))
> +		return false;
> +
> +	/* ACCEPT, STOLEN and DROP have been handled.
> +	 * REPEAT and STOP are not allowed anymore for individual hook functions.
> +	 * This leaves NFQUEUE as the only remaining return value.
> +	 *
> +	 * In this case BPF_REG_0 still contains the original verdict of
> +	 * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is.
> +	 */
> +	if (!emit_nf_queue(p))
> +		return false;
> +
> +	/* Increment hook index and store it in nf_hook_state so nf_hook_slow will
> +	 * start at the next hook, if any.
> +	 */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +
> +	return emit_nf_hook_slow(p);
> +}
> +
> +static int nf_hook_prog_init(struct nf_hook_prog *p)
> +{
> +	memset(p, 0, sizeof(*p));
> +
> +	p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL);
> +	if (!p->insns)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void nf_hook_prog_free(struct nf_hook_prog *p)
> +{
> +	kfree(p->insns);
> +}
> +
> +static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e)
> +{
> +	unsigned int i, len;
> +
> +	len = e->num_hook_entries;
> +
> +	if (!do_prologue(p))
> +		goto out;
> +
> +	for (i = 0; i < len; i++) {
> +		if (!xlate_one_hook(p, e, &e->hooks[i]))
> +			goto out;
> +
> +		if (i + 1 < len) {
> +			if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6)))
> +				goto out;
> +
> +			if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +				goto out;
> +		}
> +	}
> +
> +	if (!do_epilogue_base_hooks(p))
> +		goto out;
> +
> +	return 0;
> +out:
> +	return -EINVAL;
> +}
> +
> +static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
> +{
> +	struct bpf_prog *prog;
> +	int err = 0;
> +
> +	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
> +	if (!prog)
> +		return NULL;
> +
> +	prog->len = len;
> +	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;

lol. Just say BPF_PROG_TYPE_UNSPEC ?

> +	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
> +
> +	prog = bpf_prog_select_runtime(prog, &err);
> +	if (err) {
> +		bpf_prog_free(prog);
> +		return NULL;
> +	}

Would be good to do bpf_prog_alloc_id() so it can be seen in
bpftool prog show.
and bpf_prog_kallsyms_add() to make 'perf report' and
stack traces readable.
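
Something along these lines at the tail of nf_hook_jit_compile(), as a
sketch (bpf_prog_alloc_id() is currently static in kernel/bpf/syscall.c,
so making it callable from netfilter is an assumption):

	/* after bpf_prog_select_runtime() succeeded: make the generated
	 * program visible to tooling -- an id for 'bpftool prog show',
	 * a kallsyms entry so perf and stack traces can resolve the
	 * jited image by name.
	 */
	err = bpf_prog_alloc_id(prog);
	if (err) {
		bpf_prog_free(prog);
		return NULL;
	}
	bpf_prog_kallsyms_add(prog);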

Overall I don't hate it, but don't like it either.
Please provide performance numbers.
It's a lot of tricky code and not clear what the benefits are.
Who will maintain this body of code long term?
How are we going to deal with refactoring that will touch generic bpf bits
and this generated prog?

> Purpose of this is to eventually add a 'netfilter prog type' to bpf and
> permit attachment of (userspace generated) bpf programs to the netfilter
> machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.
> 
> This will require exposing the context structure (the program argument,
> '__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.

This part is orthogonal, right? I don't see how this work is connected
to above idea.
I'm still convinced that xt_bpf was a bad choice for many reasons.
"Add a 'netfilter prog type' to bpf" would repeat the same mistakes.
Let's evaluate this set independently.
Florian Westphal Oct. 6, 2022, 1:51 p.m. UTC | #2
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > +#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
> > +		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
> > +
> > +		nf_hook_state_init(&state, hook, pf, indev, outdev,
> > +				   sk, net, okfn);
> > +
> > +		state.priv = (void *)hook_head;
> > +		state.skb = skb;
> >  
> > +		migrate_disable();
> > +		ret = bpf_prog_run_nf(p, &state);
> > +		migrate_enable();
> 
> Since generated prog doesn't do any per-cpu work and not using any maps
> there is no need for migrate_disable.
> There is cant_migrate() in __bpf_prog_run(), but it's probably better
> to silence that instead of adding migrate_disable/enable overhead.

Ah, thanks -- noted.

> > +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
> > +{
> > +	if (sizeof(void *) == sizeof(u64))
> > +		return emit(p, BPF_MOV64_REG(dreg, sreg));
> > +	if (sizeof(void *) == sizeof(u32))
> > +		return emit(p, BPF_MOV32_REG(dreg, sreg));
> 
> I bet that was never tested :) because... see below.

Right, never tested, only on amd64 arch.

I suspect that real 32bit support won't reduce readability too much,
else I can either remove it or add it in a different patch.

> > +static void patch_hook_jumps(struct nf_hook_prog *p)
> > +{
> > +	unsigned int i;
> > +
> > +	if (!p->insns)
> > +		return;
> > +
> > +	for (i = 0; i < p->pos; i++) {
> > +		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
> > +			continue;
> > +
> > +		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
> > +			continue;
> > +		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
> > +			continue;
> > +
> > +		if (p->insns[i].off != JMP_INVALID)
> > +			continue;
> > +		p->insns[i].off = p->pos - i - 1;
> 
> Pls add a check that it fits in 16-bits.

Makes sense.

> > +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > +		return false;
> 
> here and other CALL work by accident on x86-64.
> You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
> On x86-64 it will be a nop.
> On x86-32 it will do quite a bit of work.

I see. thanks.

> > +	prog->len = len;
> > +	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;
> 
> lol. Just say BPF_PROG_TYPE_UNSPEC ?

Right, will do that.

> > +	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
> > +
> > +	prog = bpf_prog_select_runtime(prog, &err);
> > +	if (err) {
> > +		bpf_prog_free(prog);
> > +		return NULL;
> > +	}
> 
> Would be good to do bpf_prog_alloc_id() so it can be seen in
> bpftool prog show.

Agree.

> and bpf_prog_kallsyms_add() to make 'perf report' and
> stack traces readable.

Good to know, will check that this works.

> Overall I don't hate it, but don't like it either.
> Please provide performance numbers.

Oh, right, I should have included those in the cover letter.
Tests were done on 5.19-rc3 on a 56-core Intel machine using pktgen
(based off pktgen_bench_xmit_mode_netif_receive.sh), i.e.
64-byte UDP packets that get forwarded to a dummy device.

Ruleset had single 'ct state new accept' rule in forward chain.

Baseline, with 56-rx queues: 682006 pps, 348 Mb/s
with this patchset:          696743 pps, 356 Mb/s

Averaged over 10 runs each, also reboot after each run.
irqbalance was off, scaling_governor set to 'performance'.

I would redo those tests for future patch submission.
If there is a particular test i should do please let me know.

I also did a test via iperf3 forwarding
(netns -> veth1 -> netns -> veth -> netns), but 'improvement'
was in noise range, too much overhead for the indirection avoidance
to be noticeable.

> It's a lot of tricky code and not clear what the benefits are.
> Who will maintain this body of code long term?
> How are we going to deal with refactoring that will touch generic bpf bits
> and this generated prog?

Good questions.  The only 'good' answer is that it could always be
marked BROKEN and then reverted if needed as it doesn't add new
functionality per se.

Furthermore (I have NOT looked at this at all) this opens the door for
more complexity/trickery.  For example the bpf prog could check (during
code generation) if $indirect_hook is the ipv4 or ipv6 defrag hook and
then insert extra code that avoids the function call for the common
case.  There are probably more hack^W tricks that could be done.

So yes, maintainability is a good question, plus whether other users in the
tree might want something similar (selinux hook invocation, for
example...).

I guess it depends on whether the perf numbers are decent enough.
If they are, then I'd suggest to just do a live experiment and give
it a try -- if it turns out to be a big pain point
(maintenance, frequent crashes, hard-to-debug correctness bugs, e.g.
 'generator failed to re-jit and now it skips my iptables filter
 table', ...) or whatever, mark it as BROKEN in Kconfig and, if
everything fails, just rip it out again.

Does that sound ok?

> > Purpose of this is to eventually add a 'netfilter prog type' to bpf and
> > permit attachment of (userspace generated) bpf programs to the netfilter
> > machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.
> > 
> > This will require exposing the context structure (the program argument,
> > '__nf_hook_state'), with accesses rewritten to match the nf_hook_state layout.
> 
> This part is orthogonal, right? I don't see how this work is connected
> to above idea.

Yes, orthogonal from technical pov.

> I'm still convinced that xt_bpf was a bad choice for many reasons.

Hmmm, ok -- there is nothing I can say, it looks reasonably
innocent/harmless to me wrt. backwards kludge risk etc.

> "Add a 'netfilter prog type' to bpf" would repeat the same mistakes.

Hmm, to me it would be more like the 'xtc/tcx' stuff rather than
cls/act_bpf/xt_bpf etc. pp.  but perhaps I'm missing something.

> Let's evaluate this set independently.

Ok, sure.
Florian Westphal Oct. 7, 2022, 11:45 a.m. UTC | #3
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> > +				 offsetof(struct nf_hook_state, hook_index))))
> > +		return false;
> > +	/* arg2: struct nf_hook_state * */
> > +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> > +		return false;
> > +	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> > +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> > +		return false;
> > +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > +		return false;
> 
> here and other CALL work by accident on x86-64.
> You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.

Do you mean this? :

BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *,
           state, unsigned int, verdict)
{
     return nf_queue(skb, state, verdict);
}

-       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
+       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))

?

If yes, I don't see how this will work for the case where I only have an
address, i.e.:

if (!emit(p, BPF_EMIT_CALL(h->hook))) ....

(Also, the address might be in a kernel module)

> On x86-64 it will be a nop.
> On x86-32 it will do quite a bit of work.

If this is only a problem for 32bit arches, I could also make this
'depends on CONFIG_64BIT'.
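
As a Kconfig sketch (the symbol is 64BIT, the CONFIG_ prefix is only the
C-side spelling):

config NF_HOOK_BPF
	bool "netfilter base hook bpf translator"
	depends on BPF_JIT
	depends on 64BIT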

But perhaps I am on the wrong track, I see existing code doing:
        *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);

(kernel/bpf/hashtab.c).

> > +	prog = bpf_prog_select_runtime(prog, &err);
> > +	if (err) {
> > +		bpf_prog_free(prog);
> > +		return NULL;
> > +	}
> 
> Would be good to do bpf_prog_alloc_id() so it can be seen in
> bpftool prog show.

Thanks a lot for the hint:

39: unspec  tag 0000000000000000
xlated 416B  jited 221B  memlock 4096B

bpftool prog  dump xlated id 39
   0: (bf) r6 = r1
   1: (79) r7 = *(u64 *)(r1 +8)
   2: (b4) w8 = 0
   3: (85) call ipv6_defrag#526144928
   4: (55) if r0 != 0x1 goto pc+24
   5: (bf) r1 = r6
   6: (04) w8 += 1
   7: (85) call ipv6_conntrack_in#526206096
   [..]
Alexei Starovoitov Oct. 7, 2022, 7:08 p.m. UTC | #4
On Fri, Oct 7, 2022 at 4:45 AM Florian Westphal <fw@strlen.de> wrote:
>
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > +   if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> > > +                            offsetof(struct nf_hook_state, hook_index))))
> > > +           return false;
> > > +   /* arg2: struct nf_hook_state * */
> > > +   if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> > > +           return false;
> > > +   /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> > > +   if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> > > +           return false;
> > > +   if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> > > +           return false;
> >
> > here and other CALL work by accident on x86-64.
> > You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
>
> Do you mean this? :
>
> BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *,
>            state, unsigned int, verdict)
> {
>      return nf_queue(skb, state, verdict);
> }

yep.

>
> -       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> +       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))
>
> ?
>
> If yes, I don't see how this will work for the case where I only have an
> address, i.e.:
>
> if (!emit(p, BPF_EMIT_CALL(h->hook))) ....
>
> (Also, the address might be in a kernel module)
>
> > On x86-64 it will be a nop.
> > On x86-32 it will do quite a bit of work.
>
> > If this is only a problem for 32bit arches, I could also make this
> 'depends on CONFIG_64BIT'.

If that's acceptable, sure.

> But perhaps I am on the wrong track, I see existing code doing:
>         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);

Yes, because we do:
                /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
                 * and other inlining handlers are currently limited to 64 bit
                 * only.
                 */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&


I think you already gate this feature with jit_requested?
Otherwise it's going to be slow in the interpreter.

> (kernel/bpf/hashtab.c).
>
> > > +   prog = bpf_prog_select_runtime(prog, &err);
> > > +   if (err) {
> > > +           bpf_prog_free(prog);
> > > +           return NULL;
> > > +   }
> >
> > Would be good to do bpf_prog_alloc_id() so it can be seen in
> > bpftool prog show.
>
> Thanks a lot for the hint:
>
> 39: unspec  tag 0000000000000000
> xlated 416B  jited 221B  memlock 4096B

Probably should do bpf_prog_calc_tag() too.
And please give it some meaningful name.
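
E.g. as a sketch, before bpf_prog_select_runtime() in
nf_hook_jit_compile() -- the "nf_hook_base" name is only a placeholder:

	/* tag + name so the program shows up sensibly in bpftool/perf */
	if (bpf_prog_calc_tag(prog)) {
		bpf_prog_free(prog);
		return NULL;
	}
	snprintf(prog->aux->name, sizeof(prog->aux->name), "nf_hook_base");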

> bpftool prog  dump xlated id 39
>    0: (bf) r6 = r1
>    1: (79) r7 = *(u64 *)(r1 +8)
>    2: (b4) w8 = 0
>    3: (85) call ipv6_defrag#526144928
>    4: (55) if r0 != 0x1 goto pc+24
>    5: (bf) r1 = r6
>    6: (04) w8 += 1
>    7: (85) call ipv6_conntrack_in#526206096
>    [..]

Nice.
bpftool prog profile
should work too.
Florian Westphal Oct. 7, 2022, 7:35 p.m. UTC | #5
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > -       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> > +       if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf)))
> >
> > ?
> >
> > If yes, I don't see how this will work for the case where I only have an
> > address, i.e.:
> >
> > if (!emit(p, BPF_EMIT_CALL(h->hook))) ....
> >
> > (Also, the address might be in a kernel module)
> >
> > > On x86-64 it will be a nop.
> > > On x86-32 it will do quite a bit of work.
> >
> > If this is only a problem for 32bit arches, I could also make this
> > 'depends on CONFIG_64BIT'.
> 
> If that's acceptable, sure.

Good, thanks!

> > But perhaps I am on the wrong track, I see existing code doing:
> >         *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
> 
> Yes, because we do:
>                 /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
>                  * and other inlining handlers are currently limited to 64 bit
>                  * only.
>                  */
>                 if (prog->jit_requested && BITS_PER_LONG == 64 &&

Ah, thanks, makes sense.

> I think you already gate this feature with jit_requested?
> Otherwise it's going to be slow in the interpreter.

Right, use of bpf interpreter is silly for this.
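
A sketch of such a gate in nf_hook_jit_compile(), after
bpf_prog_select_runtime() succeeds (where exactly it should live is an
assumption here):

	/* no point in running the unrolled program in the interpreter,
	 * let nf_hook() keep using the nf_hook_slow() fallback instead.
	 */
	if (!prog->jited) {
		bpf_prog_free(prog);
		return NULL;
	}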

> > 39: unspec  tag 0000000000000000
> > xlated 416B  jited 221B  memlock 4096B
> 
> Probably should do bpf_prog_calc_tag() too.
> And please give it some meaningful name.

Agree, will add this.

Patch

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 7c604ef8e8cb..b7874b772dd1 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -2,6 +2,7 @@ 
 #ifndef __LINUX_NETFILTER_H
 #define __LINUX_NETFILTER_H
 
+#include <linux/filter.h>
 #include <linux/init.h>
 #include <linux/skbuff.h>
 #include <linux/net.h>
@@ -106,6 +107,9 @@  struct nf_hook_entries_rcu_head {
 };
 
 struct nf_hook_entries {
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	struct bpf_prog			*hook_prog;
+#endif
 	u16				num_hook_entries;
 	/* padding */
 	struct nf_hook_entry		hooks[];
@@ -205,6 +209,17 @@  int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 
 void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
 		       const struct nf_hook_entries *e);
+
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+DECLARE_BPF_DISPATCHER(nf_hook_base);
+
+static __always_inline int bpf_prog_run_nf(const struct bpf_prog *prog,
+					   struct nf_hook_state *state)
+{
+	return __bpf_prog_run(prog, state, BPF_DISPATCHER_FUNC(nf_hook_base));
+}
+#endif
+
 /**
  *	nf_hook - call a netfilter hook
  *
@@ -213,17 +228,17 @@  void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state,
  *	value indicates the packet has been consumed by the hook.
  */
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
-			  struct sock *sk, struct sk_buff *skb,
-			  struct net_device *indev, struct net_device *outdev,
-			  int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+		struct sock *sk, struct sk_buff *skb,
+		struct net_device *indev, struct net_device *outdev,
+		int (*okfn)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct nf_hook_entries *hook_head = NULL;
 	int ret = 1;
 
 #ifdef CONFIG_JUMP_LABEL
 	if (__builtin_constant_p(pf) &&
-	    __builtin_constant_p(hook) &&
-	    !static_key_false(&nf_hooks_needed[pf][hook]))
+			__builtin_constant_p(hook) &&
+			!static_key_false(&nf_hooks_needed[pf][hook]))
 		return 1;
 #endif
 
@@ -254,11 +269,24 @@  static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
 
 	if (hook_head) {
 		struct nf_hook_state state;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
+
+		nf_hook_state_init(&state, hook, pf, indev, outdev,
+				   sk, net, okfn);
+
+		state.priv = (void *)hook_head;
+		state.skb = skb;
 
+		migrate_disable();
+		ret = bpf_prog_run_nf(p, &state);
+		migrate_enable();
+#else
 		nf_hook_state_init(&state, hook, pf, indev, outdev,
 				   sk, net, okfn);
 
 		ret = nf_hook_slow(skb, &state, hook_head);
+#endif
 	}
 	rcu_read_unlock();
 
@@ -336,10 +364,38 @@  NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
 
 	if (hook_head) {
 		struct nf_hook_state state;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+		const struct bpf_prog *p = hook_head->hook_prog;
+		struct sk_buff *skb, *next;
+		struct list_head sublist;
+		int ret;
+
+		nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
+
+		INIT_LIST_HEAD(&sublist);
 
+		migrate_disable();
+
+		list_for_each_entry_safe(skb, next, head, list) {
+			skb_list_del_init(skb);
+
+			state.priv = (void *)hook_head;
+			state.skb = skb;
+
+			ret = bpf_prog_run_nf(p, &state);
+			if (ret == 1)
+				list_add_tail(&skb->list, &sublist);
+		}
+
+		migrate_enable();
+
+		/* Put passed packets back on main list */
+		list_splice(&sublist, head);
+#else
 		nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn);
 
 		nf_hook_slow_list(head, &state, hook_head);
+#endif
 	}
 	rcu_read_unlock();
 }
diff --git a/include/net/netfilter/nf_hook_bpf.h b/include/net/netfilter/nf_hook_bpf.h
new file mode 100644
index 000000000000..1792f97a806d
--- /dev/null
+++ b/include/net/netfilter/nf_hook_bpf.h
@@ -0,0 +1,21 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+struct bpf_dispatcher;
+struct bpf_prog;
+
+struct bpf_prog *nf_hook_bpf_create_fb(void);
+
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n);
+
+void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to);
+#else
+static inline void
+nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *f, struct bpf_prog *t)
+{
+}
+
+static inline struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n)
+{
+	return NULL;
+}
+#endif
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 4b8d04640ff3..2610786b6ad8 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -30,6 +30,16 @@  config NETFILTER_FAMILY_BRIDGE
 config NETFILTER_FAMILY_ARP
 	bool
 
+config HAVE_NF_HOOK_BPF
+	bool
+
+config NF_HOOK_BPF
+	bool "netfilter base hook bpf translator"
+	depends on BPF_JIT
+	help
+	  This unrolls the nf_hook_slow interpreter loop with
+	  auto-generated BPF program.
+
 config NETFILTER_NETLINK_HOOK
 	tristate "Netfilter base hook dump support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 06df49ea6329..e465659e87ad 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -21,6 +21,7 @@  nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o
 endif
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
+obj-$(CONFIG_NF_HOOK_BPF) += nf_hook_bpf.o
 
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 17165f9cf4a1..6888c7fd5aeb 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -24,6 +24,7 @@ 
 #include <linux/rcupdate.h>
 #include <net/net_namespace.h>
 #include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_hook_bpf.h>
 #include <net/sock.h>
 
 #include "nf_internals.h"
@@ -47,6 +48,33 @@  static DEFINE_MUTEX(nf_hook_mutex);
 #define nf_entry_dereference(e) \
 	rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex))
 
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+DEFINE_BPF_DISPATCHER(nf_hook_base);
+
+#define NF_DISPATCHER_PTR	BPF_DISPATCHER_PTR(nf_hook_base)
+#else
+#define NF_DISPATCHER_PTR	NULL
+#endif
+
+static struct bpf_prog *fallback_nf_hook_slow;
+
+static void nf_hook_bpf_prog_set(struct nf_hook_entries *e,
+				 struct bpf_prog *p)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	WRITE_ONCE(e->hook_prog, p);
+#endif
+}
+
+static struct bpf_prog *nf_hook_bpf_prog_get(struct nf_hook_entries *e)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	if (e)
+		return e->hook_prog;
+#endif
+	return NULL;
+}
+
 static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 {
 	struct nf_hook_entries *e;
@@ -58,9 +86,23 @@  static struct nf_hook_entries *allocate_hook_entries_size(u16 num)
 	if (num == 0)
 		return NULL;
 
-	e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT);
-	if (e)
-		e->num_hook_entries = num;
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	if (!fallback_nf_hook_slow) {
+		/* never free'd */
+		fallback_nf_hook_slow = nf_hook_bpf_create_fb();
+
+		if (!fallback_nf_hook_slow)
+			return NULL;
+	}
+#endif
+
+	e = kvzalloc(alloc, GFP_KERNEL);
+	if (!e)
+		return NULL;
+
+	e->num_hook_entries = num;
+	nf_hook_bpf_prog_set(e, fallback_nf_hook_slow);
+
 	return e;
 }
 
@@ -98,6 +140,29 @@  static const struct nf_hook_ops dummy_ops = {
 	.priority = INT_MIN,
 };
 
+static void nf_hook_entries_grow_bpf(const struct nf_hook_entries *old,
+				     struct nf_hook_entries *new)
+{
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	struct bpf_prog *hook_bpf_prog = nf_hook_bpf_create(new);
+
+	/* allocate_hook_entries_size() pre-inits new->hook_prog
+	 * to a fallback program that calls nf_hook_slow().
+	 */
+	if (hook_bpf_prog) {
+		struct bpf_prog *old_prog = NULL;
+
+		new->hook_prog = hook_bpf_prog;
+
+		if (old)
+			old_prog = old->hook_prog;
+
+		nf_hook_bpf_change_prog(BPF_DISPATCHER_PTR(nf_hook_base),
+					old_prog, hook_bpf_prog);
+	}
+#endif
+}
+
 static struct nf_hook_entries *
 nf_hook_entries_grow(const struct nf_hook_entries *old,
 		     const struct nf_hook_ops *reg)
@@ -156,6 +221,7 @@  nf_hook_entries_grow(const struct nf_hook_entries *old,
 		new->hooks[nhooks].priv = reg->priv;
 	}
 
+	nf_hook_entries_grow_bpf(old, new);
 	return new;
 }
 
@@ -221,6 +287,7 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 					  struct nf_hook_entries __rcu **pp)
 {
 	unsigned int i, j, skip = 0, hook_entries;
+	struct bpf_prog *hook_bpf_prog = NULL;
 	struct nf_hook_entries *new = NULL;
 	struct nf_hook_ops **orig_ops;
 	struct nf_hook_ops **new_ops;
@@ -244,8 +311,13 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 
 	hook_entries -= skip;
 	new = allocate_hook_entries_size(hook_entries);
-	if (!new)
+	if (!new) {
+		struct bpf_prog *old_prog = nf_hook_bpf_prog_get(old);
+
+		nf_hook_bpf_prog_set(old, fallback_nf_hook_slow);
+		nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, old_prog, NULL);
 		return NULL;
+	}
 
 	new_ops = nf_hook_entries_get_hook_ops(new);
 	for (i = 0, j = 0; i < old->num_hook_entries; i++) {
@@ -256,7 +328,13 @@  static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old,
 		j++;
 	}
 	hooks_validate(new);
+
+	/* if this fails fallback prog calls nf_hook_slow. */
+	hook_bpf_prog = nf_hook_bpf_create(new);
+	if (hook_bpf_prog)
+		nf_hook_bpf_prog_set(new, hook_bpf_prog);
 out_assign:
+	nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, nf_hook_bpf_prog_get(old), hook_bpf_prog);
 	rcu_assign_pointer(*pp, new);
 	return old;
 }
@@ -609,6 +687,7 @@  int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state,
 	int ret;
 
 	state->skb = skb;
+
 	for (; s < e->num_hook_entries; s++) {
 		verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state);
 		switch (verdict & NF_VERDICT_MASK) {
@@ -783,6 +862,11 @@  int __init netfilter_init(void)
 	if (ret < 0)
 		goto err_pernet;
 
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
+	fallback_nf_hook_slow = nf_hook_bpf_create_fb();
+	WARN_ON_ONCE(!fallback_nf_hook_slow);
+#endif
+
 	return 0;
 err_pernet:
 	unregister_pernet_subsys(&netfilter_net_ops);
diff --git a/net/netfilter/nf_hook_bpf.c b/net/netfilter/nf_hook_bpf.c
new file mode 100644
index 000000000000..dab13b803801
--- /dev/null
+++ b/net/netfilter/nf_hook_bpf.c
@@ -0,0 +1,424 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/string.h>
+#include <linux/hashtable.h>
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+
+#include <net/netfilter/nf_hook_bpf.h>
+#include <net/netfilter/nf_queue.h>
+
+#define JMP_INVALID 0
+#define JIT_SIZE_MAX 0xffff
+
+/* BPF translator for netfilter hooks.
+ *
+ * Create a bpf program that can be called *instead* of nf_hook_slow().
+ * This program thus has same return value as nf_hook_slow and
+ * handles nfqueue and packet drops internally.
+ * Call nf_hook_bpf_create(struct nf_hook_entries *e, NF_HOOK_BPF_TYPE_BASE)
+ * to unroll the functions described by nf_hook_entries into such
+ * a bpf program.
+ *
+ * These bpf programs are called/run from nf_hook() inline function.
+ *
+ * Register usage is:
+ *
+ * BPF_REG_0: verdict.
+ * BPF_REG_1: struct nf_hook_state *
+ * BPF_REG_2: reserved as arg to nf_queue()
+ * BPF_REG_3: reserved as arg to nf_queue()
+ *
+ * Prologue storage:
+ * BPF_REG_6: copy of REG_1 (original struct nf_hook_state *)
+ * BPF_REG_7: copy of original state->priv value
+ * BPF_REG_8: copy of state->hook_index
+ */
+struct nf_hook_prog {
+	struct bpf_insn *insns;
+	unsigned int pos;
+};
+
+static bool emit(struct nf_hook_prog *p, struct bpf_insn insn)
+{
+	if (WARN_ON_ONCE(p->pos >= BPF_MAXINSNS))
+		return false;
+
+	p->insns[p->pos] = insn;
+	p->pos++;
+	return true;
+}
+
+static bool xlate_one_hook(struct nf_hook_prog *p, const struct nf_hook_entries *e,
+			   const struct nf_hook_entry *h)
+{
+	int width = bytes_to_bpf_size(sizeof(h->priv));
+
+	/* if priv is NULL, the called hookfn does not use the priv member. */
+	if (!h->priv)
+		goto emit_hook_call;
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* x = entries[s]->priv; */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_2, BPF_REG_7,
+				 (unsigned long)&h->priv - (unsigned long)e)))
+		return false;
+
+	/* state->priv = x */
+	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_2,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+emit_hook_call:
+	if (!emit(p, BPF_EMIT_CALL(h->hook)))
+		return false;
+
+	/* Only advance to next hook on ACCEPT verdict.
+	 * Else, skip rest and move to tail.
+	 *
+	 * Postprocessing patches the jump offset to the
+	 * correct position, after last hook.
+	 */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, NF_ACCEPT, JMP_INVALID)))
+		return false;
+
+	return true;
+}
+
+static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
+{
+	if (sizeof(void *) == sizeof(u64))
+		return emit(p, BPF_MOV64_REG(dreg, sreg));
+	if (sizeof(void *) == sizeof(u32))
+		return emit(p, BPF_MOV32_REG(dreg, sreg));
+
+	return false;
+}
+
+static bool do_prologue(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */
+	if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1))
+		return false;
+
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+	/* could load state->hook_index, but we don't support index > 0 for bpf call. */
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0)))
+		return false;
+
+	return true;
+}
+
+static void patch_hook_jumps(struct nf_hook_prog *p)
+{
+	unsigned int i;
+
+	if (!p->insns)
+		return;
+
+	for (i = 0; i < p->pos; i++) {
+		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
+			continue;
+
+		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
+			continue;
+		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
+			continue;
+
+		if (p->insns[i].off != JMP_INVALID)
+			continue;
+		p->insns[i].off = p->pos - i - 1;
+	}
+}
+
+static bool emit_retval(struct nf_hook_prog *p, int retval)
+{
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval)))
+		return false;
+
+	return emit(p, BPF_EXIT_INSN());
+}
+
+static bool emit_nf_hook_slow(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	/* restore the original state->priv. */
+	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7,
+				 offsetof(struct nf_hook_state, priv))))
+		return false;
+
+	/* arg1 is state->skb */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+
+	/* arg2 is "struct nf_hook_state *" */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
+		return false;
+
+	/* arg3 is nf_hook_entries (original state->priv) */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7)))
+		return false;
+
+	if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
+		return false;
+
+	/* No further action needed, return retval provided by nf_hook_slow */
+	return emit(p, BPF_EXIT_INSN());
+}
+
+static bool emit_nf_queue(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (width < 0) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	/* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
+				 offsetof(struct nf_hook_state, hook_index))))
+		return false;
+	/* arg2: struct nf_hook_state * */
+	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
+		return false;
+	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
+		return false;
+	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
+		return false;
+
+	/* Check nf_queue return value.  Abnormal case: nf_queue returned != 0.
+	 *
+	 * Fall back to nf_hook_slow().
+	 */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2)))
+		return false;
+
+	/* Normal case: skb was stolen. Return 0. */
+	return emit_retval(p, 0);
+}
+
+static bool do_epilogue_base_hooks(struct nf_hook_prog *p)
+{
+	int width = bytes_to_bpf_size(sizeof(void *));
+
+	if (WARN_ON_ONCE(width < 0))
+		return false;
+
+	/* last 'hook'. We arrive here if previous hook returned ACCEPT,
+	 * i.e. all hooks passed -- we are done.
+	 *
+	 * Return 1, skb can continue traversing network stack.
+	 */
+	if (!emit_retval(p, 1))
+		return false;
+
+	/* Patch all hook jumps, in case any of these are taken
+	 * we need to jump to this location.
+	 *
+	 * This happens when verdict is != ACCEPT.
+	 */
+	patch_hook_jumps(p);
+
+	/* need to ignore upper 24 bits, might contain errno or queue number */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
+		return false;
+	if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff)))
+		return false;
+
+	/* ACCEPT handled, check STOLEN. */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2)))
+		return false;
+
+	if (!emit_retval(p, 0))
+		return false;
+
+	/* ACCEPT and STOLEN handled.  Check DROP next */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2)))
+		return false;
+
+	/* First step. Extract the errno number. 1 insn. */
+	if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS)))
+		return false;
+
+	/* Second step: replace errno with EPERM if it was 0. 2 insns. */
+	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1)))
+		return false;
+	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM)))
+		return false;
+
+	/* Third step: negate reg0: Caller expects -EFOO and stash the result.  2 insns. */
+	if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0)))
+		return false;
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0)))
+		return false;
+
+	/* Fourth step: free the skb. 2 insns. */
+	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
+				 offsetof(struct nf_hook_state, skb))))
+		return false;
+	if (!emit(p, BPF_EMIT_CALL(kfree_skb)))
+		return false;
+
+	/* Last step: return. 2 insns. */
+	if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8)))
+		return false;
+	if (!emit(p, BPF_EXIT_INSN()))
+		return false;
+
+	/* ACCEPT, STOLEN and DROP have been handled.
+	 * REPEAT and STOP are not allowed anymore for individual hook functions.
+	 * This leaves NFQUEUE as the only remaining return value.
+	 *
+	 * In this case BPF_REG_0 still contains the original verdict of
+	 * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is.
+	 */
+	if (!emit_nf_queue(p))
+		return false;
+
+	/* Increment hook index and store it in nf_hook_state so nf_hook_slow will
+	 * start at the next hook, if any.
+	 */
+	if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
+		return false;
+	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
+				 offsetof(struct nf_hook_state, hook_index))))
+		return false;
+
+	return emit_nf_hook_slow(p);
+}
+
+static int nf_hook_prog_init(struct nf_hook_prog *p)
+{
+	memset(p, 0, sizeof(*p));
+
+	p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL);
+	if (!p->insns)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void nf_hook_prog_free(struct nf_hook_prog *p)
+{
+	kfree(p->insns);
+}
+
+static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e)
+{
+	unsigned int i, len;
+
+	len = e->num_hook_entries;
+
+	if (!do_prologue(p))
+		goto out;
+
+	for (i = 0; i < len; i++) {
+		if (!xlate_one_hook(p, e, &e->hooks[i]))
+			goto out;
+
+		if (i + 1 < len) {
+			if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6)))
+				goto out;
+
+			if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
+				goto out;
+		}
+	}
+
+	if (!do_epilogue_base_hooks(p))
+		goto out;
+
+	return 0;
+out:
+	return -EINVAL;
+}
+
+static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
+{
+	struct bpf_prog *prog;
+	int err = 0;
+
+	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
+	if (!prog)
+		return NULL;
+
+	prog->len = len;
+	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;
+	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
+
+	prog = bpf_prog_select_runtime(prog, &err);
+	if (err) {
+		bpf_prog_free(prog);
+		return NULL;
+	}
+
+	return prog;
+}
+
+/* fallback program, invokes nf_hook_slow interpreter.
+ *
+ * Used when a hook is unregistered and new/replacement program cannot
+ * be compiled for some reason.
+ */
+struct bpf_prog *nf_hook_bpf_create_fb(void)
+{
+	struct bpf_prog *prog = NULL;
+	struct nf_hook_prog p;
+	int err;
+
+	err = nf_hook_prog_init(&p);
+	if (err)
+		return NULL;
+
+	if (!do_prologue(&p))
+		goto err;
+
+	if (!emit_nf_hook_slow(&p))
+		goto err;
+
+	prog = nf_hook_jit_compile(p.insns, p.pos);
+err:
+	nf_hook_prog_free(&p);
+	return prog;
+}
+
+struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *new)
+{
+	struct bpf_prog *prog = NULL;
+	struct nf_hook_prog p;
+	int err;
+
+	err = nf_hook_prog_init(&p);
+	if (err)
+		return NULL;
+
+	err = xlate_base_hooks(&p, new);
+	if (err)
+		goto err;
+
+	prog = nf_hook_jit_compile(p.insns, p.pos);
+err:
+	nf_hook_prog_free(&p);
+	return prog;
+}
+
+void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to)
+{
+	bpf_dispatcher_change_prog(d, from, to);
+}