Message ID | 20221005141309.31758-7-fw@strlen.de (mailing list archive)
---|---
State | RFC
Delegated to: | Netdev Maintainers
Series | netfilter: bpf base hook program generator
On Wed, Oct 05, 2022 at 04:13:06PM +0200, Florian Westphal wrote: > > @@ -254,11 +269,24 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, > > if (hook_head) { > struct nf_hook_state state; > +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) > + const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog); > + > + nf_hook_state_init(&state, hook, pf, indev, outdev, > + sk, net, okfn); > + > + state.priv = (void *)hook_head; > + state.skb = skb; > > + migrate_disable(); > + ret = bpf_prog_run_nf(p, &state); > + migrate_enable(); Since generated prog doesn't do any per-cpu work and not using any maps there is no need for migrate_disable. There is cant_migrate() in __bpf_prog_run(), but it's probably better to silence that instead of adding migrate_disable/enable overhead. I guess it's ok for now. > +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg) > +{ > + if (sizeof(void *) == sizeof(u64)) > + return emit(p, BPF_MOV64_REG(dreg, sreg)); > + if (sizeof(void *) == sizeof(u32)) > + return emit(p, BPF_MOV32_REG(dreg, sreg)); I bet that was never tested :) because... see below. > + > + return false; > +} > + > +static bool do_prologue(struct nf_hook_prog *p) > +{ > + int width = bytes_to_bpf_size(sizeof(void *)); > + > + if (WARN_ON_ONCE(width < 0)) > + return false; > + > + /* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */ > + if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1)) > + return false; > + > + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1, > + offsetof(struct nf_hook_state, priv)))) > + return false; > + > + /* could load state->hook_index, but we don't support index > 0 for bpf call. */ > + if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0))) > + return false; > + > + return true; > +} > + > +static void patch_hook_jumps(struct nf_hook_prog *p) > +{ > + unsigned int i; > + > + if (!p->insns) > + return; > + > + for (i = 0; i < p->pos; i++) { > + if (BPF_CLASS(p->insns[i].code) != BPF_JMP) > + continue; > + > + if (p->insns[i].code == (BPF_EXIT | BPF_JMP)) > + continue; > + if (p->insns[i].code == (BPF_CALL | BPF_JMP)) > + continue; > + > + if (p->insns[i].off != JMP_INVALID) > + continue; > + p->insns[i].off = p->pos - i - 1; Pls add a check that it fits in 16-bits. > + } > +} > + > +static bool emit_retval(struct nf_hook_prog *p, int retval) > +{ > + if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval))) > + return false; > + > + return emit(p, BPF_EXIT_INSN()); > +} > + > +static bool emit_nf_hook_slow(struct nf_hook_prog *p) > +{ > + int width = bytes_to_bpf_size(sizeof(void *)); > + > + /* restore the original state->priv. 
*/ > + if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7, > + offsetof(struct nf_hook_state, priv)))) > + return false; > + > + /* arg1 is state->skb */ > + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, > + offsetof(struct nf_hook_state, skb)))) > + return false; > + > + /* arg2 is "struct nf_hook_state *" */ > + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) > + return false; > + > + /* arg3 is nf_hook_entries (original state->priv) */ > + if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7))) > + return false; > + > + if (!emit(p, BPF_EMIT_CALL(nf_hook_slow))) > + return false; > + > + /* No further action needed, return retval provided by nf_hook_slow */ > + return emit(p, BPF_EXIT_INSN()); > +} > + > +static bool emit_nf_queue(struct nf_hook_prog *p) > +{ > + int width = bytes_to_bpf_size(sizeof(void *)); > + > + if (width < 0) { > + WARN_ON_ONCE(1); > + return false; > + } > + > + /* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */ > + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, > + offsetof(struct nf_hook_state, skb)))) > + return false; > + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, > + offsetof(struct nf_hook_state, hook_index)))) > + return false; > + /* arg2: struct nf_hook_state * */ > + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) > + return false; > + /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */ > + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) > + return false; > + if (!emit(p, BPF_EMIT_CALL(nf_queue))) > + return false; here and other CALL work by accident on x84-64. You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper. On x86-64 it will be a nop. On x86-32 it will do quite a bit of work. > + > + /* Check nf_queue return value. Abnormal case: nf_queue returned != 0. > + * > + * Fall back to nf_hook_slow(). > + */ > + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2))) > + return false; > + > + /* Normal case: skb was stolen. Return 0. */ > + return emit_retval(p, 0); > +} > + > +static bool do_epilogue_base_hooks(struct nf_hook_prog *p) > +{ > + int width = bytes_to_bpf_size(sizeof(void *)); > + > + if (WARN_ON_ONCE(width < 0)) > + return false; > + > + /* last 'hook'. We arrive here if previous hook returned ACCEPT, > + * i.e. all hooks passed -- we are done. > + * > + * Return 1, skb can continue traversing network stack. > + */ > + if (!emit_retval(p, 1)) > + return false; > + > + /* Patch all hook jumps, in case any of these are taken > + * we need to jump to this location. > + * > + * This happens when verdict is != ACCEPT. > + */ > + patch_hook_jumps(p); > + > + /* need to ignore upper 24 bits, might contain errno or queue number */ > + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) > + return false; > + if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff))) > + return false; > + > + /* ACCEPT handled, check STOLEN. */ > + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2))) > + return false; > + > + if (!emit_retval(p, 0)) > + return false; > + > + /* ACCEPT and STOLEN handled. Check DROP next */ > + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2))) > + return false; > + > + /* First step. Extract the errno number. 1 insn. */ > + if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS))) > + return false; > + > + /* Second step: replace errno with EPERM if it was 0. 2 insns. 
*/ > + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1))) > + return false; > + if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM))) > + return false; > + > + /* Third step: negate reg0: Caller expects -EFOO and stash the result. 2 insns. */ > + if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0))) > + return false; > + if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0))) > + return false; > + > + /* Fourth step: free the skb. 2 insns. */ > + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, > + offsetof(struct nf_hook_state, skb)))) > + return false; > + if (!emit(p, BPF_EMIT_CALL(kfree_skb))) > + return false; ditto. > + > + /* Last step: return. 2 insns. */ > + if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8))) > + return false; > + if (!emit(p, BPF_EXIT_INSN())) > + return false; > + > + /* ACCEPT, STOLEN and DROP have been handled. > + * REPEAT and STOP are not allowed anymore for individual hook functions. > + * This leaves NFQUEUE as only remaing return value. > + * > + * In this case BPF_REG_0 still contains the original verdict of > + * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is. > + */ > + if (!emit_nf_queue(p)) > + return false; > + > + /* Increment hook index and store it in nf_hook_state so nf_hook_slow will > + * start at the next hook, if any. > + */ > + if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1))) > + return false; > + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, > + offsetof(struct nf_hook_state, hook_index)))) > + return false; > + > + return emit_nf_hook_slow(p); > +} > + > +static int nf_hook_prog_init(struct nf_hook_prog *p) > +{ > + memset(p, 0, sizeof(*p)); > + > + p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL); > + if (!p->insns) > + return -ENOMEM; > + > + return 0; > +} > + > +static void nf_hook_prog_free(struct nf_hook_prog *p) > +{ > + kfree(p->insns); > +} > + > +static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e) > +{ > + unsigned int i, len; > + > + len = e->num_hook_entries; > + > + if (!do_prologue(p)) > + goto out; > + > + for (i = 0; i < len; i++) { > + if (!xlate_one_hook(p, e, &e->hooks[i])) > + goto out; > + > + if (i + 1 < len) { > + if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6))) > + goto out; > + > + if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1))) > + goto out; > + } > + } > + > + if (!do_epilogue_base_hooks(p)) > + goto out; > + > + return 0; > +out: > + return -EINVAL; > +} > + > +static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len) > +{ > + struct bpf_prog *prog; > + int err = 0; > + > + prog = bpf_prog_alloc(bpf_prog_size(len), 0); > + if (!prog) > + return NULL; > + > + prog->len = len; > + prog->type = BPF_PROG_TYPE_SOCKET_FILTER; lol. Just say BPF_PROG_TYPE_UNSPEC ? > + memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn)); > + > + prog = bpf_prog_select_runtime(prog, &err); > + if (err) { > + bpf_prog_free(prog); > + return NULL; > + } Would be good to do bpf_prog_alloc_id() so it can be seen in bpftool prog show. and bpf_prog_kallsyms_add() to make 'perf report' and stack traces readable. Overall I don't hate it, but don't like it either. Please provide performance numbers. It's a lot of tricky code and not clear what the benefits are. Who will maintain this body of code long term? How are we going to deal with refactoring that will touch generic bpf bits and this generated prog? 
> Purpose of this is to eventually add a 'netfilter prog type' to bpf and > permit attachment of (userspace generated) bpf programs to the netfilter > machinery, e.g. 'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'. > > This will require to expose the context structure (program argument, > '__nf_hook_state', with rewriting accesses to match nf_hook_state layout. This part is orthogonal, right? I don't see how this work is connected to above idea. I'm still convinced that xt_bpf was a bad choice for many reasons. "Add a 'netfilter prog type' to bpf" would repeat the same mistakes. Let's evaluate this set independently.
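For reference, a minimal sketch of the 16-bit bounds check requested above, assuming patch_hook_jumps() is changed to return bool (the posted version returns void) so its caller in do_epilogue_base_hooks() could abort and leave the fallback nf_hook_slow() program in place:

static bool patch_hook_jumps(struct nf_hook_prog *p)
{
	unsigned int i;

	if (!p->insns)
		return true;

	for (i = 0; i < p->pos; i++) {
		unsigned int delta = p->pos - i - 1;

		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
			continue;
		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
			continue;
		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
			continue;
		if (p->insns[i].off != JMP_INVALID)
			continue;

		/* insn->off is s16; with a BPF_MAXINSNS-sized insn buffer the
		 * offset cannot overflow today, but check rather than truncate.
		 */
		if (WARN_ON_ONCE(delta > S16_MAX))
			return false;

		p->insns[i].off = delta;
	}

	return true;
}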
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) > > + const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog); > > + > > + nf_hook_state_init(&state, hook, pf, indev, outdev, > > + sk, net, okfn); > > + > > + state.priv = (void *)hook_head; > > + state.skb = skb; > > > > + migrate_disable(); > > + ret = bpf_prog_run_nf(p, &state); > > + migrate_enable(); > > Since generated prog doesn't do any per-cpu work and not using any maps > there is no need for migrate_disable. > There is cant_migrate() in __bpf_prog_run(), but it's probably better > to silence that instead of adding migrate_disable/enable overhead. Ah, thanks -- noted. > > +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg) > > +{ > > + if (sizeof(void *) == sizeof(u64)) > > + return emit(p, BPF_MOV64_REG(dreg, sreg)); > > + if (sizeof(void *) == sizeof(u32)) > > + return emit(p, BPF_MOV32_REG(dreg, sreg)); > > I bet that was never tested :) because... see below. Right, never tested, only on amd64 arch. I suspect that real 32bit support won't reduce readability too much, else I can either remove it or add it in a different patch. > > +static void patch_hook_jumps(struct nf_hook_prog *p) > > +{ > > + unsigned int i; > > + > > + if (!p->insns) > > + return; > > + > > + for (i = 0; i < p->pos; i++) { > > + if (BPF_CLASS(p->insns[i].code) != BPF_JMP) > > + continue; > > + > > + if (p->insns[i].code == (BPF_EXIT | BPF_JMP)) > > + continue; > > + if (p->insns[i].code == (BPF_CALL | BPF_JMP)) > > + continue; > > + > > + if (p->insns[i].off != JMP_INVALID) > > + continue; > > + p->insns[i].off = p->pos - i - 1; > > Pls add a check that it fits in 16-bits. Makes sense. > > + if (!emit(p, BPF_EMIT_CALL(nf_queue))) > > + return false; > > here and other CALL work by accident on x84-64. > You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper. > On x86-64 it will be a nop. > On x86-32 it will do quite a bit of work. I see. thanks. > > + prog->len = len; > > + prog->type = BPF_PROG_TYPE_SOCKET_FILTER; > > lol. Just say BPF_PROG_TYPE_UNSPEC ? Right, will do that. > > + memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn)); > > + > > + prog = bpf_prog_select_runtime(prog, &err); > > + if (err) { > > + bpf_prog_free(prog); > > + return NULL; > > + } > > Would be good to do bpf_prog_alloc_id() so it can be seen in > bpftool prog show. Agree. > and bpf_prog_kallsyms_add() to make 'perf report' and > stack traces readable. Good to know, will check that this works. > Overall I don't hate it, but don't like it either. > Please provide performance numbers. Oh, right, I should have included those in the cover letter. Tests were done on 5.19-rc3 on a 56core intel machine using pktgen, (based off pktgen_bench_xmit_mode_netif_receive.sh), i.e. 64byte udp packets that get forwarded to a dummy device. Ruleset had single 'ct state new accept' rule in forward chain. Baseline, with 56-rx queues: 682006 pps, 348 Mb/s with this patchset: 696743 pps, 356 MB/s Averaged over 10 runs each, also reboot after each run. irqbalance was off, scaling_governor set to 'performance'. I would redo those tests for future patch submission. If there is a particular test i should do please let me know. I also did a test via iperf3 forwarding (netns -> veth1 -> netns -> veth -> netns), but 'improvement' was in noise range, too much overhead for the indirection avoidance to be noticeable. > It's a lot of tricky code and not clear what the benefits are. 
> Who will maintain this body of code long term? > How are we going to deal with refactoring that will touch generic bpf bits > and this generated prog? Good questions. The only 'good' answer is that it could always be marked BROKEN and then reverted if needed as it doesn't add new functionality per se. Furthermore (I have NOT looked at this at all) this opens the door for more complexity/trickery. For example the bpf prog could check (during code generation) if $indirect_hook is the ipv4 or ipv6 defrag hook and then insert extra code that avoids the function call for the common case. There are probably more hack^W tricks that could be done. So yes, maintainability is a good question, plus what other users in the tree might want something similar (selinux hook invocation for example...). I guess it depends on whether the perf numbers are decent enough. If they are, then I'd suggest to just do a live experiment and give it a try -- if it turns out to be a big pain point (maintenance, frequent crashes, hard-to-debug correctness bugs, e.g. 'generator failed to re-jit and now it skips my iptables filter table',...) or whatever, mark it as BROKEN in Kconfig and, if everything fails, just rip it out again. Does that sound ok? > > Purpose of this is to eventually add a 'netfilter prog type' to bpf and > > permit attachment of (userspace generated) bpf programs to the netfilter > > machinery, e.g. 'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'. > > > > This will require to expose the context structure (program argument, > > '__nf_hook_state', with rewriting accesses to match nf_hook_state layout. > > This part is orthogonal, right? I don't see how this work is connected > to above idea. Yes, orthogonal from technical pov. > I'm still convinced that xt_bpf was a bad choice for many reasons. Hmmm, ok -- there is nothing I can say, it looks reasonably innocent/harmless to me wrt. backwards kludge risk etc. > "Add a 'netfilter prog type' to bpf" would repeat the same mistakes. Hmm, to me it would be more like the 'xtc/tcx' stuff rather than cls/act_bpf/xt_bpf etc. pp. but perhaps I'm missing something. > Let's evaluate this set independently. Ok, sure.
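As an aside, the exact pktgen invocation for the benchmark described earlier in this mail is not spelled out; a hypothetical command line in the spirit of the in-tree sample script (flag names assumed from samples/pktgen/parameters.sh, device name and thread count made up for illustration, none of these values are taken from the thread) might look like:

	# inject 64 byte UDP packets into the stack of eth0 via netif_receive,
	# 56 generator threads, -n 0 = keep sending until interrupted
	./pktgen_bench_xmit_mode_netif_receive.sh -i eth0 -s 64 -t 56 -n 0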
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, > > + offsetof(struct nf_hook_state, hook_index)))) > > + return false; > > + /* arg2: struct nf_hook_state * */ > > + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) > > + return false; > > + /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */ > > + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) > > + return false; > > + if (!emit(p, BPF_EMIT_CALL(nf_queue))) > > + return false; > > here and other CALL work by accident on x84-64. > You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper. Do you mean this? : BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *, state, unsigned int, verdict) { return nf_queue(skb, state, verdict); } - if (!emit(p, BPF_EMIT_CALL(nf_hook_slow))) + if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf))) ? If yes, I don't see how this will work for the case where I only have an address, i.e.: if (!emit(p, BPF_EMIT_CALL(h->hook))) .... (Also, the address might be in a kernel module) > On x86-64 it will be a nop. > On x86-32 it will do quite a bit of work. If this only a problem for 32bit arches, I could also make this 'depends on CONFIG_64BIT'. But perhaps I am on the wrong track, I see existing code doing: *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); (kernel/bpf/hashtab.c). > > + prog = bpf_prog_select_runtime(prog, &err); > > + if (err) { > > + bpf_prog_free(prog); > > + return NULL; > > + } > > Would be good to do bpf_prog_alloc_id() so it can be seen in > bpftool prog show. Thanks a lot for the hint: 39: unspec tag 0000000000000000 xlated 416B jited 221B memlock 4096B bpftool prog dump xlated id 39 0: (bf) r6 = r1 1: (79) r7 = *(u64 *)(r1 +8) 2: (b4) w8 = 0 3: (85) call ipv6_defrag#526144928 4: (55) if r0 != 0x1 goto pc+24 5: (bf) r1 = r6 6: (04) w8 += 1 7: (85) call ipv6_conntrack_in#526206096 [..]
On Fri, Oct 7, 2022 at 4:45 AM Florian Westphal <fw@strlen.de> wrote: > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > > + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, > > > + offsetof(struct nf_hook_state, hook_index)))) > > > + return false; > > > + /* arg2: struct nf_hook_state * */ > > > + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) > > > + return false; > > > + /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */ > > > + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) > > > + return false; > > > + if (!emit(p, BPF_EMIT_CALL(nf_queue))) > > > + return false; > > > > here and other CALL work by accident on x84-64. > > You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper. > > Do you mean this? : > > BPF_CALL_3(nf_queue_bpf, struct sk_buff *, skb, struct nf_hook_state *, > state, unsigned int, verdict) > { > return nf_queue(skb, state, verdict); > } yep. > > - if (!emit(p, BPF_EMIT_CALL(nf_hook_slow))) > + if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf))) > > ? > > If yes, I don't see how this will work for the case where I only have an > address, i.e.: > > if (!emit(p, BPF_EMIT_CALL(h->hook))) .... > > (Also, the address might be in a kernel module) > > > On x86-64 it will be a nop. > > On x86-32 it will do quite a bit of work. > > If this only a problem for 32bit arches, I could also make this > 'depends on CONFIG_64BIT'. If that's acceptable, sure. > But perhaps I am on the wrong track, I see existing code doing: > *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); Yes, because we do: /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. */ if (prog->jit_requested && BITS_PER_LONG == 64 && I think you already gate this feature with jit_requested? Otherwise it's going to be slow in the interpreter. > (kernel/bpf/hashtab.c). > > > > + prog = bpf_prog_select_runtime(prog, &err); > > > + if (err) { > > > + bpf_prog_free(prog); > > > + return NULL; > > > + } > > > > Would be good to do bpf_prog_alloc_id() so it can be seen in > > bpftool prog show. > > Thanks a lot for the hint: > > 39: unspec tag 0000000000000000 > xlated 416B jited 221B memlock 4096B Probably should do bpf_prog_calc_tag() too. And please give it some meaningful name. > bpftool prog dump xlated id 39 > 0: (bf) r6 = r1 > 1: (79) r7 = *(u64 *)(r1 +8) > 2: (b4) w8 = 0 > 3: (85) call ipv6_defrag#526144928 > 4: (55) if r0 != 0x1 goto pc+24 > 5: (bf) r1 = r6 > 6: (04) w8 += 1 > 7: (85) call ipv6_conntrack_in#526206096 > [..] Nice. bpftool prog profile should work too.
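For completeness, the profiling suggested above could be driven roughly like this (syntax per bpftool-prog's 'profile' subcommand; prog id 39 is the one from the earlier dump and will differ on other systems):

	# count cycles and instructions spent in the generated program for 10s
	bpftool prog profile id 39 duration 10 cycles instructions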
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > > - if (!emit(p, BPF_EMIT_CALL(nf_hook_slow))) > > + if (!emit(p, BPF_EMIT_CALL(nf_hook_slow_bpf))) > > > > ? > > > > If yes, I don't see how this will work for the case where I only have an > > address, i.e.: > > > > if (!emit(p, BPF_EMIT_CALL(h->hook))) .... > > > > (Also, the address might be in a kernel module) > > > > > On x86-64 it will be a nop. > > > On x86-32 it will do quite a bit of work. > > > > If this only a problem for 32bit arches, I could also make this > > 'depends on CONFIG_64BIT'. > > If that's acceptable, sure. Good, thanks! > > But perhaps I am on the wrong track, I see existing code doing: > > *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); > > Yes, because we do: > /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup > * and other inlining handlers are currently limited to 64 bit > * only. > */ > if (prog->jit_requested && BITS_PER_LONG == 64 && Ah, thanks, makes sense. > I think you already gate this feature with jit_requested? > Otherwise it's going to be slow in the interpreter. Right, use of bpf interpreter is silly for this. > > 39: unspec tag 0000000000000000 > > xlated 416B jited 221B memlock 4096B > > Probably should do bpf_prog_calc_tag() too. > And please give it some meaningful name. Agree, will add this.
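Not part of the posted patch, but for illustration, nf_hook_jit_compile() could pick up the suggestions above (BPF_PROG_TYPE_UNSPEC, a meaningful name, bpf_prog_calc_tag(), bpf_prog_kallsyms_add()) roughly as sketched below. The "nf_hook_base" name is an arbitrary example; exposing a prog id as seen in the 'bpftool prog show' output earlier would additionally need bpf_prog_alloc_id(), which is currently static to kernel/bpf/syscall.c, to be made callable from here.

static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
{
	struct bpf_prog *prog;
	int err = 0;

	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
	if (!prog)
		return NULL;

	prog->len = len;
	prog->type = BPF_PROG_TYPE_UNSPEC;
	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));

	/* example name so 'bpftool prog show' output is recognisable */
	strscpy(prog->aux->name, "nf_hook_base", sizeof(prog->aux->name));

	/* compute a real tag instead of all-zero; -ENOMEM ignored in this sketch */
	bpf_prog_calc_tag(prog);

	prog = bpf_prog_select_runtime(prog, &err);
	if (err) {
		bpf_prog_free(prog);
		return NULL;
	}

	/* make the jited image visible to perf and stack traces */
	bpf_prog_kallsyms_add(prog);

	return prog;
}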
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 7c604ef8e8cb..b7874b772dd1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -2,6 +2,7 @@ #ifndef __LINUX_NETFILTER_H #define __LINUX_NETFILTER_H +#include <linux/filter.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/net.h> @@ -106,6 +107,9 @@ struct nf_hook_entries_rcu_head { }; struct nf_hook_entries { +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + struct bpf_prog *hook_prog; +#endif u16 num_hook_entries; /* padding */ struct nf_hook_entry hooks[]; @@ -205,6 +209,17 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e); + +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) +DECLARE_BPF_DISPATCHER(nf_hook_base); + +static __always_inline int bpf_prog_run_nf(const struct bpf_prog *prog, + struct nf_hook_state *state) +{ + return __bpf_prog_run(prog, state, BPF_DISPATCHER_FUNC(nf_hook_base)); +} +#endif + /** * nf_hook - call a netfilter hook * @@ -213,17 +228,17 @@ void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, * value indicates the packet has been consumed by the hook. */ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, - struct sock *sk, struct sk_buff *skb, - struct net_device *indev, struct net_device *outdev, - int (*okfn)(struct net *, struct sock *, struct sk_buff *)) + struct sock *sk, struct sk_buff *skb, + struct net_device *indev, struct net_device *outdev, + int (*okfn)(struct net *, struct sock *, struct sk_buff *)) { struct nf_hook_entries *hook_head = NULL; int ret = 1; #ifdef CONFIG_JUMP_LABEL if (__builtin_constant_p(pf) && - __builtin_constant_p(hook) && - !static_key_false(&nf_hooks_needed[pf][hook])) + __builtin_constant_p(hook) && + !static_key_false(&nf_hooks_needed[pf][hook])) return 1; #endif @@ -254,11 +269,24 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, if (hook_head) { struct nf_hook_state state; +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog); + + nf_hook_state_init(&state, hook, pf, indev, outdev, + sk, net, okfn); + + state.priv = (void *)hook_head; + state.skb = skb; + migrate_disable(); + ret = bpf_prog_run_nf(p, &state); + migrate_enable(); +#else nf_hook_state_init(&state, hook, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, hook_head); +#endif } rcu_read_unlock(); @@ -336,10 +364,38 @@ NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk, if (hook_head) { struct nf_hook_state state; +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + const struct bpf_prog *p = hook_head->hook_prog; + struct sk_buff *skb, *next; + struct list_head sublist; + int ret; + + nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); + + INIT_LIST_HEAD(&sublist); + migrate_disable(); + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + + state.priv = (void *)hook_head; + state.skb = skb; + + ret = bpf_prog_run_nf(p, &state); + if (ret == 1) + list_add_tail(&skb->list, &sublist); + } + + migrate_enable(); + + /* Put passed packets back on main list */ + list_splice(&sublist, head); +#else nf_hook_state_init(&state, hook, pf, in, out, sk, net, okfn); nf_hook_slow_list(head, &state, hook_head); +#endif } rcu_read_unlock(); } diff --git a/include/net/netfilter/nf_hook_bpf.h b/include/net/netfilter/nf_hook_bpf.h new file mode 100644 index 000000000000..1792f97a806d --- 
/dev/null +++ b/include/net/netfilter/nf_hook_bpf.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +struct bpf_dispatcher; +struct bpf_prog; + +struct bpf_prog *nf_hook_bpf_create_fb(void); + +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) +struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n); + +void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); +#else +static inline void +nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *f, struct bpf_prog *t) +{ +} + +static inline struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *n) +{ + return NULL; +} +#endif diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4b8d04640ff3..2610786b6ad8 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -30,6 +30,16 @@ config NETFILTER_FAMILY_BRIDGE config NETFILTER_FAMILY_ARP bool +config HAVE_NF_HOOK_BPF + bool + +config NF_HOOK_BPF + bool "netfilter base hook bpf translator" + depends on BPF_JIT + help + This unrolls the nf_hook_slow interpreter loop with + auto-generated BPF program. + config NETFILTER_NETLINK_HOOK tristate "Netfilter base hook dump support" depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 06df49ea6329..e465659e87ad 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -21,6 +21,7 @@ nf_conntrack-$(CONFIG_DEBUG_INFO_BTF) += nf_conntrack_bpf.o endif obj-$(CONFIG_NETFILTER) = netfilter.o +obj-$(CONFIG_NF_HOOK_BPF) += nf_hook_bpf.o obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 17165f9cf4a1..6888c7fd5aeb 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -24,6 +24,7 @@ #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netfilter/nf_queue.h> +#include <net/netfilter/nf_hook_bpf.h> #include <net/sock.h> #include "nf_internals.h" @@ -47,6 +48,33 @@ static DEFINE_MUTEX(nf_hook_mutex); #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) +DEFINE_BPF_DISPATCHER(nf_hook_base); + +#define NF_DISPATCHER_PTR BPF_DISPATCHER_PTR(nf_hook_base) +#else +#define NF_DISPATCHER_PTR NULL +#endif + +static struct bpf_prog *fallback_nf_hook_slow; + +static void nf_hook_bpf_prog_set(struct nf_hook_entries *e, + struct bpf_prog *p) +{ +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + WRITE_ONCE(e->hook_prog, p); +#endif +} + +static struct bpf_prog *nf_hook_bpf_prog_get(struct nf_hook_entries *e) +{ +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + if (e) + return e->hook_prog; +#endif + return NULL; +} + static struct nf_hook_entries *allocate_hook_entries_size(u16 num) { struct nf_hook_entries *e; @@ -58,9 +86,23 @@ static struct nf_hook_entries *allocate_hook_entries_size(u16 num) if (num == 0) return NULL; - e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT); - if (e) - e->num_hook_entries = num; +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + if (!fallback_nf_hook_slow) { + /* never free'd */ + fallback_nf_hook_slow = nf_hook_bpf_create_fb(); + + if (!fallback_nf_hook_slow) + return NULL; + } +#endif + + e = kvzalloc(alloc, GFP_KERNEL); + if (!e) + return NULL; + + e->num_hook_entries = num; + nf_hook_bpf_prog_set(e, fallback_nf_hook_slow); + return e; } @@ -98,6 +140,29 @@ static const struct nf_hook_ops dummy_ops = { .priority = INT_MIN, }; +static void nf_hook_entries_grow_bpf(const struct nf_hook_entries *old, + struct nf_hook_entries *new) +{ 
+#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + struct bpf_prog *hook_bpf_prog = nf_hook_bpf_create(new); + + /* allocate_hook_entries_size() pre-inits new->hook_prog + * to a fallback program that calls nf_hook_slow(). + */ + if (hook_bpf_prog) { + struct bpf_prog *old_prog = NULL; + + new->hook_prog = hook_bpf_prog; + + if (old) + old_prog = old->hook_prog; + + nf_hook_bpf_change_prog(BPF_DISPATCHER_PTR(nf_hook_base), + old_prog, hook_bpf_prog); + } +#endif +} + static struct nf_hook_entries * nf_hook_entries_grow(const struct nf_hook_entries *old, const struct nf_hook_ops *reg) @@ -156,6 +221,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old, new->hooks[nhooks].priv = reg->priv; } + nf_hook_entries_grow_bpf(old, new); return new; } @@ -221,6 +287,7 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, struct nf_hook_entries __rcu **pp) { unsigned int i, j, skip = 0, hook_entries; + struct bpf_prog *hook_bpf_prog = NULL; struct nf_hook_entries *new = NULL; struct nf_hook_ops **orig_ops; struct nf_hook_ops **new_ops; @@ -244,8 +311,13 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, hook_entries -= skip; new = allocate_hook_entries_size(hook_entries); - if (!new) + if (!new) { + struct bpf_prog *old_prog = nf_hook_bpf_prog_get(old); + + nf_hook_bpf_prog_set(old, fallback_nf_hook_slow); + nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, old_prog, NULL); return NULL; + } new_ops = nf_hook_entries_get_hook_ops(new); for (i = 0, j = 0; i < old->num_hook_entries; i++) { @@ -256,7 +328,13 @@ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, j++; } hooks_validate(new); + + /* if this fails fallback prog calls nf_hook_slow. */ + hook_bpf_prog = nf_hook_bpf_create(new); + if (hook_bpf_prog) + nf_hook_bpf_prog_set(new, hook_bpf_prog); out_assign: + nf_hook_bpf_change_prog(NF_DISPATCHER_PTR, nf_hook_bpf_prog_get(old), hook_bpf_prog); rcu_assign_pointer(*pp, new); return old; } @@ -609,6 +687,7 @@ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, int ret; state->skb = skb; + for (; s < e->num_hook_entries; s++) { verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { @@ -783,6 +862,11 @@ int __init netfilter_init(void) if (ret < 0) goto err_pernet; +#if IS_ENABLED(CONFIG_NF_HOOK_BPF) + fallback_nf_hook_slow = nf_hook_bpf_create_fb(); + WARN_ON_ONCE(!fallback_nf_hook_slow); +#endif + return 0; err_pernet: unregister_pernet_subsys(&netfilter_net_ops); diff --git a/net/netfilter/nf_hook_bpf.c b/net/netfilter/nf_hook_bpf.c new file mode 100644 index 000000000000..dab13b803801 --- /dev/null +++ b/net/netfilter/nf_hook_bpf.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/string.h> +#include <linux/hashtable.h> +#include <linux/jhash.h> +#include <linux/netfilter.h> + +#include <net/netfilter/nf_hook_bpf.h> +#include <net/netfilter/nf_queue.h> + +#define JMP_INVALID 0 +#define JIT_SIZE_MAX 0xffff + +/* BPF translator for netfilter hooks. + * + * Create a bpf program that can be called *instead* of nf_hook_slow(). + * This program thus has same return value as nf_hook_slow and + * handles nfqueue and packet drops internally. + * Call nf_hook_bpf_create(struct nf_hook_entries *e, NF_HOOK_BPF_TYPE_BASE) + * to unroll the functions described by nf_hook_entries into such + * a bpf program. + * + * These bpf programs are called/run from nf_hook() inline function. + * + * Register usage is: + * + * BPF_REG_0: verdict. 
+ * BPF_REG_1: struct nf_hook_state * + * BPF_REG_2: reserved as arg to nf_queue() + * BPF_REG_3: reserved as arg to nf_queue() + * + * Prologue storage: + * BPF_REG_6: copy of REG_1 (original struct nf_hook_state *) + * BPF_REG_7: copy of original state->priv value + * BPF_REG_8: copy of state->hook_index + */ +struct nf_hook_prog { + struct bpf_insn *insns; + unsigned int pos; +}; + +static bool emit(struct nf_hook_prog *p, struct bpf_insn insn) +{ + if (WARN_ON_ONCE(p->pos >= BPF_MAXINSNS)) + return false; + + p->insns[p->pos] = insn; + p->pos++; + return true; +} + +static bool xlate_one_hook(struct nf_hook_prog *p, const struct nf_hook_entries *e, + const struct nf_hook_entry *h) +{ + int width = bytes_to_bpf_size(sizeof(h->priv)); + + /* if priv is NULL, the called hookfn does not use the priv member. */ + if (!h->priv) + goto emit_hook_call; + + if (WARN_ON_ONCE(width < 0)) + return false; + + /* x = entries[s]->priv; */ + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_2, BPF_REG_7, + (unsigned long)&h->priv - (unsigned long)e))) + return false; + + /* state->priv = x */ + if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_2, + offsetof(struct nf_hook_state, priv)))) + return false; + +emit_hook_call: + if (!emit(p, BPF_EMIT_CALL(h->hook))) + return false; + + /* Only advance to next hook on ACCEPT verdict. + * Else, skip rest and move to tail. + * + * Postprocessing patches the jump offset to the + * correct position, after last hook. + */ + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, NF_ACCEPT, JMP_INVALID))) + return false; + + return true; +} + +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg) +{ + if (sizeof(void *) == sizeof(u64)) + return emit(p, BPF_MOV64_REG(dreg, sreg)); + if (sizeof(void *) == sizeof(u32)) + return emit(p, BPF_MOV32_REG(dreg, sreg)); + + return false; +} + +static bool do_prologue(struct nf_hook_prog *p) +{ + int width = bytes_to_bpf_size(sizeof(void *)); + + if (WARN_ON_ONCE(width < 0)) + return false; + + /* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */ + if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1)) + return false; + + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1, + offsetof(struct nf_hook_state, priv)))) + return false; + + /* could load state->hook_index, but we don't support index > 0 for bpf call. */ + if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0))) + return false; + + return true; +} + +static void patch_hook_jumps(struct nf_hook_prog *p) +{ + unsigned int i; + + if (!p->insns) + return; + + for (i = 0; i < p->pos; i++) { + if (BPF_CLASS(p->insns[i].code) != BPF_JMP) + continue; + + if (p->insns[i].code == (BPF_EXIT | BPF_JMP)) + continue; + if (p->insns[i].code == (BPF_CALL | BPF_JMP)) + continue; + + if (p->insns[i].off != JMP_INVALID) + continue; + p->insns[i].off = p->pos - i - 1; + } +} + +static bool emit_retval(struct nf_hook_prog *p, int retval) +{ + if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval))) + return false; + + return emit(p, BPF_EXIT_INSN()); +} + +static bool emit_nf_hook_slow(struct nf_hook_prog *p) +{ + int width = bytes_to_bpf_size(sizeof(void *)); + + /* restore the original state->priv. 
*/ + if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7, + offsetof(struct nf_hook_state, priv)))) + return false; + + /* arg1 is state->skb */ + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, + offsetof(struct nf_hook_state, skb)))) + return false; + + /* arg2 is "struct nf_hook_state *" */ + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) + return false; + + /* arg3 is nf_hook_entries (original state->priv) */ + if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7))) + return false; + + if (!emit(p, BPF_EMIT_CALL(nf_hook_slow))) + return false; + + /* No further action needed, return retval provided by nf_hook_slow */ + return emit(p, BPF_EXIT_INSN()); +} + +static bool emit_nf_queue(struct nf_hook_prog *p) +{ + int width = bytes_to_bpf_size(sizeof(void *)); + + if (width < 0) { + WARN_ON_ONCE(1); + return false; + } + + /* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */ + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, + offsetof(struct nf_hook_state, skb)))) + return false; + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, + offsetof(struct nf_hook_state, hook_index)))) + return false; + /* arg2: struct nf_hook_state * */ + if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6))) + return false; + /* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */ + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) + return false; + if (!emit(p, BPF_EMIT_CALL(nf_queue))) + return false; + + /* Check nf_queue return value. Abnormal case: nf_queue returned != 0. + * + * Fall back to nf_hook_slow(). + */ + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2))) + return false; + + /* Normal case: skb was stolen. Return 0. */ + return emit_retval(p, 0); +} + +static bool do_epilogue_base_hooks(struct nf_hook_prog *p) +{ + int width = bytes_to_bpf_size(sizeof(void *)); + + if (WARN_ON_ONCE(width < 0)) + return false; + + /* last 'hook'. We arrive here if previous hook returned ACCEPT, + * i.e. all hooks passed -- we are done. + * + * Return 1, skb can continue traversing network stack. + */ + if (!emit_retval(p, 1)) + return false; + + /* Patch all hook jumps, in case any of these are taken + * we need to jump to this location. + * + * This happens when verdict is != ACCEPT. + */ + patch_hook_jumps(p); + + /* need to ignore upper 24 bits, might contain errno or queue number */ + if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0))) + return false; + if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff))) + return false; + + /* ACCEPT handled, check STOLEN. */ + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2))) + return false; + + if (!emit_retval(p, 0)) + return false; + + /* ACCEPT and STOLEN handled. Check DROP next */ + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2))) + return false; + + /* First step. Extract the errno number. 1 insn. */ + if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS))) + return false; + + /* Second step: replace errno with EPERM if it was 0. 2 insns. */ + if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1))) + return false; + if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM))) + return false; + + /* Third step: negate reg0: Caller expects -EFOO and stash the result. 2 insns. */ + if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0))) + return false; + if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0))) + return false; + + /* Fourth step: free the skb. 2 insns. 
*/ + if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6, + offsetof(struct nf_hook_state, skb)))) + return false; + if (!emit(p, BPF_EMIT_CALL(kfree_skb))) + return false; + + /* Last step: return. 2 insns. */ + if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8))) + return false; + if (!emit(p, BPF_EXIT_INSN())) + return false; + + /* ACCEPT, STOLEN and DROP have been handled. + * REPEAT and STOP are not allowed anymore for individual hook functions. + * This leaves NFQUEUE as only remaing return value. + * + * In this case BPF_REG_0 still contains the original verdict of + * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is. + */ + if (!emit_nf_queue(p)) + return false; + + /* Increment hook index and store it in nf_hook_state so nf_hook_slow will + * start at the next hook, if any. + */ + if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1))) + return false; + if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8, + offsetof(struct nf_hook_state, hook_index)))) + return false; + + return emit_nf_hook_slow(p); +} + +static int nf_hook_prog_init(struct nf_hook_prog *p) +{ + memset(p, 0, sizeof(*p)); + + p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL); + if (!p->insns) + return -ENOMEM; + + return 0; +} + +static void nf_hook_prog_free(struct nf_hook_prog *p) +{ + kfree(p->insns); +} + +static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e) +{ + unsigned int i, len; + + len = e->num_hook_entries; + + if (!do_prologue(p)) + goto out; + + for (i = 0; i < len; i++) { + if (!xlate_one_hook(p, e, &e->hooks[i])) + goto out; + + if (i + 1 < len) { + if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6))) + goto out; + + if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1))) + goto out; + } + } + + if (!do_epilogue_base_hooks(p)) + goto out; + + return 0; +out: + return -EINVAL; +} + +static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len) +{ + struct bpf_prog *prog; + int err = 0; + + prog = bpf_prog_alloc(bpf_prog_size(len), 0); + if (!prog) + return NULL; + + prog->len = len; + prog->type = BPF_PROG_TYPE_SOCKET_FILTER; + memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn)); + + prog = bpf_prog_select_runtime(prog, &err); + if (err) { + bpf_prog_free(prog); + return NULL; + } + + return prog; +} + +/* fallback program, invokes nf_hook_slow interpreter. + * + * Used when a hook is unregistered and new/replacement program cannot + * be compiled for some reason. + */ +struct bpf_prog *nf_hook_bpf_create_fb(void) +{ + struct bpf_prog *prog; + struct nf_hook_prog p; + int err; + + err = nf_hook_prog_init(&p); + if (err) + return NULL; + + if (!do_prologue(&p)) + goto err; + + if (!emit_nf_hook_slow(&p)) + goto err; + + prog = nf_hook_jit_compile(p.insns, p.pos); +err: + nf_hook_prog_free(&p); + return prog; +} + +struct bpf_prog *nf_hook_bpf_create(const struct nf_hook_entries *new) +{ + struct bpf_prog *prog; + struct nf_hook_prog p; + int err; + + err = nf_hook_prog_init(&p); + if (err) + return NULL; + + err = xlate_base_hooks(&p, new); + if (err) + goto err; + + prog = nf_hook_jit_compile(p.insns, p.pos); +err: + nf_hook_prog_free(&p); + return prog; +} + +void nf_hook_bpf_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to) +{ + bpf_dispatcher_change_prog(d, from, to); +}
Add a kernel bpf program generator for netfilter base hooks.

Currently netfilter hooks are invoked by nf_hook_slow:

  for i in hooks; do
        verdict = hooks[i]->indirect_func(hooks->[i].hook_arg, skb, state);

        switch (verdict) { ....

The autogenerator unrolls the loop, so we get:

  state->priv = hooks->[0].hook_arg;
  v = first_hook_function(state);
  if (v != ACCEPT) goto done;
  state->priv = hooks->[1].hook_arg;
  v = second_hook_function(state); ...

Indirections are replaced by direct calls. Invocation of the
autogenerated programs is done via bpf dispatcher from nf_hook().

The autogenerated program has the same return value scheme as
nf_hook_slow(). NF_HOOK() points are converted to call the
autogenerated bpf program instead of nf_hook_slow().

Purpose of this is to eventually add a 'netfilter prog type' to bpf and
permit attachment of (userspace generated) bpf programs to the netfilter
machinery, e.g. 'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.

This will require exposing the context structure (the program argument,
'__nf_hook_state') and rewriting accesses to match the nf_hook_state
layout.

NAT hooks are still handled via indirect calls, but they are only
called once per connection.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter.h           |  66 ++++-
 include/net/netfilter/nf_hook_bpf.h |  21 ++
 net/netfilter/Kconfig               |  10 +
 net/netfilter/Makefile              |   1 +
 net/netfilter/core.c                |  92 +++++-
 net/netfilter/nf_hook_bpf.c         | 424 ++++++++++++++++++++++++++++
 6 files changed, 605 insertions(+), 9 deletions(-)
 create mode 100644 include/net/netfilter/nf_hook_bpf.h
 create mode 100644 net/netfilter/nf_hook_bpf.c