Message ID | 20231120154948.708762225@infradead.org (mailing list archive) |
---|---|
State | Superseded |
Delegated to: | BPF |
Headers | show |
Series | x86/bpf: Fix FineIBT vs eBPF | expand |
On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote: > @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str > jit_data->header = header; > jit_data->rw_header = rw_header; > } > - prog->bpf_func = (void *)image; > + prog->bpf_func = (void *)image + ctx.prog_offset; > prog->jited = 1; > - prog->jited_len = proglen; > + prog->jited_len = proglen - ctx.prog_offset; // XXX? > } else { > prog = orig_prog; > } Note the XXX there, I wasn't sure what the desired semantics of proglen was. As implemented it is the length from where bpf_func points to the end, not including the pre-preamble -- as indicated by offset.
On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote: > + > +#ifdef CONFIG_CFI_CLANG > +struct bpf_insn; > + > +extern unsigned int bpf_func_proto(const void *ctx, > + const struct bpf_insn *insn); To make it more obvious what is going on could you rename it to __bpf_prog_runX() and add a comment that its prototype should match exactly bpf interpreters created by DEFINE_BPF_PROG_RUN() macro, otherwise cfi will explode. > + > +__ADDRESSABLE(bpf_func_proto); > + > +asm ( > +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" > +" .type cfi_bpf_hash,@object \n" > +" .globl cfi_bpf_hash \n" > +" .p2align 2, 0x0 \n" > +"cfi_bpf_hash: \n" > +" .long __kcfi_typeid_bpf_func_proto \n" Took me some time to grok this. Cannot you use __CFI_TYPE() macro here ? > +" .size cfi_bpf_hash, 4 \n" > +" .popsection \n" > +); > +#endif ... > +static int emit_fineibt(u8 **pprog) > +{ > + u8 *prog = *pprog; > + > + EMIT_ENDBR(); > + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); > + EMIT2(0x74, 0x07); > + EMIT2(0x0f, 0x0b); > + EMIT1(0x90); > + EMIT_ENDBR_POISON(); Please add comments what this asm does. No one can read hex. > + > + *pprog = prog; > + return 16; 16 means "the caller of this code will jump to endbr_poison", right? > +} > + > +static int emit_kcfi(u8 **pprog) > +{ > + u8 *prog = *pprog; > + int offset = 5; > + > + EMIT1_off32(0xb8, cfi_bpf_hash); and here too. > +#ifdef CONFIG_CALL_PADDING > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + EMIT1(0x90); > + offset += 11; > +#endif > + EMIT_ENDBR(); > + > + *pprog = prog; > + return offset; 5 or 16 would mean "jump to endbr" ? > +} > + > +static int emit_cfi(u8 **pprog) > +{ > + u8 *prog = *pprog; > + int offset = 0; > + > + switch (cfi_mode) { > + case CFI_FINEIBT: > + offset = emit_fineibt(&prog); > + break; > + > + case CFI_KCFI: > + offset = emit_kcfi(&prog); > + break; > + > + default: > + EMIT_ENDBR(); > + break; > + } > + > + *pprog = prog; > + return offset; > +} > + > /* > * Emit x86-64 prologue code for BPF program. > * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes > * while jumping to another program > */ > -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, > - bool tail_call_reachable, bool is_subprog, > - bool is_exception_cb) > +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, > + bool tail_call_reachable, bool is_subprog, > + bool is_exception_cb) > { > u8 *prog = *pprog; > + int offset; > > + offset = emit_cfi(&prog); I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem. From bpf_dispatcher_*_func() calling into JITed will work, but this emit_prologue() is doing the same job for all bpf progs. Some bpf progs call each other directly and indirectly. bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B. A into B can be a direct call (which cfi doesn't care about) and indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump(). Should we care about fineibt/kcfi there too? > /* BPF trampoline can be made to work without these nops, > * but let's waste 5 bytes for now and optimize later > */ > - EMIT_ENDBR(); > memcpy(prog, x86_nops[5], X86_PATCH_SIZE); > prog += X86_PATCH_SIZE; > if (!ebpf_from_cbpf) { > @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3 > if (tail_call_reachable) > EMIT1(0x50); /* push rax */ > *pprog = prog; > + > + return offset; > } > > static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) > @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p > bool tail_call_seen = false; > bool seen_exit = false; > u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; > - int i, excnt = 0; > int ilen, proglen = 0; > + int i, excnt = 0; > u8 *prog = temp; > int err; > > @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p > /* tail call's presence in current prog implies it is reachable */ > tail_call_reachable |= tail_call_seen; > > - emit_prologue(&prog, bpf_prog->aux->stack_depth, > - bpf_prog_was_classic(bpf_prog), tail_call_reachable, > - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); > + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth, > + bpf_prog_was_classic(bpf_prog), > + tail_call_reachable, > + bpf_is_subprog(bpf_prog), > + bpf_prog->aux->exception_cb); > + > /* Exception callback will clobber callee regs for its own use, and > * restore the original callee regs from main prog's stack frame. > */ > @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str > jit_data->header = header; > jit_data->rw_header = rw_header; > } > - prog->bpf_func = (void *)image; > + prog->bpf_func = (void *)image + ctx.prog_offset; I don't understand this. prog->bpf_func is the main entry point. Everything jumps there. Are you trying to skip all of cfi code in the prologue and let xdp_dispatcher jump to endbr or endbr_poison (depending on fineibt vs kcfi) ? Then what is the point of earlier asm bits? Is it a some clang thing that knows to offset indirect jump by exactly that many hard coded bytes ? Something in the clang does ptr -= 16 in case of fineibt and just jumps there ? and ptr -= 5 for kcfi ? If so, please add a giant comment explaining that. No one should be reverse engineering such intricate details. > prog->jited = 1; > - prog->jited_len = proglen; > + prog->jited_len = proglen - ctx.prog_offset; // XXX? jited_len is used later to cover the whole generated code. See bpf_prog_ksym_set_addr(): prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; we definitely want ksym [start, end] to cover every useful byte of JITed code in case IRQ happens on that byte. Without covering cfi prologue the stack dump will be wrong for that frame. I guess if xdp_dispatcher with fineibt=on jumps into prog->bpf_func - 16 and IRQ fires we don't care that much about accurate stack of last frame ? I guess it's acceptable, but a comment is necessary.
On Tue, Nov 21, 2023 at 06:18:17PM -0800, Alexei Starovoitov wrote: > On Mon, Nov 20, 2023 at 03:46:44PM +0100, Peter Zijlstra wrote: > > + > > +#ifdef CONFIG_CFI_CLANG > > +struct bpf_insn; > > + > > +extern unsigned int bpf_func_proto(const void *ctx, > > + const struct bpf_insn *insn); > > To make it more obvious what is going on could you rename it to > __bpf_prog_runX() > and add a comment that its prototype should match exactly > bpf interpreters created by DEFINE_BPF_PROG_RUN() macro, > otherwise cfi will explode. Sure. > > > + > > +__ADDRESSABLE(bpf_func_proto); > > + > > +asm ( > > +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" > > +" .type cfi_bpf_hash,@object \n" > > +" .globl cfi_bpf_hash \n" > > +" .p2align 2, 0x0 \n" > > +"cfi_bpf_hash: \n" > > +" .long __kcfi_typeid_bpf_func_proto \n" > > Took me some time to grok this. > Cannot you use __CFI_TYPE() macro here ? __CFI_TYPE() creates the content of the __cfi_foo prefix symbol, which is different from what the above does. The above is basically: u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid_bpf_func_proto; Except I need to do that in asm because __kcfi_typeid magic only works in asm. I'll put the C version in a comment on top. > > +" .size cfi_bpf_hash, 4 \n" > > +" .popsection \n" > > +); > > +#endif > ... > > +static int emit_fineibt(u8 **pprog) > > +{ > > + u8 *prog = *pprog; > > + > > + EMIT_ENDBR(); > > + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); > > + EMIT2(0x74, 0x07); > > + EMIT2(0x0f, 0x0b); > > + EMIT1(0x90); > > + EMIT_ENDBR_POISON(); > > Please add comments what this asm does. No one can read hex. Well, I've stared at this so very long that I can in fact get quite a long way with just hex :-/ But point taken. My only problem here is that this file uses Intel syntax, and that melts my brain. Would it be acceptable to have AT&T syntax comments? > > + > > + *pprog = prog; > > + return 16; > > 16 means "the caller of this code will jump to endbr_poison", right? Ah, so the way FineIBT works is that direct calls go to foo()+0, as normal. However the indirect calls will target foo()-16. The 16 bytes preceding every symbol will contain the FineIBT landing pad. As such, we need to offset prog->bpf_func by the expected amount, otherwise foo()-16 will land in outer space and things go *boom*. To be very explicit, let me list all the various forms of function calls: Traditional: foo: ... code here ... ret direct caller: call foo indirect caller: lea foo(%rip), %r11 call *%r11 IBT: foo: endbr64 ... code here ... ret direct caller: call foo / call foo+4 indirect caller: lea foo(%rip), %r11 ... call *%r11 kCFI: __cfi_foo: movl $0x12345678, %rax (11 nops when CALL_PADDING) foo: endbr64 (when also IBT) ... code here ... ret direct caller: call foo / call foo+4 indirect caller: lea foo(%rip), %r11 ... movl $(-0x12345678), %r10d addl -15(%r11), %r10d (or -4 without CALL_PADDING) je 1f ud2 1:call *%r11 FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime patches things to look like): __cfi_foo: endbr64 subl $0x12345678, %r10d jz foo ud2 nop foo: osp nop3 (was endbr64) ... code here ... ret direct caller: call foo / call foo+4 indirect caller: lea foo(%rip), %r11 ... movl $0x12345678, %r10d subl $16, %r11 nop4 call *%r11 As can be seen, both kCFI and FineIBT use the prefix __cfi symbol / negative offsets. To make this work the JIT starts by emitting the prefix text but then offsets prog->bpf_func to point to the 'real' begin. > > +} > > + > > +static int emit_kcfi(u8 **pprog) > > +{ > > + u8 *prog = *pprog; > > + int offset = 5; > > + > > + EMIT1_off32(0xb8, cfi_bpf_hash); > > and here too. > > > +#ifdef CONFIG_CALL_PADDING > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + EMIT1(0x90); > > + offset += 11; > > +#endif > > + EMIT_ENDBR(); > > + > > + *pprog = prog; > > + return offset; > > 5 or 16 would mean "jump to endbr" ? It's the size of the prefix symbol. > > +} > > + > > +static int emit_cfi(u8 **pprog) > > +{ > > + u8 *prog = *pprog; > > + int offset = 0; > > + > > + switch (cfi_mode) { > > + case CFI_FINEIBT: > > + offset = emit_fineibt(&prog); > > + break; > > + > > + case CFI_KCFI: > > + offset = emit_kcfi(&prog); > > + break; > > + > > + default: > > + EMIT_ENDBR(); > > + break; > > + } > > + > > + *pprog = prog; > > + return offset; > > +} > > + > > /* > > * Emit x86-64 prologue code for BPF program. > > * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes > > * while jumping to another program > > */ > > -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, > > - bool tail_call_reachable, bool is_subprog, > > - bool is_exception_cb) > > +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, > > + bool tail_call_reachable, bool is_subprog, > > + bool is_exception_cb) > > { > > u8 *prog = *pprog; > > + int offset; > > > > + offset = emit_cfi(&prog); > > I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem. > From bpf_dispatcher_*_func() calling into JITed will work, > but this emit_prologue() is doing the same job for all bpf progs. > Some bpf progs call each other directly and indirectly. > bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B. > A into B can be a direct call (which cfi doesn't care about) and > indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump(). > Should we care about fineibt/kcfi there too? The way I understood the tail-call thing to work is that it jumps to bpf_prog + X86_TAIL_CALL_OFFSET, we already emit an extra ENDBR there to make this work. So the A -> B indirect call is otherwise unadornen and only needs ENDBR. Ideally that would use kCFI/FineIBT but since it also skips some of the setup, this gets to be non-trivial, so I've let this be as is. > > /* BPF trampoline can be made to work without these nops, > > * but let's waste 5 bytes for now and optimize later > > */ > > - EMIT_ENDBR(); > > memcpy(prog, x86_nops[5], X86_PATCH_SIZE); > > prog += X86_PATCH_SIZE; > > if (!ebpf_from_cbpf) { > > @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3 > > if (tail_call_reachable) > > EMIT1(0x50); /* push rax */ > > *pprog = prog; > > + > > + return offset; > > } > > > > static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) > > @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p > > bool tail_call_seen = false; > > bool seen_exit = false; > > u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; > > - int i, excnt = 0; > > int ilen, proglen = 0; > > + int i, excnt = 0; > > u8 *prog = temp; > > int err; > > > > @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p > > /* tail call's presence in current prog implies it is reachable */ > > tail_call_reachable |= tail_call_seen; > > > > - emit_prologue(&prog, bpf_prog->aux->stack_depth, > > - bpf_prog_was_classic(bpf_prog), tail_call_reachable, > > - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); > > + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth, > > + bpf_prog_was_classic(bpf_prog), > > + tail_call_reachable, > > + bpf_is_subprog(bpf_prog), > > + bpf_prog->aux->exception_cb); > > + > > /* Exception callback will clobber callee regs for its own use, and > > * restore the original callee regs from main prog's stack frame. > > */ > > @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str > > jit_data->header = header; > > jit_data->rw_header = rw_header; > > } > > - prog->bpf_func = (void *)image; > > + prog->bpf_func = (void *)image + ctx.prog_offset; > > I don't understand this. > Is it a some clang thing that knows to offset indirect jump by > exactly that many hard coded bytes ? > Something in the clang does ptr -= 16 in case of fineibt and just > jumps there ? and ptr -= 5 for kcfi ? Yep, that. I hope my earlier explanation clarified this. > If so, please add a giant comment explaining that. > No one should be reverse engineering such intricate details. So the kCFI thing is 'new' but readily inspected by objdump or godbolt: https://godbolt.org/z/sGe18z3ca (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what compiler flag makes that go away?) As to FineIBT, that has a big comment in arch/x86/kernel/alternative.c where I rewrite the kCFI thing into FineIBT. I can refer there to avoid duplicating comments, would that work? > > > prog->jited = 1; > > - prog->jited_len = proglen; > > + prog->jited_len = proglen - ctx.prog_offset; // XXX? > > jited_len is used later to cover the whole generated code. > See bpf_prog_ksym_set_addr(): > prog->aux->ksym.start = (unsigned long) prog->bpf_func; > prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; > we definitely want ksym [start, end] to cover every useful byte > of JITed code in case IRQ happens on that byte. > Without covering cfi prologue the stack dump will be wrong for that frame. > I guess if xdp_dispatcher with fineibt=on jumps into prog->bpf_func - 16 > and IRQ fires we don't care that much about accurate stack of last frame ? > I guess it's acceptable, but a comment is necessary. Ah, so normally the __cfi_foo symbol would catch those, lemme see what I can do here. Thanks!
On Wed, Nov 22, 2023 at 12:15:17PM +0100, Peter Zijlstra wrote: > Ah, so normally the __cfi_foo symbol would catch those, lemme see what I > can do here. I have the below delta (untested etc..), does that look about right? --- --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -845,19 +845,24 @@ enum cfi_mode cfi_mode __ro_after_init = #ifdef CONFIG_CFI_CLANG struct bpf_insn; -extern unsigned int bpf_func_proto(const void *ctx, - const struct bpf_insn *insn); +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ +extern unsigned int __bpf_prog_runX(const void *ctx, + const struct bpf_insn *insn); -__ADDRESSABLE(bpf_func_proto); +/* + * Force a reference to the external symbol so the compiler generates + * __kcfi_typid. + */ +__ADDRESSABLE(__bpf_prog_runX); -/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid_bpf_func_proto */ +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX */ asm ( " .pushsection .data..ro_after_init,\"aw\",@progbits \n" " .type cfi_bpf_hash,@object \n" " .globl cfi_bpf_hash \n" " .p2align 2, 0x0 \n" "cfi_bpf_hash: \n" -" .long __kcfi_typeid_bpf_func_proto \n" +" .long __kcfi_typeid___bpf_prog_runX \n" " .size cfi_bpf_hash, 4 \n" " .popsection \n" ); --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -308,15 +308,20 @@ static void pop_callee_regs(u8 **pprog, *pprog = prog; } +/* + * Emit the various CFI preambles, see the large comment about FineIBT + * in arch/x86/kernel/alternative.c + */ + static int emit_fineibt(u8 **pprog) { u8 *prog = *pprog; EMIT_ENDBR(); - EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); - EMIT2(0x74, 0x07); - EMIT2(0x0f, 0x0b); - EMIT1(0x90); + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); /* subl $hash, %r10d */ + EMIT2(0x74, 0x07); /* jz.d8 +7 */ + EMIT2(0x0f, 0x0b); /* ud2 */ + EMIT1(0x90); /* nop */ EMIT_ENDBR_POISON(); *pprog = prog; @@ -328,7 +333,7 @@ static int emit_kcfi(u8 **pprog) u8 *prog = *pprog; int offset = 5; - EMIT1_off32(0xb8, cfi_bpf_hash); + EMIT1_off32(0xb8, cfi_bpf_hash); /* movl $hash, %eax */ #ifdef CONFIG_CALL_PADDING EMIT1(0x90); EMIT1(0x90); @@ -3009,6 +3014,10 @@ struct bpf_prog *bpf_int_jit_compile(str jit_data->header = header; jit_data->rw_header = rw_header; } + /* + * ctx.prog_offset is used when CFI preambles put code *before* + * the function. See emit_cfi(). + */ prog->bpf_func = (void *)image + ctx.prog_offset; prog->jited = 1; prog->jited_len = proglen - ctx.prog_offset; // XXX? --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1431,6 +1431,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -683,6 +683,23 @@ void bpf_prog_kallsyms_add(struct bpf_pr fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); + +#ifdef CONFIG_FINEIBT + /* + * When FineIBT, code in the __cfi_foo() symbols can get executed + * and hence unwinder needs help. + */ + if (cfi_mode != CFI_FINEIBT) + return; + + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, + "__cfi_%s", fp->aux->ksym.name); + + prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16; + prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func; + + bpf_ksym_add(&fp->aux->ksym_prefix); +#endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp)
On Wed, Nov 22, 2023 at 01:41:34PM +0100, Peter Zijlstra wrote: > +#ifdef CONFIG_FINEIBT > + /* > + * When FineIBT, code in the __cfi_foo() symbols can get executed > + * and hence unwinder needs help. > + */ > + if (cfi_mode != CFI_FINEIBT) > + return; > + > + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, > + "__cfi_%s", fp->aux->ksym.name); > + > + prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16; > + prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func; > + > + bpf_ksym_add(&fp->aux->ksym_prefix); > +#endif > } s/prog/fp/ makes compiler happier
On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > To be very explicit, let me list all the various forms of function > calls: > > Traditional: > > foo: > ... code here ... > ret > > direct caller: > > call foo > > indirect caller: > > lea foo(%rip), %r11 > call *%r11 > > IBT: > > foo: > endbr64 > ... code here ... > ret > > direct caller: > > call foo / call foo+4 > > indirect caller: > > lea foo(%rip), %r11 > ... > call *%r11 > > > kCFI: > > __cfi_foo: > movl $0x12345678, %rax > (11 nops when CALL_PADDING) > foo: > endbr64 (when also IBT) > ... code here ... > ret > > direct caller: > > call foo / call foo+4 > > indirect caller: > > lea foo(%rip), %r11 > ... > movl $(-0x12345678), %r10d > addl -15(%r11), %r10d (or -4 without CALL_PADDING) > je 1f > ud2 > 1:call *%r11 > > > FineIBT (builds as kCFI + CALL_PADDING + IBT + RETPOLINE and runtime > patches things to look like): > > __cfi_foo: > endbr64 > subl $0x12345678, %r10d > jz foo > ud2 > nop > foo: > osp nop3 (was endbr64) > ... code here ... > ret > > direct caller: > > call foo / call foo+4 > > indirect caller: > > lea foo(%rip), %r11 > ... > movl $0x12345678, %r10d > subl $16, %r11 > nop4 > call *%r11 Got it. That helps a lot! You kind of have this comment scattered through arch/x86/kernel/alternative.c but having it in one place like above would go a long way. Could you please add it to arch/x86/net/bpf_jit_comp.c or arch/x86/include/asm/cfi.h next to enum cfi_mode ? > > I'm not sure doing cfi_bpf_hash check in JITed code is completely solving the problem. > > From bpf_dispatcher_*_func() calling into JITed will work, > > but this emit_prologue() is doing the same job for all bpf progs. > > Some bpf progs call each other directly and indirectly. > > bpf_dispatcher_*_func() -> JITed_BPF_A -> JITed_BPF_B. > > A into B can be a direct call (which cfi doesn't care about) and > > indirect via emit_bpf_tail_call_indirect()->emit_indirect_jump(). > > Should we care about fineibt/kcfi there too? > > The way I understood the tail-call thing to work is that it jumps to > bpf_prog + X86_TAIL_CALL_OFFSET, we already emit an extra ENDBR there to > make this work. > > So the A -> B indirect call is otherwise unadornen and only needs ENDBR. > > Ideally that would use kCFI/FineIBT but since it also skips some of the > setup, this gets to be non-trivial, so I've let this be as is. I see. yeah. The setup is not trivial indeed. Keep as-is is fine. > So the kCFI thing is 'new' but readily inspected by objdump or godbolt: > > https://godbolt.org/z/sGe18z3ca > > (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what > compiler flag makes that go away?) I also noticed this discrepancy. It doesn't seem to be used. Looks weird to spend 8 bytes to store -sizeof(ud2) > As to FineIBT, that has a big comment in arch/x86/kernel/alternative.c > where I rewrite the kCFI thing into FineIBT. I can refer there to avoid > duplicating comments, would that work? Just the above comment somewhere would work. I wouldn't worry about duplication. This is tricky stuff. When gcc folks get around implementing kcfi they will find it useful too.
On Wed, Nov 22, 2023 at 4:41 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > +/* > + * Emit the various CFI preambles, see the large comment about FineIBT > + * in arch/x86/kernel/alternative.c .. and in cfi.h .. which will have a copy-paste from your other email? > prog->bpf_func = (void *)image + ctx.prog_offset; > prog->jited = 1; > prog->jited_len = proglen - ctx.prog_offset; // XXX? Just drop XXX. > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -1431,6 +1431,9 @@ struct bpf_prog_aux { > struct bpf_kfunc_desc_tab *kfunc_tab; > struct bpf_kfunc_btf_tab *kfunc_btf_tab; > u32 size_poke_tab; > +#ifdef CONFIG_FINEIBT > + struct bpf_ksym ksym_prefix; > +#endif > struct bpf_ksym ksym; > const struct bpf_prog_ops *ops; > struct bpf_map **used_maps; > --- a/kernel/bpf/core.c > +++ b/kernel/bpf/core.c > @@ -683,6 +683,23 @@ void bpf_prog_kallsyms_add(struct bpf_pr > fp->aux->ksym.prog = true; > > bpf_ksym_add(&fp->aux->ksym); > + > +#ifdef CONFIG_FINEIBT > + /* > + * When FineIBT, code in the __cfi_foo() symbols can get executed > + * and hence unwinder needs help. > + */ I like the idea! > + if (cfi_mode != CFI_FINEIBT) > + return; The cfi_mode var needs to be global along with enum ? Or some new helper function from arch/x86 ? > + > + snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, > + "__cfi_%s", fp->aux->ksym.name); > + > + prog->aux->ksym_prefix.start = (unsigned long) prog->bpf_func - 16; > + prog->aux->ksym_prefix.end = (unsigned long) prog->bpf_func; > + > + bpf_ksym_add(&fp->aux->ksym_prefix); > +#endif > } > > void bpf_prog_kallsyms_del(struct bpf_prog *fp) and handle deletion of ksym_prefix here. I think it's shaping up nicely. Pls resend both patches as a set and cc bpf @ vger. BPF CI will pick it up and test on arm64, x86-64, s390 with gcc and clang. We don't do CONFIG_*IBT testing automatically, but I can manually try that after the holidays.
On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <peterz@infradead.org> wrote: > > So the kCFI thing is 'new' but readily inspected by objdump or godbolt: > > https://godbolt.org/z/sGe18z3ca > > (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what > compiler flag makes that go away?) Hmm, that looks like that's what we emit to .kcfi_traps. I suppose Godbolt just doesn't show the section directives? https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/kcfi.ll#L30 Sami
On Fri, Dec 1, 2023 at 3:54 PM Sami Tolvanen <samitolvanen@google.com> wrote: > > On Wed, Nov 22, 2023 at 3:15 AM Peter Zijlstra <peterz@infradead.org> wrote: > > > > So the kCFI thing is 'new' but readily inspected by objdump or godbolt: > > > > https://godbolt.org/z/sGe18z3ca > > > > (@Sami, that .Ltmp15 thing, I don't see that in the kernel, what > > compiler flag makes that go away?) > > Hmm, that looks like that's what we emit to .kcfi_traps. I suppose > Godbolt just doesn't show the section directives? Filter > [uncheck] Directives > > https://github.com/llvm/llvm-project/blob/main/llvm/test/CodeGen/X86/kcfi.ll#L30 > > Sami
--- a/arch/x86/include/asm/cfi.h +++ b/arch/x86/include/asm/cfi.h @@ -9,15 +9,27 @@ */ #include <linux/bug.h> +enum cfi_mode { + CFI_DEFAULT, + CFI_OFF, + CFI_KCFI, + CFI_FINEIBT, +}; + +extern enum cfi_mode cfi_mode; + struct pt_regs; #ifdef CONFIG_CFI_CLANG enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#define __bpfcall +extern u32 cfi_bpf_hash; #else static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) { return BUG_TRAP_TYPE_NONE; } +#define cfi_bpf_hash 0U #endif /* CONFIG_CFI_CLANG */ #endif /* _ASM_X86_CFI_H */ --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -30,6 +30,7 @@ #include <asm/fixmap.h> #include <asm/paravirt.h> #include <asm/asm-prototypes.h> +#include <asm/cfi.h> int __read_mostly alternatives_patched; @@ -832,15 +833,37 @@ void __init_or_module apply_seal_endbr(s #endif /* CONFIG_X86_KERNEL_IBT */ #ifdef CONFIG_FINEIBT +#define __CFI_DEFAULT CFI_DEFAULT +#elif defined(CONFIG_CFI_CLANG) +#define __CFI_DEFAULT CFI_KCFI +#else +#define __CFI_DEFAULT CFI_OFF +#endif -enum cfi_mode { - CFI_DEFAULT, - CFI_OFF, - CFI_KCFI, - CFI_FINEIBT, -}; +enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; + +#ifdef CONFIG_CFI_CLANG +struct bpf_insn; + +extern unsigned int bpf_func_proto(const void *ctx, + const struct bpf_insn *insn); + +__ADDRESSABLE(bpf_func_proto); + +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_hash,@object \n" +" .globl cfi_bpf_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_hash: \n" +" .long __kcfi_typeid_bpf_func_proto \n" +" .size cfi_bpf_hash, 4 \n" +" .popsection \n" +); +#endif + +#ifdef CONFIG_FINEIBT -static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT; static bool cfi_rand __ro_after_init = true; static u32 cfi_seed __ro_after_init; @@ -1149,8 +1172,10 @@ static void __apply_fineibt(s32 *start_r goto err; if (cfi_rand) { - if (builtin) + if (builtin) { cfi_seed = get_random_u32(); + cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); + } ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -17,6 +17,7 @@ #include <asm/nospec-branch.h> #include <asm/text-patching.h> #include <asm/unwind.h> +#include <asm/cfi.h> static bool all_callee_regs_used[4] = {true, true, true, true}; @@ -51,9 +52,11 @@ static u8 *emit_code(u8 *ptr, u32 bytes, do { EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0) #ifdef CONFIG_X86_KERNEL_IBT -#define EMIT_ENDBR() EMIT(gen_endbr(), 4) +#define EMIT_ENDBR() EMIT(gen_endbr(), 4) +#define EMIT_ENDBR_POISON() EMIT(gen_endbr_poison(), 4); #else #define EMIT_ENDBR() +#define EMIT_ENDBR_POISON() #endif static bool is_imm8(int value) @@ -247,6 +250,7 @@ struct jit_context { */ int tail_call_direct_label; int tail_call_indirect_label; + int prog_offset; }; /* Maximum number of bytes emitted while JITing one eBPF insn */ @@ -304,21 +308,86 @@ static void pop_callee_regs(u8 **pprog, *pprog = prog; } +static int emit_fineibt(u8 **pprog) +{ + u8 *prog = *pprog; + + EMIT_ENDBR(); + EMIT3_off32(0x41, 0x81, 0xea, cfi_bpf_hash); + EMIT2(0x74, 0x07); + EMIT2(0x0f, 0x0b); + EMIT1(0x90); + EMIT_ENDBR_POISON(); + + *pprog = prog; + return 16; +} + +static int emit_kcfi(u8 **pprog) +{ + u8 *prog = *pprog; + int offset = 5; + + EMIT1_off32(0xb8, cfi_bpf_hash); +#ifdef CONFIG_CALL_PADDING + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + EMIT1(0x90); + offset += 11; +#endif + EMIT_ENDBR(); + + *pprog = prog; + return offset; +} + +static int emit_cfi(u8 **pprog) +{ + u8 *prog = *pprog; + int offset = 0; + + switch (cfi_mode) { + case CFI_FINEIBT: + offset = emit_fineibt(&prog); + break; + + case CFI_KCFI: + offset = emit_kcfi(&prog); + break; + + default: + EMIT_ENDBR(); + break; + } + + *pprog = prog; + return offset; +} + /* * Emit x86-64 prologue code for BPF program. * bpf_tail_call helper will skip the first X86_TAIL_CALL_OFFSET bytes * while jumping to another program */ -static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, - bool tail_call_reachable, bool is_subprog, - bool is_exception_cb) +static int emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf, + bool tail_call_reachable, bool is_subprog, + bool is_exception_cb) { u8 *prog = *pprog; + int offset; + offset = emit_cfi(&prog); /* BPF trampoline can be made to work without these nops, * but let's waste 5 bytes for now and optimize later */ - EMIT_ENDBR(); memcpy(prog, x86_nops[5], X86_PATCH_SIZE); prog += X86_PATCH_SIZE; if (!ebpf_from_cbpf) { @@ -357,6 +426,8 @@ static void emit_prologue(u8 **pprog, u3 if (tail_call_reachable) EMIT1(0x50); /* push rax */ *pprog = prog; + + return offset; } static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) @@ -1083,8 +1154,8 @@ static int do_jit(struct bpf_prog *bpf_p bool tail_call_seen = false; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, excnt = 0; int ilen, proglen = 0; + int i, excnt = 0; u8 *prog = temp; int err; @@ -1094,9 +1165,12 @@ static int do_jit(struct bpf_prog *bpf_p /* tail call's presence in current prog implies it is reachable */ tail_call_reachable |= tail_call_seen; - emit_prologue(&prog, bpf_prog->aux->stack_depth, - bpf_prog_was_classic(bpf_prog), tail_call_reachable, - bpf_is_subprog(bpf_prog), bpf_prog->aux->exception_cb); + ctx->prog_offset = emit_prologue(&prog, bpf_prog->aux->stack_depth, + bpf_prog_was_classic(bpf_prog), + tail_call_reachable, + bpf_is_subprog(bpf_prog), + bpf_prog->aux->exception_cb); + /* Exception callback will clobber callee regs for its own use, and * restore the original callee regs from main prog's stack frame. */ @@ -2935,9 +3009,9 @@ struct bpf_prog *bpf_int_jit_compile(str jit_data->header = header; jit_data->rw_header = rw_header; } - prog->bpf_func = (void *)image; + prog->bpf_func = (void *)image + ctx.prog_offset; prog->jited = 1; - prog->jited_len = proglen; + prog->jited_len = proglen - ctx.prog_offset; // XXX? } else { prog = orig_prog; } --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include <linux/rcupdate_trace.h> #include <linux/static_call.h> #include <linux/memcontrol.h> +#include <linux/cfi.h> struct bpf_verifier_env; struct bpf_verifier_log; @@ -1188,7 +1189,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1278,7 +1283,7 @@ int arch_prepare_bpf_dispatcher(void *im #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \
The current BPF call convention is __nocfi, except when it calls !JIT things, then it calls regular C functions. It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. Specifically __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target. Causing #CP. Notably this only triggers on IBT enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway). Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> --- arch/x86/include/asm/cfi.h | 12 +++++ arch/x86/kernel/alternative.c | 41 ++++++++++++++--- arch/x86/net/bpf_jit_comp.c | 96 +++++++++++++++++++++++++++++++++++++----- include/linux/bpf.h | 9 +++ 4 files changed, 137 insertions(+), 21 deletions(-)