
[RFC,v2,20/21] x86: Add support for CONFIG_CFI_CLANG

Message ID 20220513202159.1550547-21-samitolvanen@google.com (mailing list archive)
State Superseded
Series KCFI support

Commit Message

Sami Tolvanen May 13, 2022, 8:21 p.m. UTC
With CONFIG_CFI_CLANG, the compiler injects a type preamble
immediately before each function and a check to validate the target
function type before indirect calls:

  ; type preamble
  __cfi_function:
    int3
    int3
    mov <id>, %eax
    int3
    int3
  function:
    ...
  ; indirect call check
    cmpl    <id>, -6(%r11)
    je      .Ltmp1
    ud2
  .Ltmp1:
    call    __x86_indirect_thunk_r11

Define the __CFI_TYPE helper macro for manual type annotations in
assembly code, add error handling for the CFI ud2 traps, and allow
CONFIG_CFI_CLANG to be selected on x86_64.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
---
 arch/x86/Kconfig               |  2 ++
 arch/x86/include/asm/linkage.h | 12 +++++++
 arch/x86/kernel/traps.c        | 60 +++++++++++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 1 deletion(-)
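
As a purely illustrative sketch (not part of the patch), the kind of C code
that receives this instrumentation is an ordinary indirect call through a
function pointer; direct calls are not checked:

struct foo_ops {
	int (*handler)(int arg);
};

int call_handler(struct foo_ops *ops, int arg)
{
	/*
	 * With CONFIG_CFI_CLANG, the compiler emits the cmpl/je/ud2 sequence
	 * shown above at this call site, comparing the type id stored in the
	 * target's preamble against the id derived from the pointer's
	 * prototype, int (*)(int), before making the indirect call.
	 */
	return ops->handler(arg);
}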

Comments

Kees Cook May 14, 2022, 10:02 p.m. UTC | #1
On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> With CONFIG_CFI_CLANG, the compiler injects a type preamble
> immediately before each function and a check to validate the target
> function type before indirect calls:
> 
>   ; type preamble
>   __cfi_function:
>     int3
>     int3
>     mov <id>, %eax
>     int3
>     int3
>   function:
>     ...
>   ; indirect call check
>     cmpl    <id>, -6(%r11)
>     je      .Ltmp1
>     ud2
>   .Ltmp1:
>     call    __x86_indirect_thunk_r11
> 
> Define the __CFI_TYPE helper macro for manual type annotations in
> assembly code, add error handling for the CFI ud2 traps, and allow
> CONFIG_CFI_CLANG to be selected on x86_64.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
> ---
>  arch/x86/Kconfig               |  2 ++
>  arch/x86/include/asm/linkage.h | 12 +++++++
>  arch/x86/kernel/traps.c        | 60 +++++++++++++++++++++++++++++++++-
>  3 files changed, 73 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 4bed3abf444d..2e73d0792d48 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -108,6 +108,8 @@ config X86
>  	select ARCH_SUPPORTS_PAGE_TABLE_CHECK	if X86_64
>  	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
>  	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
> +	select ARCH_SUPPORTS_CFI_CLANG		if X86_64
> +	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI_CLANG
>  	select ARCH_SUPPORTS_LTO_CLANG
>  	select ARCH_SUPPORTS_LTO_CLANG_THIN
>  	select ARCH_USE_BUILTIN_BSWAP
> diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
> index 85865f1645bd..0ee4a0af3974 100644
> --- a/arch/x86/include/asm/linkage.h
> +++ b/arch/x86/include/asm/linkage.h
> @@ -25,6 +25,18 @@
>  #define RET	ret
>  #endif
>  
> +#ifdef CONFIG_CFI_CLANG
> +#define __CFI_TYPE(name)					\
> +	.fill 7, 1, 0xCC ASM_NL					\
> +	SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE)	\
> +	int3 ASM_NL						\
> +	int3 ASM_NL						\
> +	mov __kcfi_typeid_##name, %eax ASM_NL			\
> +	int3 ASM_NL						\
> +	int3 ASM_NL						\
> +	SYM_FUNC_END(__cfi_##name)
> +#endif
> +
>  #else /* __ASSEMBLY__ */
>  
>  #ifdef CONFIG_SLS
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 1563fb995005..320e257eb4be 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -40,6 +40,7 @@
>  #include <linux/hardirq.h>
>  #include <linux/atomic.h>
>  #include <linux/ioasid.h>
> +#include <linux/cfi.h>
>  
>  #include <asm/stacktrace.h>
>  #include <asm/processor.h>
> @@ -295,6 +296,62 @@ static inline void handle_invalid_op(struct pt_regs *regs)
>  		      ILL_ILLOPN, error_get_trap_addr(regs));
>  }
>  
> +#ifdef CONFIG_CFI_CLANG
> +static void decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
> +			    unsigned long *type)
> +{
> +	char buffer[MAX_INSN_SIZE];
> +	struct insn insn;
> +	int offset;
> +
> +	*target = *type = 0;

Should report_cfi_failure() have some additional hinting for the case
where target/type are zero? Like, "hey, got an inexplicable CFI failure
here, but preamble decode failed. Yikes!"

Reviewed-by: Kees Cook <keescook@chromium.org>
Kees Cook May 15, 2022, 3:19 a.m. UTC | #2
On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> With CONFIG_CFI_CLANG, the compiler injects a type preamble
> immediately before each function and a check to validate the target
> function type before indirect calls:
> 
>   ; type preamble
>   __cfi_function:
>     int3
>     int3
>     mov <id>, %eax
>     int3
>     int3
>   function:
>     ...
>   ; indirect call check
>     cmpl    <id>, -6(%r11)
>     je      .Ltmp1
>     ud2
>   .Ltmp1:
>     call    __x86_indirect_thunk_r11
> 
> Define the __CFI_TYPE helper macro for manual type annotations in
> assembly code, add error handling for the CFI ud2 traps, and allow
> CONFIG_CFI_CLANG to be selected on x86_64.
> 
> Signed-off-by: Sami Tolvanen <samitolvanen@google.com>

Looks good testing with LKDTM...

$ echo CFI_FORWARD_PROTO | cat >/sys/kernel/debug/provoke-crash/DIRECT
[  144.084017] lkdtm: Performing direct entry CFI_FORWARD_PROTO
[  144.086138] lkdtm: Calling matched prototype ...
[  144.087833] lkdtm: Calling mismatched prototype ...
[  144.089777] CFI failure at lkdtm_CFI_FORWARD_PROTO+0x34/0x67 [lkdtm] (target: lkdtm_increment_int+0x0/0x7 [lkdtm]; expected type: 0x7e0c52a5)


Tested-by: Kees Cook <keescook@chromium.org>
David Laight May 16, 2022, 8:32 a.m. UTC | #3
From: Sami Tolvanen
> Sent: 13 May 2022 21:22
> 
> With CONFIG_CFI_CLANG, the compiler injects a type preamble
> immediately before each function and a check to validate the target
> function type before indirect calls:
> 
>   ; type preamble
>   __cfi_function:
>     int3
>     int3
>     mov <id>, %eax

Interesting - since this code can't be executed there is no
point adding an instruction 'prefix' to the 32bit constant.
 
>     int3
>     int3
>   function:
>     ...
>   ; indirect call check
>     cmpl    <id>, -6(%r11)
>     je      .Ltmp1
>     ud2
>   .Ltmp1:
>     call    __x86_indirect_thunk_r11
> 
> Define the __CFI_TYPE helper macro for manual type annotations in
> assembly code, add error handling for the CFI ud2 traps, and allow
> CONFIG_CFI_CLANG to be selected on x86_64.
> 
...
> +
> +	/*
> +	 * The compiler generates the following instruction sequence
> +	 * for indirect call checks:
> +	 *
> +	 *   cmpl    <id>, -6(%reg)	; 7 bytes

If the <id> is between -128 and 127 then an 8bit constant
(sign extended) might be used.
Possibly the compiler forces the assembler to generate the
long form.

There could also be a REX prefix.
That will break any code that tries to use %reg.

> +	 *   je      .Ltmp1		; 2 bytes
> +	 *   ud2			; <- addr
> +	 *   .Ltmp1:
> +	 *
> +	 * Both the type and the target address can be decoded from the
> +	 * cmpl instruction.
> +	 */
> +	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 9, MAX_INSN_SIZE))
> +		return;
> +	if (insn_decode_kernel(&insn, buffer))
> +		return;
> +	if (insn.opcode.value != 0x81 || X86_MODRM_REG(insn.modrm.value) != 7)
> +		return;

Since you are looking for a very specific opcode why bother
calling insn_decode_kernel() - just check for the required (masked)
byte values.

> +
> +	*type = insn.immediate.value;
> +
> +	offset = insn_get_modrm_rm_off(&insn, regs);

Given the expected instruction, isn't that -6 ??
> +	if (offset < 0)
> +		return;
> +
> +	*target = *(unsigned long *)((void *)regs + offset);

WTF is that calculating??

> +}
...

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Peter Zijlstra May 16, 2022, 9:54 a.m. UTC | #4
On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> With CONFIG_CFI_CLANG, the compiler injects a type preamble
> immediately before each function and a check to validate the target
> function type before indirect calls:
> 
>   ; type preamble
>   __cfi_function:
>     int3
>     int3
>     mov <id>, %eax
>     int3
>     int3
>   function:
>     ...

When I enable CFI_CLANG and X86_KERNEL_IBT I get:

0000000000000c80 <__cfi_io_schedule_timeout>:
c80:   cc                      int3
c81:   cc                      int3
c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
c87:   cc                      int3
c88:   cc                      int3

0000000000000c89 <io_schedule_timeout>:
c89:   f3 0f 1e fa             endbr64


That seems unfortunate. Would it be possible to get an additional
compiler option to suppress the endbr for all symbols that get a __cfi_
preamble?

Also, perhaps s/CFI_CLANG/KERNEL_CFI/ or somesuch, so that GCC might
also implement this same scheme (in time)?

>   ; indirect call check
>     cmpl    <id>, -6(%r11)
>     je      .Ltmp1
>     ud2
>   .Ltmp1:
>     call    __x86_indirect_thunk_r11

The first one I try and find looks like:

26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
30:       0f 0b                   ud2
32:       4c 89 f6                mov    %r14,%rsi
35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4

This must not be. If I'm to rewrite that lot to:

  movl	$\hash, %r10d
  sub	$9, %r11
  call	*%r11
  .nop  4

Then there must not be spurious instructions in between the ud2 and the
indirect call/retpoline thing.
Peter Zijlstra May 16, 2022, 11:45 a.m. UTC | #5
On Mon, May 16, 2022 at 11:54:33AM +0200, Peter Zijlstra wrote:
> On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > immediately before each function and a check to validate the target
> > function type before indirect calls:
> > 
> >   ; type preamble
> >   __cfi_function:
> >     int3
> >     int3
> >     mov <id>, %eax
> >     int3
> >     int3
> >   function:
> >     ...
> 
> When I enable CFI_CLANG and X86_KERNEL_IBT I get:
> 
> 0000000000000c80 <__cfi_io_schedule_timeout>:
> c80:   cc                      int3
> c81:   cc                      int3
> c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
> c87:   cc                      int3
> c88:   cc                      int3
> 
> 0000000000000c89 <io_schedule_timeout>:
> c89:   f3 0f 1e fa             endbr64
> 
> 
> That seems unfortunate. Would it be possible to get an additional
> compiler option to suppress the endbr for all symbols that get a __cfi_
> preaamble?
> 
> Also, perhaps s/CFI_CLANG/KERNEL_CFI/ or somesuch, so that GCC might
> also implement this same scheme (in time)?
> 
> >   ; indirect call check
> >     cmpl    <id>, -6(%r11)
> >     je      .Ltmp1
> >     ud2
> >   .Ltmp1:
> >     call    __x86_indirect_thunk_r11
> 
> The first one I try and find looks like:
> 
> 26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
> 2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
> 30:       0f 0b                   ud2
> 32:       4c 89 f6                mov    %r14,%rsi
> 35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4
> 
> This must not be. If I'm to rewrite that lot to:
> 
>   movl	$\hash, %r10d
>   sub	$9, %r11
>   call	*%r11
>   .nop  4
> 
> Then there must not be spurious instruction in between the ud2 and the
> indirect call/retpoline thing.

Hmmm.. when I replace it with:

   movl	$\hash, %r10d
   sub	$9, %r11
   .nops 2

That would work, that has the added benefit of nicely co-existing with
the current retpoline patching.

The only remaining problem is how to find this; the .retpoline_sites is
fairly convenient, but if the compiler can put arbitrary amounts of code
in between, this is going to be somewhat tedious.
Peter Zijlstra May 16, 2022, 12:58 p.m. UTC | #6
On Mon, May 16, 2022 at 01:45:17PM +0200, Peter Zijlstra wrote:
> On Mon, May 16, 2022 at 11:54:33AM +0200, Peter Zijlstra wrote:
> > On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:

> > >   ; indirect call check
> > >     cmpl    <id>, -6(%r11)
> > >     je      .Ltmp1
> > >     ud2
> > >   .Ltmp1:
> > >     call    __x86_indirect_thunk_r11
> > 
> > The first one I try and find looks like:
> > 
> > 26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
> > 2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
> > 30:       0f 0b                   ud2
> > 32:       4c 89 f6                mov    %r14,%rsi
> > 35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4
> > 
> > This must not be. If I'm to rewrite that lot to:
> > 
> >   movl	$\hash, %r10d
> >   sub	$9, %r11
> >   call	*%r11
> >   .nop  4
> > 
> > Then there must not be spurious instruction in between the ud2 and the
> > indirect call/retpoline thing.
> 
> Hmmm.. when I replace it with:
> 
>    movl	$\hash, %r10d
>    sub	$9, %r11
>    .nops 2
> 
> That would work, that has the added benefit of nicely co-existing with
> the current retpoline patching.
> 
> The only remaining problem is how to find this; the .retpoline_sites is
> fairly concenient, but if the compiler can put arbitrary amounts of code
> in between this is going to be somewhat tedious.
> 

Something like the *completely* untested below. It compiles, but because of
the above code-gen issue it *cannot* work.

(also, it lacks module support)

@willy, how horribly broken is this xarray usage?

---
 arch/x86/include/asm/traps.h            |   1 +
 arch/x86/kernel/alternative.c           | 316 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/common.c            |   5 +
 arch/x86/kernel/vmlinux.lds.S           |   9 +
 tools/objtool/check.c                   |  67 ++++++-
 tools/objtool/include/objtool/objtool.h |   1 +
 tools/objtool/objtool.c                 |   1 +
 7 files changed, 399 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 35317c5c551d..a423343cffbc 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -19,6 +19,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *e
 #endif
 
 extern bool ibt_selftest(void);
+extern bool ibt_broken;
 
 #ifdef CONFIG_X86_F00F_BUG
 /* For handling the FOOF bug */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d374cb3cf024..abce4e78a1e0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -18,6 +18,9 @@
 #include <linux/mmu_context.h>
 #include <linux/bsearch.h>
 #include <linux/sync_core.h>
+#include <linux/moduleloader.h>
+#include <linux/xarray.h>
+#include <linux/set_memory.h>
 #include <asm/text-patching.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
@@ -115,6 +118,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 }
 
 extern s32 __retpoline_sites[], __retpoline_sites_end[];
+extern s32 __cfi_sites[], __cfi_sites_end[];
 extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
@@ -549,6 +553,315 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
 
 #endif /* CONFIG_X86_KERNEL_IBT */
 
+#ifdef CONFIG_CFI_CLANG
+/*
+ *  FineIBT                                         kCFI
+ *
+ *  __fineibt_\hash:
+ *    xor     \hash, %r10     # 7
+ *    jz      1f              # 2
+ *    ud2                     # 2
+ *  1:ret                     # 1
+ *    int3                    # 1
+ *
+ *
+ *  __cfi_\sym:                                     __cfi_\sym:
+ *                                                    int3; int3                              # 2
+ *    endbr                   # 4                     mov     \hash, %eax                     # 5
+ *    call    __fineibt_\hash # 5                     int3; int3                              # 2
+ *  \sym:                                           \sym:
+ *  ...                                             ...
+ *
+ *
+ *  caller:                                         caller:
+ *    movl    \hash, %r10d    # 6                     cmpl    \hash, -6(%r11)                 # 8
+ *    sub     $9, %r11        # 4                     je      1f                              # 2
+ *    nop2                    # 2                     ud2                                     # 2
+ *
+ *    call    *%r11           # 3                     call    __x86_indirect_thunk_r11        # 5
+ *    nop2                    # 2
+ */
+
+static DEFINE_XARRAY(cfi_hashes);
+static int nr_cfi_hashes;
+
+static u32 decode_cfi_preamble(void *addr)
+{
+	u8 *p = addr;
+
+	if (p[0] == 0xcc && p[1] == 0xcc &&
+	    p[2] == 0xb8 &&
+	    p[7] == 0xcc && p[8] == 0xcc)
+		return *(u32 *)(addr + 3);
+
+	return 0; /* invalid hash value */
+}
+
+static u32 decode_cfi_caller(void *addr)
+{
+	u8 *p = addr;
+
+	if (((p[0] == 0x41 && p[1] == 0x81) ||
+	     (p[0] == 0xeb && p[1] == 0x0a)) && p[2] == 0x7b &&
+	    p[8] == 0x74 && p[9] == 0x02 &&
+	    p[10] == 0x0f && p[11] == 0x0b)
+		return *(u32 *)(addr + 4);
+
+	return 0; /* invalid hash value */
+}
+
+// .cfi_sites
+static int cfi_index_hashes(s32 *start, s32 *end)
+{
+	s32 *s;
+
+	for (s = start; s < end; s++) {
+		void *addr = (void *)s + *s;
+		void *xa;
+		u32 hash;
+
+		hash = decode_cfi_preamble(addr);
+		if (!hash) {
+			//WARN();
+			return -EINVAL;
+		}
+
+		xa = xa_store(&cfi_hashes, hash, NULL, GFP_KERNEL);
+		if (xa_is_err(xa)) {
+			//WARN();
+			return xa_err(xa);
+		}
+		nr_cfi_hashes++;
+	}
+
+	return 0;
+}
+
+asm (	".pushsection .rodata\n"
+	"fineibt_template_start:\n"
+	"	xorl	$0x12345678, %r10d\n"	// 7
+	"	je	1f\n"			// 2
+	"	ud2\n"				// 2
+	"1:	ret\n"				// 1
+	"	int3\n"
+	"	int3\n"
+	"	int3\n"
+	"	int3\n"				// 4
+	"fineibt_template_end:\n"
+	".popsection\n"
+    );
+
+extern u8 fineibt_template_start[];
+extern u8 fineibt_template_end[];
+
+static int cfi_create_fineibt_stubs(void)
+{
+	size_t size = 16 * nr_cfi_hashes;
+	int pages = 1 + ((size - 1) >> PAGE_SHIFT);
+	void *text, *entry, *xa;
+	unsigned long hash;
+	int err = -ENOMEM;
+
+	text = module_alloc(size);
+	if (!text)
+		return err;
+
+	entry = text;
+	xa_for_each(&cfi_hashes, hash, xa) {
+
+		memcpy(entry, fineibt_template_start, 16);
+		*(u32 *)(entry + 3) = hash;
+
+		xa = xa_store(&cfi_hashes, hash, entry, GFP_KERNEL);
+		if (xa_is_err(xa)) {
+			err = xa_err(xa);
+			goto err_alloc;
+		}
+		if (xa) {
+			err = -EINVAL;
+			goto err_alloc;
+		}
+
+		entry += 16;
+	}
+
+	set_memory_ro((unsigned long)text, pages);
+	set_memory_x((unsigned long)text, pages);
+
+	return 0;
+
+err_alloc:
+	module_memfree(text);
+	return -EINVAL;
+}
+
+// .retpoline_sites
+static int cfi_disable_callers(s32 *start, s32 *end)
+{
+	/*
+	 * Disable CFI by patching in a 2 byte JMP, this leaves the hash
+	 * intact for later usage. Also see decode_cfi_caller() and
+	 * cfi_rewrite_callers().
+	 */
+	const u8 jmp12[] = { 0xeb, 0x0a };
+	s32 *s;
+
+	for (s = start; s < end; s++) {
+		void *addr = (void *)s + *s;
+		u32 hash;
+
+		hash = decode_cfi_caller(addr - 12);
+		if (!hash) {
+			// WARN();
+			return -EINVAL;
+		}
+
+		text_poke_early(addr - 12, jmp12, 2);
+	}
+
+	return 0;
+}
+
+asm (	".pushsection .rodata\n"
+	"fineibt_cfi_start:\n"
+	"	endbr64\n"
+	"	call fineibt_caller_start\n"
+	"fineibt_cfi_end:"
+	".popsection\n"
+    );
+
+extern u8 fineibt_cfi_start[];
+extern u8 fineibt_cfi_end[];
+
+// .cfi_sites
+static int cfi_rewrite_cfi(s32 *start, s32 *end)
+{
+	s32 *s;
+
+	for (s = start; s < end; s++) {
+		void *dest, *addr = (void *)s + *s;
+		unsigned long index;
+		u32 hash;
+
+		index = hash = decode_cfi_preamble(addr);
+		dest = xa_find(&cfi_hashes, &index, hash, XA_PRESENT);
+
+		if (WARN_ON_ONCE(index != hash || !dest))
+			return -EINVAL;
+
+		text_poke_early(addr, fineibt_cfi_start,
+				(fineibt_cfi_end - fineibt_cfi_start));
+
+		__text_gen_insn(addr + 4,
+				CALL_INSN_OPCODE, addr + 4,
+				dest, CALL_INSN_SIZE);
+	}
+
+	return 0;
+}
+
+asm (	".pushsection .rodata\n"
+	"fineibt_caller_start:\n"
+	"	movl	$0x12345678, %r10d\n"
+	"	sub	$9, %r11\n"
+	"	.nops	2\n"
+	"fineibt_caller_end:"
+	".popsection\n"
+    );
+
+extern u8 fineibt_caller_start[];
+extern u8 fineibt_caller_end[];
+
+// .retpoline_sites
+static int cfi_rewrite_callers(s32 *start, s32 *end)
+{
+	s32 *s;
+
+	for (s = start; s < end; s++) {
+		void *addr = (void *)s + *s;
+		u32 hash;
+
+		hash = decode_cfi_caller(addr - 12);
+
+		if (WARN_ON_ONCE(!hash))
+			return -EINVAL;
+
+		text_poke_early(addr - 12, fineibt_caller_start,
+				(fineibt_caller_end - fineibt_caller_start));
+
+		*(u32 *)(addr - 12 + 2) = hash;
+
+		/* rely on apply_retpolines() to rewrite the actual call */
+	}
+
+	return 0;
+}
+
+bool __ro_after_init ibt_broken = false;
+
+static void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+			  s32 *start_cfi, s32 *end_cfi)
+{
+	int ret;
+
+	/* If IBT, use FineIBT */
+	if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT))
+		return;
+
+	/*
+	 * Find and count all unique hash values.
+	 */
+	ret = cfi_index_hashes(start_cfi, end_cfi);
+	if (ret)
+		goto err;
+
+	/*
+	 * Allocate module memory and write FineIBT stubs.
+	 */
+	ret = cfi_create_fineibt_stubs();
+	if (ret)
+		goto err;
+
+	/*
+	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
+	 * rewrite them. Disables all CFI. If this succeeds but any of the
+	 * later stages fails, we're CFI-less.
+	 */
+	ret = cfi_disable_callers(start_retpoline, end_retpoline);
+	if (ret)
+		goto err;
+
+	/*
+	 * Rewrite the __cfi_ stubs from kCFI to FineIBT.
+	 */
+	ret = cfi_rewrite_cfi(start_cfi, end_cfi);
+	if (ret)
+		goto err;
+
+	/*
+	 * Now that everything is in place; rewrite the callers to FineIBT.
+	 */
+	ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
+	if (ret)
+		goto err;
+
+	return;
+
+err:
+	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
+	/* must *NOT* enable IBT */
+	ibt_broken = true;
+}
+
+#else
+
+static void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
+			  s32 *start_cfi, s32 *end_cfi)
+{
+}
+
+#endif
+
 #ifdef CONFIG_SMP
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
 				  u8 *text, u8 *text_end)
@@ -855,6 +1168,9 @@ void __init alternative_instructions(void)
 	 */
 	apply_paravirt(__parainstructions, __parainstructions_end);
 
+	apply_fineibt(__retpoline_sites, __retpoline_sites_end,
+		      __cfi_sites, __cfi_sites_end);
+
 	/*
 	 * Rewrite the retpolines, must be done before alternatives since
 	 * those can rewrite the retpoline thunks.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e342ae4db3c4..e4377256b952 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -630,6 +630,11 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c)
 	    !cpu_feature_enabled(X86_FEATURE_IBT))
 		return;
 
+#ifdef CONFIG_CFI_CLANG
+	if (ibt_broken)
+		return;
+#endif
+
 	wrmsrl(MSR_IA32_S_CET, msr);
 	cr4_set_bits(X86_CR4_CET);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 7fda7f27e762..72ffc91ddd20 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -294,6 +294,15 @@ SECTIONS
 	}
 #endif
 
+#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_RETPOLINE) && defined(CONFIG_X86_KERNEL_IBT)
+	. = ALIGN(8);
+	.cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
+		__cfi_sites = .;
+		*(.cfi_sites)
+		__cfi_sites_end = .;
+	}
+#endif
+
 	/*
 	 * struct alt_inst entries. From the header (alternative.h):
 	 * "Alternative instructions for different CPU types or capabilities"
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 88f005ae6dcc..edc8aecf229c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -797,6 +797,52 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file)
 	return 0;
 }
 
+static int create_cfi_sections(struct objtool_file *file)
+{
+	struct instruction *insn;
+	struct section *sec;
+	int idx;
+
+	sec = find_section_by_name(file->elf, ".cfi_sites");
+	if (sec) {
+		WARN("file already has .cfi_sites, skipping");
+		return 0;
+	}
+
+	idx = 0;
+	list_for_each_entry(insn, &file->cfi_list, call_node)
+		idx++;
+
+	if (!idx)
+		return 0;
+
+	sec = elf_create_section(file->elf, ".cfi_sites", 0,
+				 sizeof(int), idx);
+	if (!sec) {
+		WARN("elf_create_section: .cfi_sites");
+		return -1;
+	}
+
+	idx = 0;
+	list_for_each_entry(insn, &file->cfi_list, call_node) {
+
+		int *site = (int *)sec->data->d_buf + idx;
+		*site = 0;
+
+		if (elf_add_reloc_to_insn(file->elf, sec,
+					  idx * sizeof(int),
+					  R_X86_64_PC32,
+					  insn->sec, insn->offset)) {
+			WARN("elf_add_reloc_to_insn: .cfi_sites");
+			return -1;
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
+
 static int create_mcount_loc_sections(struct objtool_file *file)
 {
 	struct section *sec;
@@ -3301,6 +3347,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 {
 	struct alternative *alt;
 	struct instruction *next_insn, *prev_insn = NULL;
+	struct instruction *first_insn = insn;
 	struct section *sec;
 	u8 visited;
 	int ret;
@@ -3312,8 +3359,19 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
 
 		if (func && insn->func && func != insn->func->pfunc) {
 			/* Ignore KCFI type preambles, which always fall through */
-			if (!strncmp(func->name, "__cfi_", 6))
+			if (!strncmp(func->name, "__cfi_", 6)) {
+				/*
+				 * If the function has a __cfi_ preamble, the endbr
+				 * will live in there.
+				 */
+				insn->noendbr = true;
+				/*
+				 * The preamble starts with INSN_TRAP,
+				 * call_node cannot be used.
+				 */
+				list_add_tail(&first_insn->call_node, &file->cfi_list);
 				return 0;
+			}
 
 			WARN("%s() falls through to next function %s()",
 			     func->name, insn->func->name);
@@ -3953,6 +4011,13 @@ int check(struct objtool_file *file)
 		warnings += ret;
 	}
 
+	if (ibt && retpoline) {
+		ret = create_cfi_sections(file);
+		if (ret < 0)
+			goto out;
+		warnings += ret;
+	}
+
 	if (stats) {
 		printf("nr_insns_visited: %ld\n", nr_insns_visited);
 		printf("nr_cfi: %ld\n", nr_cfi);
diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
index a6e72d916807..93f52e275fa6 100644
--- a/tools/objtool/include/objtool/objtool.h
+++ b/tools/objtool/include/objtool/objtool.h
@@ -27,6 +27,7 @@ struct objtool_file {
 	struct list_head static_call_list;
 	struct list_head mcount_loc_list;
 	struct list_head endbr_list;
+	struct list_head cfi_list;
 	bool ignore_unreachables, hints, rodata;
 
 	unsigned int nr_endbr;
diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
index 843ff3c2f28e..16ed3613b0e2 100644
--- a/tools/objtool/objtool.c
+++ b/tools/objtool/objtool.c
@@ -129,6 +129,7 @@ struct objtool_file *objtool_open_read(const char *_objname)
 	INIT_LIST_HEAD(&file.static_call_list);
 	INIT_LIST_HEAD(&file.mcount_loc_list);
 	INIT_LIST_HEAD(&file.endbr_list);
+	INIT_LIST_HEAD(&file.cfi_list);
 	file.ignore_unreachables = no_unreachable;
 	file.hints = false;
Sami Tolvanen May 16, 2022, 4:39 p.m. UTC | #7
On Mon, May 16, 2022 at 1:32 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Sami Tolvanen
> > Sent: 13 May 2022 21:22
> >
> > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > immediately before each function and a check to validate the target
> > function type before indirect calls:
> >
> >   ; type preamble
> >   __cfi_function:
> >     int3
> >     int3
> >     mov <id>, %eax
>
> Interesting - since this code can't be executed there is no
> point adding an instruction 'prefix' to the 32bit constant.

The reason to embed the type into an instruction is to avoid the need
to special case objtool's instruction decoder.

> >     int3
> >     int3
> >   function:
> >     ...
> >   ; indirect call check
> >     cmpl    <id>, -6(%r11)
> >     je      .Ltmp1
> >     ud2
> >   .Ltmp1:
> >     call    __x86_indirect_thunk_r11
> >
> > Define the __CFI_TYPE helper macro for manual type annotations in
> > assembly code, add error handling for the CFI ud2 traps, and allow
> > CONFIG_CFI_CLANG to be selected on x86_64.
> >
> ...
> > +
> > +     /*
> > +      * The compiler generates the following instruction sequence
> > +      * for indirect call checks:
> > +      *
> > +      *   cmpl    <id>, -6(%reg)     ; 7 bytes
>
> If the <id> is between -128 and 127 then an 8bit constant
> (sign extended) might be used.
> Possibly the compiler forces the assembler to generate the
> long form.
>
> There could also be a REX prefix.
> That will break any code that tries to use %reg.

The compiler always generates this specific instruction sequence.

> > +      *   je      .Ltmp1             ; 2 bytes
> > +      *   ud2                        ; <- addr
> > +      *   .Ltmp1:
> > +      *
> > +      * Both the type and the target address can be decoded from the
> > +      * cmpl instruction.
> > +      */
> > +     if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 9, MAX_INSN_SIZE))
> > +             return;
> > +     if (insn_decode_kernel(&insn, buffer))
> > +             return;
> > +     if (insn.opcode.value != 0x81 || X86_MODRM_REG(insn.modrm.value) != 7)
> > +             return;
>
> Since you are looking for a very specific opcode why bother
> calling insn_decode_kernel() - just check for the required (masked)
> byte values.

Because I need to decode both the immediate value and the register
from that instruction.

> > +
> > +     *type = insn.immediate.value;
> > +
> > +     offset = insn_get_modrm_rm_off(&insn, regs);
>
> Given the expected instruction, isn't that -6 ??

No, this is the register offset.

> > +     if (offset < 0)
> > +             return;
> > +
> > +     *target = *(unsigned long *)((void *)regs + offset);
>
> WTF is that calculating??

It's reading the register value from pt_regs.

Sami
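
Pieced together from the hunks quoted above, the decode helper under
discussion looks roughly like this (a reconstruction for readability; it does
not yet address the REX-prefix concern raised later in the thread):

static void decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
			    unsigned long *type)
{
	char buffer[MAX_INSN_SIZE];
	struct insn insn;
	int offset;

	*target = *type = 0;

	/* regs->ip points at the ud2; the cmpl starts 9 bytes earlier */
	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 9, MAX_INSN_SIZE))
		return;
	if (insn_decode_kernel(&insn, buffer))
		return;
	if (insn.opcode.value != 0x81 || X86_MODRM_REG(insn.modrm.value) != 7)
		return;

	/* the immediate operand of the cmpl is the expected type id */
	*type = insn.immediate.value;

	/* byte offset of the ModRM register within struct pt_regs */
	offset = insn_get_modrm_rm_off(&insn, regs);
	if (offset < 0)
		return;

	/* the call target is the saved value of that register */
	*target = *(unsigned long *)((void *)regs + offset);
}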
Sami Tolvanen May 16, 2022, 5:15 p.m. UTC | #8
On Mon, May 16, 2022 at 2:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > immediately before each function and a check to validate the target
> > function type before indirect calls:
> >
> >   ; type preamble
> >   __cfi_function:
> >     int3
> >     int3
> >     mov <id>, %eax
> >     int3
> >     int3
> >   function:
> >     ...
>
> When I enable CFI_CLANG and X86_KERNEL_IBT I get:
>
> 0000000000000c80 <__cfi_io_schedule_timeout>:
> c80:   cc                      int3
> c81:   cc                      int3
> c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
> c87:   cc                      int3
> c88:   cc                      int3
>
> 0000000000000c89 <io_schedule_timeout>:
> c89:   f3 0f 1e fa             endbr64
>
>
> That seems unfortunate. Would it be possible to get an additional
> compiler option to suppress the endbr for all symbols that get a __cfi_
> preaamble?

What's the concern with the endbr? Dropping it would currently break
the CFI+IBT combination on newer hardware, no?

> Also, perhaps s/CFI_CLANG/KERNEL_CFI/ or somesuch, so that GCC might
> also implement this same scheme (in time)?

I'm fine with renaming the config.

> >   ; indirect call check
> >     cmpl    <id>, -6(%r11)
> >     je      .Ltmp1
> >     ud2
> >   .Ltmp1:
> >     call    __x86_indirect_thunk_r11
>
> The first one I try and find looks like:
>
> 26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
> 2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
> 30:       0f 0b                   ud2
> 32:       4c 89 f6                mov    %r14,%rsi
> 35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4
>
> This must not be. If I'm to rewrite that lot to:
>
>   movl  $\hash, %r10d
>   sub   $9, %r11
>   call  *%r11
>   .nop  4
>
> Then there must not be spurious instruction in between the ud2 and the
> indirect call/retpoline thing.

With the current compiler patch, LLVM sets up function arguments after
the CFI check. If it's a problem, we can look into changing that.

Sami
Peter Zijlstra May 16, 2022, 6:30 p.m. UTC | #9
On Mon, May 16, 2022 at 10:15:00AM -0700, Sami Tolvanen wrote:
> On Mon, May 16, 2022 at 2:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > > immediately before each function and a check to validate the target
> > > function type before indirect calls:
> > >
> > >   ; type preamble
> > >   __cfi_function:
> > >     int3
> > >     int3
> > >     mov <id>, %eax
> > >     int3
> > >     int3
> > >   function:
> > >     ...
> >
> > When I enable CFI_CLANG and X86_KERNEL_IBT I get:
> >
> > 0000000000000c80 <__cfi_io_schedule_timeout>:
> > c80:   cc                      int3
> > c81:   cc                      int3
> > c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
> > c87:   cc                      int3
> > c88:   cc                      int3
> >
> > 0000000000000c89 <io_schedule_timeout>:
> > c89:   f3 0f 1e fa             endbr64
> >
> >
> > That seems unfortunate. Would it be possible to get an additional
> > compiler option to suppress the endbr for all symbols that get a __cfi_
> > preaamble?
> 
> What's the concern with the endbr? Dropping it would currently break
> the CFI+IBT combination on newer hardware, no?

Well, yes, but also that combination isn't very interesting. See,

  https://lore.kernel.org/all/20220420004241.2093-1-joao@overdrivepizza.com/T/#m5d67fb010d488b2f8eee33f1eb39d12f769e4ad2

and the patch I did down-thread:

  https://lkml.kernel.org/r/YoJKhHluN4n0kZDm@hirez.programming.kicks-ass.net

If we have IBT, then FineIBT is a much better option than kCFI+IBT.
Removing that superfluous endbr also shrinks the whole thing by 4 bytes.

So I'm fine with the compiler generating working code for that
combination; but please get me an option to suppress it in order to save
those pointless bytes. All this CFI stuff is enough bloat as it is.

> > >   ; indirect call check
> > >     cmpl    <id>, -6(%r11)
> > >     je      .Ltmp1
> > >     ud2
> > >   .Ltmp1:
> > >     call    __x86_indirect_thunk_r11
> >
> > The first one I try and find looks like:
> >
> > 26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
> > 2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
> > 30:       0f 0b                   ud2
> > 32:       4c 89 f6                mov    %r14,%rsi
> > 35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4
> >
> > This must not be. If I'm to rewrite that lot to:
> >
> >   movl  $\hash, %r10d
> >   sub   $9, %r11
> >   call  *%r11
> >   .nop  4
> >
> > Then there must not be spurious instruction in between the ud2 and the
> > indirect call/retpoline thing.
> 
> With the current compiler patch, LLVM sets up function arguments after
> the CFI check. if it's a problem, we can look into changing that.

Yes, please fix that. Again see that same patch for why this is a
problem. Objtool can trivially find retpoline calls, but finding this
kCFI gadget is going to be hard work. If you ensure they're
unconditionally stuck together, then the problem goes away: finding one
finds the other.
Sami Tolvanen May 16, 2022, 6:57 p.m. UTC | #10
On Sat, May 14, 2022 at 3:03 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > +#ifdef CONFIG_CFI_CLANG
> > +static void decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
> > +                         unsigned long *type)
> > +{
> > +     char buffer[MAX_INSN_SIZE];
> > +     struct insn insn;
> > +     int offset;
> > +
> > +     *target = *type = 0;
>
> Should report_cfi_failure() have some additional hinting for the case
> where target/type are zero? Like, "hey, got an inexplicable CFI failure
> here, but preamble decode failed. Yikes!"

Good point, I'll add an error message here.

Sami
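
A rough sketch of what such a hint could look like in the ud2 handler; the
handler name and the report_cfi_failure() signature are assumed from the rest
of the series and may differ in the actual v3:

static enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
{
	unsigned long target, type;

	decode_cfi_insn(regs, &target, &type);

	/* assumed addition: complain if the check instruction could not be decoded */
	if (!target && !type)
		pr_warn("CFI failure at %pS: unable to decode the check instruction\n",
			(void *)regs->ip);

	return report_cfi_failure(regs, regs->ip, &target, type);
}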
Sami Tolvanen May 16, 2022, 7:39 p.m. UTC | #11
On Mon, May 16, 2022 at 11:30 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, May 16, 2022 at 10:15:00AM -0700, Sami Tolvanen wrote:
> > On Mon, May 16, 2022 at 2:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > > > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > > > immediately before each function and a check to validate the target
> > > > function type before indirect calls:
> > > >
> > > >   ; type preamble
> > > >   __cfi_function:
> > > >     int3
> > > >     int3
> > > >     mov <id>, %eax
> > > >     int3
> > > >     int3
> > > >   function:
> > > >     ...
> > >
> > > When I enable CFI_CLANG and X86_KERNEL_IBT I get:
> > >
> > > 0000000000000c80 <__cfi_io_schedule_timeout>:
> > > c80:   cc                      int3
> > > c81:   cc                      int3
> > > c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
> > > c87:   cc                      int3
> > > c88:   cc                      int3
> > >
> > > 0000000000000c89 <io_schedule_timeout>:
> > > c89:   f3 0f 1e fa             endbr64
> > >
> > >
> > > That seems unfortunate. Would it be possible to get an additional
> > > compiler option to suppress the endbr for all symbols that get a __cfi_
> > > preaamble?
> >
> > What's the concern with the endbr? Dropping it would currently break
> > the CFI+IBT combination on newer hardware, no?
>
> Well, yes, but also that combination isn't very interesting. See,
>
>   https://lore.kernel.org/all/20220420004241.2093-1-joao@overdrivepizza.com/T/#m5d67fb010d488b2f8eee33f1eb39d12f769e4ad2
>
> and the patch I did down-thread:
>
>   https://lkml.kernel.org/r/YoJKhHluN4n0kZDm@hirez.programming.kicks-ass.net
>
> If we have IBT, then FineIBT is a much better option than kCFI+IBT.
> Removing that superfluous endbr also shrinks the whole thing by 4 bytes.
>
> So I'm fine with the compiler generating working code for that
> combination; but please get me an option to supress it in order to save
> those pointless bytes. All this CFI stuff is enough bloat as it is.

Sure, I'll take a look at what's the best way to accomplish this.

> > > >   ; indirect call check
> > > >     cmpl    <id>, -6(%r11)
> > > >     je      .Ltmp1
> > > >     ud2
> > > >   .Ltmp1:
> > > >     call    __x86_indirect_thunk_r11
> > >
> > > The first one I try and find looks like:
> > >
> > > 26:       41 81 7b fa a6 96 9e 38         cmpl   $0x389e96a6,-0x6(%r11)
> > > 2e:       74 02                   je     32 <__traceiter_sched_kthread_stop+0x29>
> > > 30:       0f 0b                   ud2
> > > 32:       4c 89 f6                mov    %r14,%rsi
> > > 35:       e8 00 00 00 00          call   3a <__traceiter_sched_kthread_stop+0x31> 36: R_X86_64_PLT32      __x86_indirect_thunk_r11-0x4
> > >
> > > This must not be. If I'm to rewrite that lot to:
> > >
> > >   movl  $\hash, %r10d
> > >   sub   $9, %r11
> > >   call  *%r11
> > >   .nop  4
> > >
> > > Then there must not be spurious instruction in between the ud2 and the
> > > indirect call/retpoline thing.
> >
> > With the current compiler patch, LLVM sets up function arguments after
> > the CFI check. if it's a problem, we can look into changing that.
>
> Yes, please fix that. Again see that same patch for why this is a
> problem. Objtool can trivially find retpoline calls, but finding this
> kCFI gadget is going to be hard work. If you ensure they're
> unconditionally stuck together, then the problem goes away find one,
> finds the other.

You can use .kcfi_traps to locate the check right now, but I agree,
it's not quite ideal.

Sami
Peter Zijlstra May 16, 2022, 8:37 p.m. UTC | #12
On Mon, May 16, 2022 at 12:39:19PM -0700, Sami Tolvanen wrote:

> > > With the current compiler patch, LLVM sets up function arguments after
> > > the CFI check. if it's a problem, we can look into changing that.
> >
> > Yes, please fix that. Again see that same patch for why this is a
> > problem. Objtool can trivially find retpoline calls, but finding this
> > kCFI gadget is going to be hard work. If you ensure they're
> > unconditionally stuck together, then the problem goes away find one,
> > finds the other.
> 
> You can use .kcfi_traps to locate the check right now, but I agree,
> it's not quite ideal.

Oohh, indeed. Looking at that, I think .kcfi_traps would be better as
relative offsets; e.g. something like 'addr = (void *)s + *s'. Halves the
amount of storage needed for it.

Also, that code can use a few {} extra.
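
For reference, walking such a table of self-relative offsets is simply the
following (a sketch; the section start/stop symbol names are made up here, not
taken from the series):

extern s32 __start_kcfi_traps[], __stop_kcfi_traps[];

static bool is_cfi_trap(unsigned long addr)
{
	s32 *s;

	for (s = __start_kcfi_traps; s < __stop_kcfi_traps; s++) {
		/* each entry is a 32-bit offset relative to its own address */
		if ((unsigned long)((void *)s + *s) == addr)
			return true;
	}

	return false;
}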
David Laight May 16, 2022, 9:32 p.m. UTC | #13
From: Sami Tolvanen
> Sent: 16 May 2022 17:39
> 
> On Mon, May 16, 2022 at 1:32 AM David Laight <David.Laight@aculab.com> wrote:
> >
> > From: Sami Tolvanen
> > > Sent: 13 May 2022 21:22
> > >
> > > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > > immediately before each function and a check to validate the target
> > > function type before indirect calls:
> > >
> > >   ; type preamble
> > >   __cfi_function:
> > >     int3
> > >     int3
> > >     mov <id>, %eax
> >
> > Interesting - since this code can't be executed there is no
> > point adding an instruction 'prefix' to the 32bit constant.
> 
> The reason to embed the type into an instruction is to avoid the need
> to special case objtool's instruction decoder.
> 
> > >     int3
> > >     int3
> > >   function:
> > >     ...
> > >   ; indirect call check
> > >     cmpl    <id>, -6(%r11)
> > >     je      .Ltmp1
> > >     ud2
> > >   .Ltmp1:
> > >     call    __x86_indirect_thunk_r11
> > >
> > > Define the __CFI_TYPE helper macro for manual type annotations in
> > > assembly code, add error handling for the CFI ud2 traps, and allow
> > > CONFIG_CFI_CLANG to be selected on x86_64.
> > >
> > ...
> > > +
> > > +     /*
> > > +      * The compiler generates the following instruction sequence
> > > +      * for indirect call checks:
> > > +      *
> > > +      *   cmpl    <id>, -6(%reg)     ; 7 bytes
> >
> > If the <id> is between -128 and 127 then an 8bit constant
> > (sign extended) might be used.
> > Possibly the compiler forces the assembler to generate the
> > long form.
> >
> > There could also be a REX prefix.
> > That will break any code that tries to use %reg.
> 
> The compiler always generates this specific instruction sequence.

Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.
Firstly you can use '81 /7 imm32' or '83 /7 imm8' where imm8 is sign extended.
(the /7 1/7/index_reg for a signed 8 bit offset).
But that only works for the original 32bit registers.
For the 64bit r8 to r15 an extra REX prefix is required.
That makes the instruction 8 bytes (if it needs a full 32bit immediate).

So if the register is %r11 there is an extra REX byte.
If the <id> is a hash and happens to be between -128 and 127
then there are three less bytes.

So decoding from regs->ip - 9 isn't always going to give
you the start of the instruction.

> 
> > > +      *   je      .Ltmp1             ; 2 bytes
> > > +      *   ud2                        ; <- addr
> > > +      *   .Ltmp1:
> > > +      *
> > > +      * Both the type and the target address can be decoded from the
> > > +      * cmpl instruction.
> > > +      */
> > > +     if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 9, MAX_INSN_SIZE))
> > > +             return;
> > > +     if (insn_decode_kernel(&insn, buffer))
> > > +             return;
> > > +     if (insn.opcode.value != 0x81 || X86_MODRM_REG(insn.modrm.value) != 7)
> > > +             return;
> >
> > Since you are looking for a very specific opcode why bother
> > calling insn_decode_kernel() - just check for the required (masked)
> > byte values.
> 
> Because I need to decode both the immediate value and the register
> from that instruction.
> 
> > > +
> > > +     *type = insn.immediate.value;
> > > +
> > > +     offset = insn_get_modrm_rm_off(&insn, regs);
> >
> > Given the expected instruction, isn't that -6 ??
> 
> No, this is the register offset.

Hmmm.... strange function name...
> 
> > > +     if (offset < 0)
> > > +             return;
> > > +
> > > +     *target = *(unsigned long *)((void *)regs + offset);
> >
> > WTF is that calculating??
> 
> It's reading the register value from pt_regs.
> 
> Sami

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Peter Zijlstra May 16, 2022, 9:44 p.m. UTC | #14
On Mon, May 16, 2022 at 09:32:55PM +0000, David Laight wrote:

> > The compiler always generates this specific instruction sequence.
> 
> Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.

Yes, but we don't care. This *always* uses the 32bit immediate form.
Even if the immediate is small.
Sami Tolvanen May 16, 2022, 10:03 p.m. UTC | #15
On Mon, May 16, 2022 at 2:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, May 16, 2022 at 09:32:55PM +0000, David Laight wrote:
>
> > > The compiler always generates this specific instruction sequence.
> >
> > Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.
>
> Yes, but we don't care. This *always* uses the 32bit immediate form.
> Even if the immediate is small.

Yes, that part is not a problem, but it's a valid point that LLVM
might not always use r8-r15 here, so I will have to check for the REX
prefix before blindly attempting to decode the instruction.

Sami
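
One possible shape for that check, purely as a sketch (not what ended up in
the series): read back far enough to cover an optional REX prefix and skip
over it before decoding the cmpl.

/*
 * Sketch only: find the start of the cmpl, which may or may not carry a
 * REX prefix (r8-r15 vs. legacy registers). Not the actual fix.
 */
static int cfi_decode_cmpl(struct pt_regs *regs, struct insn *insn)
{
	char buffer[MAX_INSN_SIZE + 1];
	int skip = 1;

	/* worst case in front of the ud2: REX(1) + cmpl(7) + je(2) */
	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 10, sizeof(buffer)))
		return -EFAULT;

	/* REX prefixes are 0x40..0x4f; with one, the cmpl starts at ip - 10 */
	if (((u8)buffer[0] & 0xf0) == 0x40)
		skip = 0;

	return insn_decode_kernel(insn, buffer + skip);
}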
Kees Cook May 16, 2022, 10:59 p.m. UTC | #16
On Mon, May 16, 2022 at 08:30:47PM +0200, Peter Zijlstra wrote:
> On Mon, May 16, 2022 at 10:15:00AM -0700, Sami Tolvanen wrote:
> > On Mon, May 16, 2022 at 2:54 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, May 13, 2022 at 01:21:58PM -0700, Sami Tolvanen wrote:
> > > > With CONFIG_CFI_CLANG, the compiler injects a type preamble
> > > > immediately before each function and a check to validate the target
> > > > function type before indirect calls:
> > > >
> > > >   ; type preamble
> > > >   __cfi_function:
> > > >     int3
> > > >     int3
> > > >     mov <id>, %eax
> > > >     int3
> > > >     int3
> > > >   function:
> > > >     ...
> > >
> > > When I enable CFI_CLANG and X86_KERNEL_IBT I get:
> > >
> > > 0000000000000c80 <__cfi_io_schedule_timeout>:
> > > c80:   cc                      int3
> > > c81:   cc                      int3
> > > c82:   b8 b5 b1 39 b3          mov    $0xb339b1b5,%eax
> > > c87:   cc                      int3
> > > c88:   cc                      int3
> > >
> > > 0000000000000c89 <io_schedule_timeout>:
> > > c89:   f3 0f 1e fa             endbr64
> > >
> > >
> > > That seems unfortunate. Would it be possible to get an additional
> > > compiler option to suppress the endbr for all symbols that get a __cfi_
> > > preaamble?
> > 
> > What's the concern with the endbr? Dropping it would currently break
> > the CFI+IBT combination on newer hardware, no?
> 
> Well, yes, but also that combination isn't very interesting. See,
> 
>   https://lore.kernel.org/all/20220420004241.2093-1-joao@overdrivepizza.com/T/#m5d67fb010d488b2f8eee33f1eb39d12f769e4ad2
> 
> and the patch I did down-thread:
> 
>   https://lkml.kernel.org/r/YoJKhHluN4n0kZDm@hirez.programming.kicks-ass.net
> 
> If we have IBT, then FineIBT is a much better option than kCFI+IBT.

I'm still not convinced about this, but I'm on the fence.

Cons:
- FineIBT does callee-based hash verification, which means any
  attacker-constructed memory region just has to have an endbr and nops at
  "shellcode - 9". KCFI would need the region to have the hash at
  "shellcode - 6" and an endbr at "shellcode". However, that hash is well
  known, so it's not much protection.
- Potential performance hit due to making an additional "call" outside
  the cache lines of both caller and callee.

Pros:
- FineIBT can be done without read access to the kernel text, which will
  be nice in the exec-only future.

I'd kind of like the "dynamic FineIBT conversion" to be a config option,
at least at first. We could at least do performance comparisons between
them.

> Removing that superfluous endbr also shrinks the whole thing by 4 bytes.
> 
> So I'm fine with the compiler generating working code for that
> combination; but please get me an option to supress it in order to save
> those pointless bytes. All this CFI stuff is enough bloat as it is.

So, in the case of "built for IBT but running on a system without IBT",
no rewrite happens, and no endbr is present (i.e. address-taken
functions have endbr emission suppressed)?

Stock kernel build:
	function:
		[normal code]
	caller:
		call    __x86_indirect_thunk_r11

IBT kernel build:
	function:
		endbr
		[normal code]
	caller:
		call    __x86_indirect_thunk_r11

CFI kernel build:

	__cfi_function:
		[int3/mov/int3 preamble]
	function:
		[normal code]
	caller:
		cmpl    \hash, -6(%r11)
		je      .Ltmp1
		ud2
	.Ltmp1:
		call    __x86_indirect_thunk_r11

CFI+IBT kernel build:

	__cfi_function:
		[int3/mov/int3 preamble]
	function:
		endbr
		[normal code]
	caller:
		cmpl    \hash, -6(%r11)
		je      .Ltmp1
		ud2
	.Ltmp1:
		call    __x86_indirect_thunk_r11

CFI+IBT+FineIBT kernel build:

	__cfi_function:
		[int3/mov/int3 preamble]
	function:
		/* no endbr emitted */
		[normal code]
	caller:
		cmpl    \hash, -6(%r11)
		je      .Ltmp1
		ud2
	.Ltmp1:
		call    __x86_indirect_thunk_r11

	at boot, if IBT is detected:
	- replace __cfi_function with:
		endbr
		call __fineibt_\hash
	- replace caller with:
		movl    \hash, %r10d
		sub     $9, %r11
		nop2
		call    *%r11
	- inject all the __fineibt_\hash elements via module_alloc()
		__fineibt_\hash:
			xor     \hash, %r10
			jz      1f
			ud2
		1:	ret
			int3
Peter Zijlstra May 17, 2022, 6:44 a.m. UTC | #17
On Mon, May 16, 2022 at 03:03:02PM -0700, Sami Tolvanen wrote:
> On Mon, May 16, 2022 at 2:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, May 16, 2022 at 09:32:55PM +0000, David Laight wrote:
> >
> > > > The compiler always generates this specific instruction sequence.
> > >
> > > Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.
> >
> > Yes, but we don't care. This *always* uses the 32bit immediate form.
> > Even if the immediate is small.
> 
> Yes, that part is not a problem, but it's a valid point that LLVM
> might not always use r8-r15 here, so I will have to check for the REX
> prefix before blindly attempting to decode the instruction.

LLVM has always used r11 for indirect calls, will that change?
David Laight May 17, 2022, 7:56 a.m. UTC | #18
From: Sami Tolvanen
> Sent: 16 May 2022 23:03
> 
> On Mon, May 16, 2022 at 2:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Mon, May 16, 2022 at 09:32:55PM +0000, David Laight wrote:
> >
> > > > The compiler always generates this specific instruction sequence.
> > >
> > > Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.
> >
> > Yes, but we don't care. This *always* uses the 32bit immediate form.
> > Even if the immediate is small.
> 
> Yes, that part is not a problem, but it's a valid point that LLVM
> might not always use r8-r15 here, so I will have to check for the REX
> prefix before blindly attempting to decode the instruction.

Are you allowing for the REX prefix at all?
The encoding of:
> > > +      *   cmpl    <id>, -6(%reg)     ; 7 bytes
is
	<opcode><mod/TTT/rm><off8><imm32>
which is 7 bytes without the REX.
If reg is r11 there is an extra REX byte - for 8 in total.

Without the REX byte the decode will be using %bx.
So the testing should all have failed.
Which means that something else is wrong as well.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Peter Zijlstra May 17, 2022, 8:05 a.m. UTC | #19
On Mon, May 16, 2022 at 03:59:41PM -0700, Kees Cook wrote:

> I'm still not convinced about this, but I'm on the fence.
> 
> Cons:
> - FineIBT does callee-based hash verification, which means any
>   attacker-constructed memory region just has to have an endbr and nops at
>   "shellcode - 9". KCFI would need the region to have the hash at
>   "shellcode - 6" and an endbr at "shellcode". However, that hash is well
>   known, so it's not much protection.

How would you get the ENDBR there anyway? If you can write code it's
game over.

> - Potential performance hit due to making an additional "call" outside
>   the cache lines of both caller and callee.

That was all an effort to shrink and simplify, all this CFI stuff is
massive bloat :/

If we use %eax instead of %r10d for the hash transfer (as per Joao), and
use int3 instead of ud2, then we can shrink the fineibt sequence to:

__cfi_\func:
	endbr				# 4
	xorl	$0x12345678, %eax	# 5
	jz	1f			# 2
	int3				# 1
\func:
	...

Which is 12 bytes, and needs a larger preamble (up from 9 in the current
proposal).

If we do the objtool/linker fixup, such that direct calls/jumps will
*never* hit ENDBR, then we can do something ugly like:

	kCFI			FineIBT

__cfi_\func:			__cfi_\func:
	int3				endbr
	movl $0x12345678, %rax		xorl $0x12345678, %eax
	int3				jz   1f
	int3				int3
\func:
	endbr
__direct_\func:			__direct_\func:
	...				...

which is 12 bytes on both sides and shrinks the preamble to 8 bytes
while additionally also supporting kCFI+IBT for those few people that
don't care about speculation based attacks much.

But now it's complicated again and requires significant tools work :/
(also, using int3 isn't ideal).

> Pros:
> - FineIBT can be done without read access to the kernel text, which will
>   be nice in the exec-only future.

  - Mostly kills SpectreBHB (because it has the hash check *after*
    ENDBR).

So where IBT limits speculation to all sites that have ENDBR, you can
still target any of them. With FineIBT you lose all sites that don't
match on hash value, significantly reducing the options.

> I'd kind of like the "dynamic FineIBT conversion" to be a config option,
> at least at first. We could at least do performance comparisons between
> them.

Why would you need a config option for that? Since it is dynamic anyway
a boot option works fine.


Also, regardless of all this, it probably makes sense to add an LTO pass
to remove all unused __cfi_ symbols and endbr instructions, less viable
targets is more better :-)

I've been doing that with objtool for the IBT builds.
Joao Moreira May 17, 2022, 8:32 a.m. UTC | #20
>> Cons:
>> - FineIBT does callee-based hash verification, which means any
>>   attacker-constructed memory region just has to have an endbr and 
>> nops at
>>   "shellcode - 9". KCFI would need the region to have the hash at
>>   "shellcode - 6" and an endbr at "shellcode". However, that hash is 
>> well
>>   known, so it's not much protection.
> 
> How would you get the ENDBR there anyway? If you can write code it's
> game over.
> 
+1 here. If you can't keep W^X, both approaches are equally doomed.

Yet, there is a relevant detail here. ENDBR has a pre-defined/fixed 
opcode, which means that it is predictable from the binary perspective. 
Because of that, compilers already do security optimizations and prevent 
for example the emission of ENDBR's opcode as immediate operands. This 
very same approach can be used by JIT stuff, preventing ENDBRs to be 
emitted as unintended gadgets. Because KCFI hashes aren't predictable, 
you can't (or at least I can't think of a way to) prevent these from 
being emitted as operands, which means that if you have an IBT-able 
machine, you will want to enable it even if you have KCFI.

With this said, the instrumentation for KCFI on IBT-enabled machines 
should be of at least 9 bytes (5 for the hash/mov and 4 for ENDBR, not 
counting the additional 4 bytes we asked for).

>> - Potential performance hit due to making an additional "call" outside
>>   the cache lines of both caller and callee.
> 
> That was all an effort to shrink and simplify, all this CFI stuff is
> massive bloat :/
> 
> If we use %eax instead of %r10d for the hash transfer (as per Joao), 
> and
> use int3 instead of ud2, then we can shrink the fineibt sequence to:
> 
> __cfi_\func:
> 	endbr				# 4
> 	xorl	$0x12345678, %eax	# 5
> 	jz	1f			# 2
> 	int3				# 1
> \func:
> 	...
> 
> Which is 12 bytes, and needs a larger preamble (up from 9 in the 
> current
> proposal).

As per the above analysis, if we can make FineIBT instrumentation fit in
12 bytes, this means that the 9 bytes required for KCFI+IBT plus three 
bytes would suffice for having FineIBT (again, if we can make it fit). 
And this would make that call go away.

> 
> If we do the objtool/linker fixup, such that direct calls/jumps will
> *never* hit ENDBR, then we can do something ugly like:
> 
> 	kCFI			FineIBT
> 
> __cfi_\func:			__cfi_\func:
> 	int3				endbr
> 	movl $0x12345678, %eax		xorl $0x12345678, %eax
> 	int3				jz   1f
> 	int3				int3
> \func:
> 	endbr
> __direct_\func:			__direct_\func:
> 	...				...
> 
> which is 12 bytes on both sides and shrinks the preamble to 8 bytes,
> while also supporting kCFI+IBT for those few people who don't care much
> about speculation-based attacks.
> 
> But now it's complicated again and requires significant tools work :/
> (also, using int3 isn't ideal).
> 
>> Pros:
>> - FineIBT can be done without read access to the kernel text, which 
>> will
>>   be nice in the exec-only future.
> 
>   - Mostly kills SpectreBHB (because it has the hash check *after*
>     ENDBR).

- Callee-side checks allow you to make a specific function 
coarse-grained without making an indirect call instruction 
coarse-grained. I.e., if you have a binary blob or some function that 
for whatever reason can't have a hash, you just disable the check in 
that function, making it reachable by every indirect call in the binary 
while still being reasonably able to measure the risk behind it. If you 
instead make an indirect call coarse-grained, that call can now reach 
all functions in the binary, including functions like "disable_cet" and 
"disable_super_nice_security_feature". The risk impact of the latter 
relaxation is much harder to measure, imho (yet, I would enjoy hearing 
counter-arguments, if any).

> 
> So where IBT limits speculation to all sites that have ENDBR, you can
> still target any of them. With FineIBT you lose all sites that don't
> match on the hash value, significantly reducing the options.
> 
>> I'd kind of like the "dynamic FineIBT conversion" to be a config 
>> option,
>> at least at first. We could at least do performance comparisons 
>> between
>> them.
> 
> Why would you need a config option for that? Since it is dynamic anyway
> a boot option works fine.
> 
> 
> Also, regardless of all this, it probably makes sense to add an LTO 
> pass
> to remove all unused __cfi_ symbols and endbr instructions, less viable
> targets is more better :-)

We have that for IBT in clang already (I implemented and upstreamed it 
back when you were trying ibt-seal in objtool). I did not find the time 
to test it with the final IBT support, but it is on my todo list to take 
a look and perhaps send an RFC here. FWIW, 
https://reviews.llvm.org/D116070

> 
> I've been doing that with objtool for the IBT builds.
Peter Zijlstra May 17, 2022, 8:40 a.m. UTC | #21
On Tue, May 17, 2022 at 10:05:17AM +0200, Peter Zijlstra wrote:
> On Mon, May 16, 2022 at 03:59:41PM -0700, Kees Cook wrote:
> 
> > I'm still not convinced about this, but I'm on the fence.
> > 
> > Cons:
> > - FineIBT does callee-based hash verification, which means any
> >   attacker-constructed memory region just has to have an endbr and nops at
> >   "shellcode - 9". KCFI would need the region to have the hash at
> >   "shellcode - 6" and an endbr at "shellcode". However, that hash is well
> >   known, so it's not much protection.
> 
> How would you get the ENDBR there anyway? If you can write code it's
> game over.
> 
> > - Potential performance hit due to making an additional "call" outside
> >   the cache lines of both caller and callee.
> 
> That was all an effort to shrink and simplify, all this CFI stuff is
> massive bloat :/
> 
> If we use %eax instead of %r10d for the hash transfer (as per Joao), and
> use int3 instead of ud2, then we can shrink the fineibt sequence to:
> 
> __cfi_\func:
> 	endbr				# 4
> 	xorl	$0x12345678, %eax	# 5
> 	jz	1f			# 2
> 	int3				# 1
> \func:
> 	...
> 
> Which is 12 bytes, and needs a larger preamble (up from 9 in the current
> proposal).

On all that; perhaps it would be good to have a compiler option to
specify the preamble size. It can enforce the minimum at 7 to have at
least the required:

	movl $0x12345678, %eax
	int3
	int3

but any larger number will just increase the preamble with int3 padding
at the top.
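
For instance, a (hypothetical) requested preamble size of 11 would just 
grow that padding:

__cfi_\func:
	int3; int3; int3; int3		# 4 bytes of padding to reach 11
	movl	$0x12345678, %eax	# 5
	int3				# 1
	int3				# 1
\func: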

That can go right along with the option to suppress endbr when preamble
:-)
David Laight May 17, 2022, 8:48 a.m. UTC | #22
From: Peter Zijlstra
> Sent: 17 May 2022 09:41
...
> > If we use %eax instead of %r10d for the hash transfer (as per Joao), and
> > use int3 instead of ud2, then we can shrink the fineibt sequence to:
> >
> > __cfi_\func:
> > 	endbr				# 4
> > 	xorl	$0x12345678, %eax	# 5
> > 	jz	1f			# 2
> > 	int3				# 1
> > \func:
> > 	...
> >
> > Which is 12 bytes, and needs a larger preamble (up from 9 in the current
> > proposal).
> 
> On all that; perhaps it would be good to have a compiler option to
> specify the preamble size. It can enforce the minimum at 7 to have at
> least the required:
> 
> 	movl $0x12345678, %eax
> 	int3
> 	int3
> 
> but any larger number will just increase the preamble with int3 padding
> at the top.
> 
> That can go right along with the option to suppress endbr when preamble
> :-)

You also need a compiler option to specify the register.
While (I think) %eax is usable in the kernel, it isn't in userspace.
It is used in varargs calls to pass (IIRC) the number of fp
args that are passed in registers.
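
(For context: in the SysV x86-64 ABI, %al carries the number of vector 
registers used by a variadic call, so a printf() of a double compiles to 
roughly the following (illustration only):

	leaq	.LC1(%rip), %rdi	# format string
	movsd	.LC0(%rip), %xmm0	# the double argument
	movl	$1, %eax		# %al = number of vector regs used
	call	printf

so clobbering %eax at every indirect call site would conflict with that.)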

(I can't remember which registers userspace has reserved for
the PLT code? - That might include r10??)

	David

Peter Zijlstra May 17, 2022, 9:38 a.m. UTC | #23
On Tue, May 17, 2022 at 08:48:41AM +0000, David Laight wrote:
> From: Peter Zijlstra
> > Sent: 17 May 2022 09:41
> ...
> > > If we use %eax instead of %r10d for the hash transfer (as per Joao), and
> > > use int3 instead of ud2, then we can shrink the fineibt sequence to:
> > >
> > > __cfi_\func:
> > > 	endbr				# 4
> > > 	xorl	$0x12345678, %eax	# 5
> > > 	jz	1f			# 2
> > > 	int3				# 1
> > > \func:
> > > 	...
> > >
> > > Which is 12 bytes, and needs a larger preamble (up from 9 in the current
> > > proposal).
> > 
> > On all that; perhaps it would be good to have a compiler option to
> > specify the preamble size. It can enforce the minimum at 7 to have at
> > least the required:
> > 
> > 	movl $0x12345678, %eax
> > 	int3
> > 	int3
> > 
> > but any larger number will just increase the preamble with int3 padding
> > at the top.
> > 
> > That can go right along with the option to suppress endbr when preamble
> > :-)
> 
> You also need a compiler option to specify the register.
> While (I think) %eax is usable in kernel, it isn't in userspace.
> It is used in varargs calls to pass (IIRC) the number of fp
> args that are passed in registers.

You're mistaken, the compiler doesn't emit the FineIBT code *at all*.
That's all patched in later. For kCFI the mov is never executed and is
only there to make it a valid instruction.
Sami Tolvanen May 17, 2022, 8:36 p.m. UTC | #24
On Mon, May 16, 2022 at 11:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Mon, May 16, 2022 at 03:03:02PM -0700, Sami Tolvanen wrote:
> > On Mon, May 16, 2022 at 2:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Mon, May 16, 2022 at 09:32:55PM +0000, David Laight wrote:
> > >
> > > > > The compiler always generates this specific instruction sequence.
> > > >
> > > > Yes, but there are several ways to encode 'cmpl imm,-6(reg)'.
> > >
> > > Yes, but we don't care. This *always* uses the 32bit immediate form.
> > > Even if the immediate is small.
> >
> > Yes, that part is not a problem, but it's a valid point that LLVM
> > might not always use r8-r15 here, so I will have to check for the REX
> > prefix before blindly attempting to decode the instruction.
>
> LLVM has always used r11 for indirect calls, will that change?

No, this won't change register allocation, but I will have to ensure
that the compiler won't do any unnecessary register shuffling for the
check itself.

Sami
Matthew Wilcox (Oracle) May 20, 2022, 1:49 p.m. UTC | #25
On Mon, May 16, 2022 at 02:58:44PM +0200, Peter Zijlstra wrote:
> @willy, how horribly broken is this xarray usage?

The xarray doesn't work very well as a hash ;-(  It has pretty much
pessimal memory usage if you have a good hash.  You'll end up allocating
essentially the entire 4 billion * ptr_size address space of the hash.
Can you use an rhashtable instead?
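
Something along these lines, perhaps (untested sketch; the struct and 
helper names are made up, only the rhashtable API is real, and 
rhashtable_init(&cfi_stubs, &cfi_stub_params) would have to run first):

#include <linux/rhashtable.h>
#include <linux/slab.h>

/* Map a 32-bit kCFI hash to the address of its FineIBT stub. */
struct cfi_stub {
	u32			hash;	/* key */
	void			*addr;	/* FineIBT stub for this hash */
	struct rhash_head	node;
};

static const struct rhashtable_params cfi_stub_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct cfi_stub, hash),
	.head_offset	= offsetof(struct cfi_stub, node),
};

static struct rhashtable cfi_stubs;

static int cfi_stub_add(u32 hash, void *addr)
{
	struct cfi_stub *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		return -ENOMEM;
	s->hash = hash;
	s->addr = addr;
	return rhashtable_insert_fast(&cfi_stubs, &s->node, cfi_stub_params);
}

static void *cfi_stub_find(u32 hash)
{
	struct cfi_stub *s;

	s = rhashtable_lookup_fast(&cfi_stubs, &hash, cfi_stub_params);
	return s ? s->addr : NULL;
}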

> ---
>  arch/x86/include/asm/traps.h            |   1 +
>  arch/x86/kernel/alternative.c           | 316 ++++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/common.c            |   5 +
>  arch/x86/kernel/vmlinux.lds.S           |   9 +
>  tools/objtool/check.c                   |  67 ++++++-
>  tools/objtool/include/objtool/objtool.h |   1 +
>  tools/objtool/objtool.c                 |   1 +
>  7 files changed, 399 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 35317c5c551d..a423343cffbc 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -19,6 +19,7 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *e
>  #endif
>  
>  extern bool ibt_selftest(void);
> +extern bool ibt_broken;
>  
>  #ifdef CONFIG_X86_F00F_BUG
>  /* For handling the FOOF bug */
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index d374cb3cf024..abce4e78a1e0 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -18,6 +18,9 @@
>  #include <linux/mmu_context.h>
>  #include <linux/bsearch.h>
>  #include <linux/sync_core.h>
> +#include <linux/moduleloader.h>
> +#include <linux/xarray.h>
> +#include <linux/set_memory.h>
>  #include <asm/text-patching.h>
>  #include <asm/alternative.h>
>  #include <asm/sections.h>
> @@ -115,6 +118,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
>  }
>  
>  extern s32 __retpoline_sites[], __retpoline_sites_end[];
> +extern s32 __cfi_sites[], __cfi_sites_end[];
>  extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
>  extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
>  extern s32 __smp_locks[], __smp_locks_end[];
> @@ -549,6 +553,315 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { }
>  
>  #endif /* CONFIG_X86_KERNEL_IBT */
>  
> +#ifdef CONFIG_CFI_CLANG
> +/*
> + *  FineIBT                                         kCFI
> + *
> + *  __fineibt_\hash:
> + *    xor     \hash, %r10     # 7
> + *    jz      1f              # 2
> + *    ud2                     # 2
> + *  1:ret                     # 1
> + *    int3                    # 1
> + *
> + *
> + *  __cfi_\sym:                                     __cfi_\sym:
> + *                                                    int3; int3                              # 2
> + *    endbr                   # 4                     mov     \hash, %eax                     # 5
> + *    call    __fineibt_\hash # 5                     int3; int3                              # 2
> + *  \sym:                                           \sym:
> + *  ...                                             ...
> + *
> + *
> + *  caller:                                         caller:
> + *    movl    \hash, %r10d    # 6                     cmpl    \hash, -6(%r11)                 # 8
> + *    sub     $9, %r11        # 4                     je      1f                              # 2
> + *    nop2                    # 2                     ud2                                     # 2
> + *
> + *    call    *%r11           # 3                     call    __x86_indirect_thunk_r11        # 5
> + *    nop2                    # 2
> + */
> +
> +static DEFINE_XARRAY(cfi_hashes);
> +static int nr_cfi_hashes;
> +
> +static u32 decode_cfi_preamble(void *addr)
> +{
> +	u8 *p = addr;
> +
> +	if (p[0] == 0xcc && p[1] == 0xcc &&
> +	    p[2] == 0xb8 &&
> +	    p[7] == 0xcc && p[8] == 0xcc)
> +		return *(u32 *)(addr + 3);
> +
> +	return 0; /* invalid hash value */
> +}
> +
> +static u32 decode_cfi_caller(void *addr)
> +{
> +	u8 *p = addr;
> +
> +	if (((p[0] == 0x41 && p[1] == 0x81) ||
> +	     (p[0] == 0xeb && p[1] == 0x0a)) && p[2] == 0x7b &&
> +	    p[8] == 0x74 && p[9] == 0x02 &&
> +	    p[10] == 0x0f && p[11] == 0x0b)
> +		return *(u32 *)(addr + 4);
> +
> +	return 0; /* invalid hash value */
> +}
> +
> +// .cfi_sites
> +static int cfi_index_hashes(s32 *start, s32 *end)
> +{
> +	s32 *s;
> +
> +	for (s = start; s < end; s++) {
> +		void *addr = (void *)s + *s;
> +		void *xa;
> +		u32 hash;
> +
> +		hash = decode_cfi_preamble(addr);
> +		if (!hash) {
> +			//WARN();
> +			return -EINVAL;
> +		}
> +
> +		xa = xa_store(&cfi_hashes, hash, NULL, GFP_KERNEL);
> +		if (xa_is_err(xa)) {
> +			//WARN();
> +			return xa_err(xa);
> +		}
> +		nr_cfi_hashes++;
> +	}
> +
> +	return 0;
> +}
> +
> +asm (	".pushsection .rodata\n"
> +	"fineibt_template_start:\n"
> +	"	xorl	$0x12345678, %r10d\n"	// 7
> +	"	je	1f\n"			// 2
> +	"	ud2\n"				// 2
> +	"1:	ret\n"				// 1
> +	"	int3\n"
> +	"	int3\n"
> +	"	int3\n"
> +	"	int3\n"				// 4
> +	"fineibt_template_end:\n"
> +	".popsection\n"
> +    );
> +
> +extern u8 fineibt_template_start[];
> +extern u8 fineibt_template_end[];
> +
> +static int cfi_create_fineibt_stubs(void)
> +{
> +	size_t size = 16 * nr_cfi_hashes;
> +	int pages = 1 + ((size - 1) >> PAGE_SHIFT);
> +	void *text, *entry, *xa;
> +	unsigned long hash;
> +	int err = -ENOMEM;
> +
> +	text = module_alloc(size);
> +	if (!text)
> +		return err;
> +
> +	entry = text;
> +	xa_for_each(&cfi_hashes, hash, xa) {
> +
> +		memcpy(entry, fineibt_template_start, 16);
> +		*(u32 *)(entry + 3) = hash;
> +
> +		xa = xa_store(&cfi_hashes, hash, entry, GFP_KERNEL);
> +		if (xa_is_err(xa)) {
> +			err = xa_err(xa);
> +			goto err_alloc;
> +		}
> +		if (xa) {
> +			err = -EINVAL;
> +			goto err_alloc;
> +		}
> +
> +		entry += 16;
> +	}
> +
> +	set_memory_ro((unsigned long)text, pages);
> +	set_memory_x((unsigned long)text, pages);
> +
> +	return 0;
> +
> +err_alloc:
> +	module_memfree(text);
> +	return -EINVAL;
> +}
> +
> +// .retpoline_sites
> +static int cfi_disable_callers(s32 *start, s32 *end)
> +{
> +	/*
> +	 * Disable CFI by patching in a 2 byte JMP, this leaves the hash in
> +	 * tact for later usage. Also see decode_cfi_caller() and
> +	 * cfu_rewrite_callers().
> +	 */
> +	const u8 jmp12[] = { 0xeb, 0x0a };
> +	s32 *s;
> +
> +	for (s = start; s < end; s++) {
> +		void *addr = (void *)s + *s;
> +		u32 hash;
> +
> +		hash = decode_cfi_caller(addr - 12);
> +		if (!hash) {
> +			// WARN();
> +			return -EINVAL;
> +		}
> +
> +		text_poke_early(addr - 12, jmp12, 2);
> +	}
> +
> +	return 0;
> +}
> +
> +asm (	".pushsection .rodata\n"
> +	"fineibt_cfi_start:\n"
> +	"	endbr64\n"
> +	"	call fineibt_caller_start\n"
> +	"fineibt_cfi_end:"
> +	".popsection\n"
> +    );
> +
> +extern u8 fineibt_cfi_start[];
> +extern u8 fineibt_cfi_end[];
> +
> +// .cfi_sites
> +static int cfi_rewrite_cfi(s32 *start, s32 *end)
> +{
> +	s32 *s;
> +
> +	for (s = start; s < end; s++) {
> +		void *dest, *addr = (void *)s + *s;
> +		unsigned long index;
> +		u32 hash;
> +
> +		index = hash = decode_cfi_preamble(addr);
> +		dest = xa_find(&cfi_hashes, &index, hash, XA_PRESENT);
> +
> +		if (WARN_ON_ONCE(index != hash || !dest))
> +			return -EINVAL;
> +
> +		text_poke_early(addr, fineibt_cfi_start,
> +				(fineibt_cfi_end - fineibt_cfi_start));
> +
> +		__text_gen_insn(addr + 4,
> +				CALL_INSN_OPCODE, addr + 4,
> +				dest, CALL_INSN_SIZE);
> +	}
> +
> +	return 0;
> +}
> +
> +asm (	".pushsection .rodata\n"
> +	"fineibt_caller_start:\n"
> +	"	movl	$0x12345678, %r10d\n"
> +	"	sub	$9, %r11\n"
> +	"	.nops	2\n"
> +	"fineibt_caller_end:"
> +	".popsection\n"
> +    );
> +
> +extern u8 fineibt_caller_start[];
> +extern u8 fineibt_caller_end[];
> +
> +// .retpoline_sites
> +static int cfi_rewrite_callers(s32 *start, s32 *end)
> +{
> +	s32 *s;
> +
> +	for (s = start; s < end; s++) {
> +		void *addr = (void *)s + *s;
> +		u32 hash;
> +
> +		hash = decode_cfi_caller(addr - 12);
> +
> +		if (WARN_ON_ONCE(!hash))
> +			return -EINVAL;
> +
> +		text_poke_early(addr - 12, fineibt_caller_start,
> +				(fineibt_caller_end - fineibt_caller_end));
> +
> +		*(u32 *)(addr - 12 + 2) = hash;
> +
> +		/* rely on apply_retpolines() to rewrite the actual call */
> +	}
> +
> +	return 0;
> +}
> +
> +bool __ro_after_init ibt_broken = false;
> +
> +static void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
> +			  s32 *start_cfi, s32 *end_cfi)
> +{
> +	int ret;
> +
> +	/* If IBT, use FineIBT */
> +	if (!HAS_KERNEL_IBT || !cpu_feature_enabled(X86_FEATURE_IBT))
> +		return;
> +
> +	/*
> +	 * Find and count all unique hash values.
> +	 */
> +	ret = cfi_index_hashes(start_cfi, end_cfi);
> +	if (ret)
> +		goto err;
> +
> +	/*
> +	 * Allocate module memory and write FineIBT stubs.
> +	 */
> +	ret = cfi_create_fineibt_stubs();
> +	if (ret)
> +		goto err;
> +
> +	/*
> +	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
> +	 * rewrite them. Disables all CFI. If this succeeds but any of the
> +	 * later stages fails, we're CFI-less.
> +	 */
> +	ret = cfi_disable_callers(start_retpoline, end_retpoline);
> +	if (ret)
> +		goto err;
> +
> +	/*
> +	 * Rewrite the __cfi_ stubs from kCFI to FineIBT.
> +	 */
> +	ret = cfi_rewrite_cfi(start_cfi, end_cfi);
> +	if (ret)
> +		goto err;
> +
> +	/*
> +	 * Now that everything is in place; rewrite the callers to FineIBT.
> +	 */
> +	ret = cfi_rewrite_callers(start_retpoline, end_retpoline);
> +	if (ret)
> +		goto err;
> +
> +	return;
> +
> +err:
> +	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
> +	/* must *NOT* enable IBT */
> +	ibt_broken = true;
> +}
> +
> +#else
> +
> +static void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
> +			  s32 *start_cfi, s32 *end_cfi)
> +{
> +}
> +
> +#endif
> +
>  #ifdef CONFIG_SMP
>  static void alternatives_smp_lock(const s32 *start, const s32 *end,
>  				  u8 *text, u8 *text_end)
> @@ -855,6 +1168,9 @@ void __init alternative_instructions(void)
>  	 */
>  	apply_paravirt(__parainstructions, __parainstructions_end);
>  
> +	apply_fineibt(__retpoline_sites, __retpoline_sites_end,
> +		      __cfi_sites, __cfi_sites_end);
> +
>  	/*
>  	 * Rewrite the retpolines, must be done before alternatives since
>  	 * those can rewrite the retpoline thunks.
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index e342ae4db3c4..e4377256b952 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -630,6 +630,11 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c)
>  	    !cpu_feature_enabled(X86_FEATURE_IBT))
>  		return;
>  
> +#ifdef CONFIG_CFI_CLANG
> +	if (ibt_broken)
> +		return;
> +#endif
> +
>  	wrmsrl(MSR_IA32_S_CET, msr);
>  	cr4_set_bits(X86_CR4_CET);
>  
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index 7fda7f27e762..72ffc91ddd20 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -294,6 +294,15 @@ SECTIONS
>  	}
>  #endif
>  
> +#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_RETPOLINE) && defined(CONFIG_X86_KERNEL_IBT)
> +	. = ALIGN(8);
> +	.cfi_sites : AT(ADDR(.cfi_sites) - LOAD_OFFSET) {
> +		__cfi_sites = .;
> +		*(.cfi_sites)
> +		__cfi_sites_end = .;
> +	}
> +#endif
> +
>  	/*
>  	 * struct alt_inst entries. From the header (alternative.h):
>  	 * "Alternative instructions for different CPU types or capabilities"
> diff --git a/tools/objtool/check.c b/tools/objtool/check.c
> index 88f005ae6dcc..edc8aecf229c 100644
> --- a/tools/objtool/check.c
> +++ b/tools/objtool/check.c
> @@ -797,6 +797,52 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file)
>  	return 0;
>  }
>  
> +static int create_cfi_sections(struct objtool_file *file)
> +{
> +	struct instruction *insn;
> +	struct section *sec;
> +	int idx;
> +
> +	sec = find_section_by_name(file->elf, ".cfi_sites");
> +	if (sec) {
> +		WARN("file already has .cfi_sites, skipping");
> +		return 0;
> +	}
> +
> +	idx = 0;
> +	list_for_each_entry(insn, &file->cfi_list, call_node)
> +		idx++;
> +
> +	if (!idx)
> +		return 0;
> +
> +	sec = elf_create_section(file->elf, ".cfi_sites", 0,
> +				 sizeof(int), idx);
> +	if (!sec) {
> +		WARN("elf_create_section: .cfi_sites");
> +		return -1;
> +	}
> +
> +	idx = 0;
> +	list_for_each_entry(insn, &file->cfi_list, call_node) {
> +
> +		int *site = (int *)sec->data->d_buf + idx;
> +		*site = 0;
> +
> +		if (elf_add_reloc_to_insn(file->elf, sec,
> +					  idx * sizeof(int),
> +					  R_X86_64_PC32,
> +					  insn->sec, insn->offset)) {
> +			WARN("elf_add_reloc_to_insn: .cfi_sites");
> +			return -1;
> +		}
> +
> +		idx++;
> +	}
> +
> +	return 0;
> +}
> +
>  static int create_mcount_loc_sections(struct objtool_file *file)
>  {
>  	struct section *sec;
> @@ -3301,6 +3347,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
>  {
>  	struct alternative *alt;
>  	struct instruction *next_insn, *prev_insn = NULL;
> +	struct instruction *first_insn = insn;
>  	struct section *sec;
>  	u8 visited;
>  	int ret;
> @@ -3312,8 +3359,19 @@ static int validate_branch(struct objtool_file *file, struct symbol *func,
>  
>  		if (func && insn->func && func != insn->func->pfunc) {
>  			/* Ignore KCFI type preambles, which always fall through */
> -			if (!strncmp(func->name, "__cfi_", 6))
> +			if (!strncmp(func->name, "__cfi_", 6)) {
> +				/*
> +				 * If the function has a __cfi_ preamble, the endbr
> +				 * will live in there.
> +				 */
> +				insn->noendbr = true;
> +				/*
> +				 * The preamble starts with INSN_TRAP,
> +				 * call_node cannot be used.
> +				 */
> +				list_add_tail(&first_insn->call_node, &file->cfi_list);
>  				return 0;
> +			}
>  
>  			WARN("%s() falls through to next function %s()",
>  			     func->name, insn->func->name);
> @@ -3953,6 +4011,13 @@ int check(struct objtool_file *file)
>  		warnings += ret;
>  	}
>  
> +	if (ibt && retpoline) {
> +		ret = create_cfi_sections(file);
> +		if (ret < 0)
> +			goto out;
> +		warnings += ret;
> +	}
> +
>  	if (stats) {
>  		printf("nr_insns_visited: %ld\n", nr_insns_visited);
>  		printf("nr_cfi: %ld\n", nr_cfi);
> diff --git a/tools/objtool/include/objtool/objtool.h b/tools/objtool/include/objtool/objtool.h
> index a6e72d916807..93f52e275fa6 100644
> --- a/tools/objtool/include/objtool/objtool.h
> +++ b/tools/objtool/include/objtool/objtool.h
> @@ -27,6 +27,7 @@ struct objtool_file {
>  	struct list_head static_call_list;
>  	struct list_head mcount_loc_list;
>  	struct list_head endbr_list;
> +	struct list_head cfi_list;
>  	bool ignore_unreachables, hints, rodata;
>  
>  	unsigned int nr_endbr;
> diff --git a/tools/objtool/objtool.c b/tools/objtool/objtool.c
> index 843ff3c2f28e..16ed3613b0e2 100644
> --- a/tools/objtool/objtool.c
> +++ b/tools/objtool/objtool.c
> @@ -129,6 +129,7 @@ struct objtool_file *objtool_open_read(const char *_objname)
>  	INIT_LIST_HEAD(&file.static_call_list);
>  	INIT_LIST_HEAD(&file.mcount_loc_list);
>  	INIT_LIST_HEAD(&file.endbr_list);
> +	INIT_LIST_HEAD(&file.cfi_list);
>  	file.ignore_unreachables = no_unreachable;
>  	file.hints = false;
>
Kees Cook May 25, 2022, 8:02 p.m. UTC | #26
On Mon, May 16, 2022 at 10:37:23PM +0200, Peter Zijlstra wrote:
> On Mon, May 16, 2022 at 12:39:19PM -0700, Sami Tolvanen wrote:
> 
> > > > With the current compiler patch, LLVM sets up function arguments after
> > > > the CFI check. if it's a problem, we can look into changing that.
> > >
> > > Yes, please fix that. Again see that same patch for why this is a
> > > problem. Objtool can trivially find retpoline calls, but finding this
> > > kCFI gadget is going to be hard work. If you ensure they're
> > > unconditionally stuck together, then the problem goes away find one,
> > > finds the other.
> > 
> > You can use .kcfi_traps to locate the check right now, but I agree,
> > it's not quite ideal.
> 
> Oohh, indeed.
> [...]

Hi Peter,

Sami investigated moving the CFI check after arg setup, and found that
doing so means making LLVM's CFI logic no longer architecture- and
call-type-agnostic. The CFI logic would need to be put at a lower level,
requiring it to be done in per-architecture code, and then additionally
per call type. (And by per call type, AIUI, this means the various kinds
of indirect calls: standard, tail-call, etc.) Sami has it all working for
x86, but I'm concerned about the risk/benefit of doing it this way. I
only see downsides:

- Linux cannot enable CFI for a new architecture without also
  implementing it within LLVM first.

- LLVM CFI code becomes more complex, making it harder to
  maintain/bug-fix/etc.

I actually see the first issue as the larger problem: I want to make it
easy for the kernel to use CFI, rather than harder. :P The first user
of CFI already cleared the way for other architectures by adjusting the
bulk of core kernel code, etc, so adding another architecture should be
as trivial as possible, and not require yet newer releases of LLVM, etc,
etc.

So, since using .kcfi_traps already solves the issue for finding the
checks, can we not require moving the CFI check? I think it would be a
mistake to have to do that.

-Kees
diff mbox series

Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4bed3abf444d..2e73d0792d48 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -108,6 +108,8 @@  config X86
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK	if X86_64
 	select ARCH_SUPPORTS_NUMA_BALANCING	if X86_64
 	select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP	if NR_CPUS <= 4096
+	select ARCH_SUPPORTS_CFI_CLANG		if X86_64
+	select ARCH_USES_CFI_TRAPS		if X86_64 && CFI_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
 	select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 85865f1645bd..0ee4a0af3974 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -25,6 +25,18 @@ 
 #define RET	ret
 #endif
 
+#ifdef CONFIG_CFI_CLANG
+#define __CFI_TYPE(name)					\
+	.fill 7, 1, 0xCC ASM_NL					\
+	SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE)	\
+	int3 ASM_NL						\
+	int3 ASM_NL						\
+	mov __kcfi_typeid_##name, %eax ASM_NL			\
+	int3 ASM_NL						\
+	int3 ASM_NL						\
+	SYM_FUNC_END(__cfi_##name)
+#endif
+
 #else /* __ASSEMBLY__ */
 
 #ifdef CONFIG_SLS
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1563fb995005..320e257eb4be 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -40,6 +40,7 @@ 
 #include <linux/hardirq.h>
 #include <linux/atomic.h>
 #include <linux/ioasid.h>
+#include <linux/cfi.h>
 
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
@@ -295,6 +296,62 @@  static inline void handle_invalid_op(struct pt_regs *regs)
 		      ILL_ILLOPN, error_get_trap_addr(regs));
 }
 
+#ifdef CONFIG_CFI_CLANG
+static void decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
+			    unsigned long *type)
+{
+	char buffer[MAX_INSN_SIZE];
+	struct insn insn;
+	int offset;
+
+	*target = *type = 0;
+
+	/*
+	 * The compiler generates the following instruction sequence
+	 * for indirect call checks:
+	 *
+	 *   cmpl    <id>, -6(%reg)	; 7 bytes
+	 *   je      .Ltmp1		; 2 bytes
+	 *   ud2			; <- addr
+	 *   .Ltmp1:
+	 *
+	 * Both the type and the target address can be decoded from the
+	 * cmpl instruction.
+	 */
+	if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 9, MAX_INSN_SIZE))
+		return;
+	if (insn_decode_kernel(&insn, buffer))
+		return;
+	if (insn.opcode.value != 0x81 || X86_MODRM_REG(insn.modrm.value) != 7)
+		return;
+
+	*type = insn.immediate.value;
+
+	offset = insn_get_modrm_rm_off(&insn, regs);
+	if (offset < 0)
+		return;
+
+	*target = *(unsigned long *)((void *)regs + offset);
+}
+
+static enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+	if (is_cfi_trap(regs->ip)) {
+		unsigned long target, type;
+
+		decode_cfi_insn(regs, &target, &type);
+		return report_cfi_failure(regs, regs->ip, target, type);
+	}
+
+	return BUG_TRAP_TYPE_NONE;
+}
+#else
+static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+	return BUG_TRAP_TYPE_NONE;
+}
+#endif /* CONFIG_CFI_CLANG */
+
 static noinstr bool handle_bug(struct pt_regs *regs)
 {
 	bool handled = false;
@@ -312,7 +369,8 @@  static noinstr bool handle_bug(struct pt_regs *regs)
 	 */
 	if (regs->flags & X86_EFLAGS_IF)
 		raw_local_irq_enable();
-	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+	if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+	    handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
 		regs->ip += LEN_UD2;
 		handled = true;
 	}